Example #1
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())

        retype_vocab_size = len(self.vocab.types)
        rename_vocab_size = len(self.vocab.names)
        self.target_embedding = nn.Embedding(
            retype_vocab_size + rename_vocab_size,
            config["target_embedding_size"],
        )
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            1,
            config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(decoder_layer, config["num_layers"],
                                          decoder_norm)
        self.output = nn.Linear(config["hidden_size"],
                                retype_vocab_size + rename_vocab_size)
        self.mem_mask = config["mem_mask"]
        self.config: Dict = config
        self.retype_vocab_size = retype_vocab_size
Example #2
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
            self.typelib = self.typelib.fix()
        self.target_embedding = nn.Embedding(len(self.vocab.subtypes),
                                             config["target_embedding_size"])
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )
        # self.cached_decode_mask: Dict[int, torch.Tensor] = {}
        # self.size = torch.zeros(len(self.vocab.types), dtype=torch.long)

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            1,
            config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(decoder_layer, config["num_layers"],
                                          decoder_norm)
        self.output = nn.Linear(config["hidden_size"],
                                len(self.vocab.subtypes))

        self.config: Dict = config
Example #3
    def __init__(self,
                 url: str,
                 config: Optional[Dict] = None,
                 percent: float = 1.0):
        # support wildcards
        urls = sorted(glob.glob(url))
        urls = urls[:int(percent * len(urls))]
        super().__init__(urls)
        if config:
            # annotate example for training
            from dirty.utils.vocab import Vocab

            self.vocab = Vocab.load(config["vocab_file"])
            with open(config["typelib_file"]) as type_f:
                self.typelib = TypeLibCodec.decode(type_f.read())
            self.max_src_tokens_len = config["max_src_tokens_len"]
            self.max_num_var = config["max_num_var"]
            annotate = self._annotate
            self.rename = config.get("rename", False)
            # sort = Dataset._sort
            sort = identity
        else:
            # for creating the vocab
            annotate = identity
            sort = identity
        self = (
            self.pipe(Dataset._file_iter_to_line_iter)
            .map(Example.from_json)
            .map(annotate)
            .shuffle(Dataset.SHUFFLE_BUFFER)
            .pipe(sort)
        )
Example #4
File: collect.py (Project: CMUSTRUDEL/DIRTY)
 def write_type_lib(self) -> None:
     """Dumps the type library to the file specified by the environment variable
     `TYPE_LIB`.
     """
     with gzip.open(self.type_lib_file_name, "wt") as type_lib_file:
         encoded = TypeLibCodec.encode(self.type_lib)
         type_lib_file.write(encoded)
         type_lib_file.flush()
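Example #8 below reads the same kind of file back with TypeLibCodec.decode, so encode/decode round-trip through a plain JSON string. A minimal sketch of that round trip, assuming TypeLib is importable alongside TypeLibCodec and using a hypothetical path:

import gzip

from csvnpm.binary.dire_types import TypeLib, TypeLibCodec  # TypeLib import path assumed

type_lib = TypeLib()                                  # empty library, just for the sketch
with gzip.open("types/example.json.gz", "wt") as f:   # hypothetical path
    f.write(TypeLibCodec.encode(type_lib))            # serialize to a JSON string

with gzip.open("types/example.json.gz", "rt") as f:
    restored = TypeLibCodec.decode(f.read())          # parse back into a type library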
Example #5
File: ida_ast.py (Project: CMUSTRUDEL/DIRTY)
 def from_json(cls, d) -> "Call.Arg":
     formal_type: TypeInfo = TypeLibCodec.decode(dumps(
         d["t"]))  # type: ignore
     return cls(
         node_id=d["id"],
         is_vararg=d["va"],
         idx=d["i"],
         name=d["n"],
         formal_type=formal_type,
     )
Example #6
File: model.py (Project: CMUSTRUDEL/DIRTY)
 def _preprocess(self):
     self.vocab.types.struct_set = set()
     for idx, type_str in self.vocab.types.id2word.items():
         if type_str.startswith("struct"):
             self.vocab.types.struct_set.add(idx)
     with open(self.config["data"]["typelib_file"]) as type_f:
         typelib = TypeLibCodec.decode(type_f.read())
         self.typstr_to_piece = {}
         for size in typelib:
             for _, tp in typelib[size]:
                 self.typstr_to_piece[str(tp)] = tp.tokenize()[:-1]
     self.typstr_to_piece["<unk>"] = ["<unk>"]
Example #7
def type_dumper(args):
    tgt_folder, fname = args
    typelib = TypeLib()
    with open(fname, "r") as f:
        for line in f:
            e = Example.from_json(json.loads(line))
            for var in e.target.values():
                typelib.add(var.typ)
    typelib.sort()
    with open(os.path.join(tgt_folder, "types",
                           fname.split("/")[-1]), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)
Example #8
File: collect.py (Project: CMUSTRUDEL/DIRTY)
 def __init__(self):
     # Load the type library
     self.type_lib_file_name = os.path.join(
         os.environ["OUTPUT_DIR"],
         "types",
         os.environ["PREFIX"] + ".json.gz",
     )
     try:
         with gzip.open(self.type_lib_file_name, "rt") as type_lib_file:
             self.type_lib = TypeLibCodec.decode(type_lib_file.read())
     except Exception as e:
         print(e)
         print("Could not find type library, creating a new one")
         self.type_lib = TypeLib()
     super().__init__()
Example #9
 def from_json(cls, d):
     ast = AST.from_json(d["t"]) if d["t"] else None
     return_type = TypeLibCodec.decode(dumps(d["r"]))
     arguments = dict()
     for key, args in d["a"].items():
         arguments[location_from_json_key(key)] = {
             Variable.from_json(arg)
             for arg in args
         }
     local_vars = dict()
     for key, locs in d["l"].items():
         local_vars[location_from_json_key(key)] = {
             Variable.from_json(loc)
             for loc in locs
         }
     return cls(
         ast=ast,
         name=d["n"],
         return_type=return_type,
         arguments=arguments,
         local_vars=local_vars,
         raw_code=d["c"],
     )
Example #10
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
        vocab_size = (len(self.vocab.names) if config.get("rename", False) else
                      len(self.vocab.types))
        self.target_id_key = ("target_name_id" if config.get("rename", False)
                              else "target_type_id")
        self.target_embedding = nn.Embedding(vocab_size,
                                             config["target_embedding_size"])
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )
        self.cached_decode_mask: Dict[int, torch.Tensor] = {}
        self.size = torch.zeros(vocab_size, dtype=torch.long)

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            config["num_heads"],
            4 * config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(decoder_layer, config["num_layers"],
                                          decoder_norm)
        self.output = nn.Linear(config["hidden_size"], vocab_size)
        self.mem_mask = config["mem_mask"]
        if config.get("rename", False):
            self.mem_mask = "none"

        self.config: Dict = config
Example #11
File: ida_ast.py (Project: CMUSTRUDEL/DIRTY)
 def from_json(cls, d) -> "Type":
     typ: TypeInfo = TypeLibCodec.decode(dumps(d["t"]))  # type: ignore
     return cls(node_id=d["id"], typ=typ)
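In Examples #5, #9, #11, and #14 the "t"/"r" fields are already-parsed JSON values, so they are re-serialized with dumps before decoding, since TypeLibCodec.decode takes a JSON string. A minimal helper sketching that pattern (the TypeInfo import path is an assumption):

from json import dumps

from csvnpm.binary.dire_types import TypeInfo, TypeLibCodec  # TypeInfo import path assumed


def decode_type_field(d: dict, key: str = "t") -> TypeInfo:
    # d[key] holds a parsed JSON value; re-serialize it so decode receives a string
    return TypeLibCodec.decode(dumps(d[key]))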
Example #12
import json
from typing import Any, Dict

import _jsonnet
from csvnpm.binary.dire_types import TypeLibCodec
from torch.utils.data import DataLoader
from tqdm import tqdm

from dirty.utils.dataset import Dataset  # type: ignore

if __name__ == "__main__":
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset,
                                        num_workers=8,
                                        batch_size=None)
    with open(config["data"]["typelib_file"]) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
    most_common_for_size = {}
    types_model = dataset.vocab.types
    for size in typelib:
        freq, tp = typelib[size][0]
        most_common_for_size[size] = str(tp)

    results: Dict[str, Dict[str, Any]] = {}
    for example in tqdm(dataloader):
        for src_name, src_type, tgt_var_mem in zip(
                example.src_var_names,
                example.src_var_types_str,
                example.tgt_var_src_mems,
        ):
            results.setdefault(example.binary, {}).setdefault(
                example.name, {})[src_name[2:-2]] = (
Example #13
def main(args):
    np.random.seed(1234)
    random.seed(1992)

    tgt_folder = args["TARGET_FOLDER"]
    input_folder = args["INPUT_FOLDER"]
    input_fnames_file = args["INPUT_FNAMES"]
    input_fnames = []
    max_files = int(args["--max"])
    with open(input_fnames_file) as f:
        for s in f:
            s = s.strip()
            if s.endswith(".gz"):
                input_fnames.append(s)
            if len(input_fnames) >= max_files:
                break
    shard_size = int(args["--shard-size"])

    if os.path.exists(tgt_folder):
        op = input(f"{tgt_folder} exists. remove? (y/n) ")
        if op == "y":
            shutil.rmtree(tgt_folder)

    os.system(f"mkdir -p {tgt_folder}")
    os.system(f"mkdir -p {tgt_folder}/files")
    os.system(f"mkdir -p {tgt_folder}/types")
    num_workers = 16

    valid_example_count = 0
    all_functions = dict()  # json file name -> {function name: canonical code}

    print("loading examples")
    with multiprocessing.Pool(num_workers) as pool:
        json_iter = pool.imap(
            json_line_reader,
            ((input_folder, fname) for fname in input_fnames),
            chunksize=64,
        )

        example_iter = pool.imap(example_generator, json_iter, chunksize=64)

        for examples in tqdm(example_iter):
            if not examples:
                continue
            json_file_name = examples[0].binary_file["file_name"].split(
                "/")[-1]
            with open(os.path.join(tgt_folder, "files/", json_file_name),
                      "w") as f:
                for example in examples:
                    f.write(dumps(example.to_json()) + "\n")
                    all_functions.setdefault(
                        json_file_name,
                        dict())[example.name] = example.canonical_code

            valid_example_count += len(examples)

    print("valid examples: ", valid_example_count)

    cur_dir = os.getcwd()
    all_files = glob.glob(os.path.join(tgt_folder, "files/*.jsonl"))
    file_prefix = os.path.join(tgt_folder, "files/")
    all_files = sorted(all_files)  # sort all files by name
    file_num = len(all_files)
    print("Total valid binary file num: ", file_num)

    test_file = args["--test-file"]
    if test_file:
        print(f"using test file {test_file}")
        with tarfile.open(test_file, "r") as f:
            test_files = [
                os.path.join(file_prefix,
                             x.name.split("/")[-1]) for x in f.getmembers()
                if x.name.endswith(".jsonl")
            ]
        dev_file_num = 0
    else:
        print(f"randomly sample test file {test_file}")
        test_file_num = int(file_num * 0.1)
        dev_file_num = int(file_num * 0.1)
        test_files = list(
            np.random.choice(all_files, size=test_file_num, replace=False))

    test_files_set = set(test_files)
    train_files = [fname for fname in all_files if fname not in test_files_set]

    if dev_file_num == 0:
        dev_file_num = int(len(train_files) * 0.1)

    np.random.shuffle(train_files)
    dev_files = train_files[-dev_file_num:]
    train_files = train_files[:-dev_file_num]

    # Create types from filtered training set
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            type_dumper,
            ((tgt_folder, fname) for fname in train_files),
            chunksize=64,
        )
    print("reading typelib")
    typelib = TypeLib()
    for fname in tqdm(train_files):
        fname = os.path.basename(fname)
        fname = fname[:fname.index(".")] + ".jsonl"
        typelib.add_json_file(os.path.join(tgt_folder, "types", fname))
    typelib.prune(5)
    typelib.sort()

    print("dumping typelib")
    with open(os.path.join(tgt_folder, "typelib.json"), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)

    train_functions = dict()
    for train_file in train_files:
        file_name = train_file.split("/")[-1]
        for func_name, func in all_functions[file_name].items():
            train_functions.setdefault(func_name, set()).add(func)

    print(
        f"number training: {len(train_files)}",
        f"number dev: {len(dev_files)}",
        f"number test: {len(test_files)}",
        sep=", ",
    )
    print("dump training files")
    shards = [
        train_files[i:i + shard_size]
        for i in range(0, len(train_files), shard_size)
    ]
    for shard_id, shard_files in enumerate(shards):
        print(f"Preparing shard {shard_id}, {len(shard_files)} files: ")
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in shard_files:
                f.write(file_name.split("/")[-1] + "\n")

        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../train-shard-{shard_id}.tar -T ../file_list.txt")
        os.chdir(cur_dir)

    def _dump_dev_file(tgt_file_name, file_names):
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in file_names:
                last_file_name = file_name.split("/")[-1]
                f.write(last_file_name + "\n")

                with open(file_name) as fr:
                    all_lines = fr.readlines()

                replace_lines = []
                for line in all_lines:
                    json_dict = json.loads(line.strip())
                    func_name = json_dict["name"]
                    canonical_code = all_functions[last_file_name][func_name]
                    func_name_in_train = False
                    func_body_in_train = False
                    if func_name in train_functions:
                        func_name_in_train = True
                        if canonical_code in train_functions[func_name]:
                            func_body_in_train = True

                    json_dict["test_meta"] = dict(
                        function_name_in_train=func_name_in_train,
                        function_body_in_train=func_body_in_train,
                    )
                    new_json_str = json.dumps(json_dict)
                    replace_lines.append(new_json_str.strip())

                with open(file_name, "w") as fw:
                    for line in replace_lines:
                        fw.write(line + "\n")

        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../{tgt_file_name} -T ../file_list.txt")
        os.chdir(cur_dir)

    print("dump dev files")
    _dump_dev_file("dev.tar", dev_files)
    print("dump test files")
    _dump_dev_file("test.tar", test_files)
Example #14
 def from_json(cls, d):
     typ = TypeLibCodec.decode(dumps(d["t"]))
     return cls(typ=typ, name=d["n"], user=d["u"])