def __init__(self, config):
    super(XfmrDecoder, self).__init__()
    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    retype_vocab_size = len(self.vocab.types)
    rename_vocab_size = len(self.vocab.names)
    self.target_embedding = nn.Embedding(
        retype_vocab_size + rename_vocab_size,
        config["target_embedding_size"],
    )
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        1,
        config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(
        decoder_layer, config["num_layers"], decoder_norm
    )
    self.output = nn.Linear(
        config["hidden_size"], retype_vocab_size + rename_vocab_size
    )
    self.mem_mask = config["mem_mask"]
    self.config: Dict = config
    self.retype_vocab_size = retype_vocab_size
def __init__(self, config):
    super(XfmrDecoder, self).__init__()
    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    self.typelib = self.typelib.fix()
    self.target_embedding = nn.Embedding(
        len(self.vocab.subtypes), config["target_embedding_size"]
    )
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    # self.cached_decode_mask: Dict[int, torch.Tensor] = {}
    # self.size = torch.zeros(len(self.vocab.types), dtype=torch.long)
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        1,
        config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(
        decoder_layer, config["num_layers"], decoder_norm
    )
    self.output = nn.Linear(config["hidden_size"], len(self.vocab.subtypes))
    self.config: Dict = config
def __init__(self, url: str, config: Optional[Dict] = None, percent: float = 1.0):
    # support wildcards
    urls = sorted(glob.glob(url))
    urls = urls[: int(percent * len(urls))]
    super().__init__(urls)
    if config:
        # annotate example for training
        from dirty.utils.vocab import Vocab

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
        self.max_src_tokens_len = config["max_src_tokens_len"]
        self.max_num_var = config["max_num_var"]
        annotate = self._annotate
        self.rename = config.get("rename", False)
        # sort = Dataset._sort
        sort = identity
    else:
        # for creating the vocab
        annotate = identity
        sort = identity
    # each pipeline stage returns the dataset itself, so this chain configures
    # the dataset in place
    self = (
        self.pipe(Dataset._file_iter_to_line_iter)
        .map(Example.from_json)
        .map(annotate)
        .shuffle(Dataset.SHUFFLE_BUFFER)
        .pipe(sort)
    )
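# Usage sketch (not from the source): the two modes of this constructor. With a
# config dict every Example is annotated via self._annotate; without one the raw
# Examples are yielded, e.g. when building the vocab. The shard pattern and the
# "data/" prefix are assumptions; the config comes from the same jsonnet file as
# in the evaluation script further below.
import json
import _jsonnet

config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
annotated = Dataset("data/train-shard-*.tar", config["data"])  # annotated examples
raw = Dataset("data/train-shard-*.tar")  # raw examples, no annotation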
def write_type_lib(self) -> None:
    """Dumps the type library to the gzipped file at `self.type_lib_file_name`
    (derived from the `OUTPUT_DIR` and `PREFIX` environment variables)."""
    with gzip.open(self.type_lib_file_name, "wt") as type_lib_file:
        encoded = TypeLibCodec.encode(self.type_lib)
        type_lib_file.write(encoded)
        type_lib_file.flush()
def from_json(cls, d) -> "Call.Arg": formal_type: TypeInfo = TypeLibCodec.decode(dumps( d["t"])) # type: ignore return cls( node_id=d["id"], is_vararg=d["va"], idx=d["i"], name=d["n"], formal_type=formal_type, )
def _preprocess(self):
    self.vocab.types.struct_set = set()
    for idx, type_str in self.vocab.types.id2word.items():
        if type_str.startswith("struct"):
            self.vocab.types.struct_set.add(idx)
    with open(self.config["data"]["typelib_file"]) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
    self.typstr_to_piece = {}
    for size in typelib:
        for _, tp in typelib[size]:
            self.typstr_to_piece[str(tp)] = tp.tokenize()[:-1]
    self.typstr_to_piece["<unk>"] = ["<unk>"]
def type_dumper(args):
    tgt_folder, fname = args
    typelib = TypeLib()
    with open(fname, "r") as f:
        for line in f:
            e = Example.from_json(json.loads(line))
            for var in e.target.values():
                typelib.add(var.typ)
    typelib.sort()
    with open(
        os.path.join(tgt_folder, "types", fname.split("/")[-1]), "w"
    ) as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)
def __init__(self):
    # Load the type library
    self.type_lib_file_name = os.path.join(
        os.environ["OUTPUT_DIR"],
        "types",
        os.environ["PREFIX"] + ".json.gz",
    )
    try:
        with gzip.open(self.type_lib_file_name, "rt") as type_lib_file:
            self.type_lib = TypeLibCodec.decode(type_lib_file.read())
    except Exception as e:
        print(e)
        print("Could not find type library, creating a new one")
        self.type_lib = TypeLib()
    super().__init__()
@classmethod
def from_json(cls, d):
    ast = AST.from_json(d["t"]) if d["t"] else None
    return_type = TypeLibCodec.decode(dumps(d["r"]))
    arguments = dict()
    for key, args in d["a"].items():
        arguments[location_from_json_key(key)] = {
            Variable.from_json(arg) for arg in args
        }
    local_vars = dict()
    for key, locs in d["l"].items():
        local_vars[location_from_json_key(key)] = {
            Variable.from_json(loc) for loc in locs
        }
    return cls(
        ast=ast,
        name=d["n"],
        return_type=return_type,
        arguments=arguments,
        local_vars=local_vars,
        raw_code=d["c"],
    )
def __init__(self, config):
    super(XfmrDecoder, self).__init__()
    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    vocab_size = (
        len(self.vocab.names)
        if config.get("rename", False)
        else len(self.vocab.types)
    )
    self.target_id_key = (
        "target_name_id" if config.get("rename", False) else "target_type_id"
    )
    self.target_embedding = nn.Embedding(vocab_size, config["target_embedding_size"])
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    self.cached_decode_mask: Dict[int, torch.Tensor] = {}
    self.size = torch.zeros(vocab_size, dtype=torch.long)
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        config["num_heads"],
        4 * config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(
        decoder_layer, config["num_layers"], decoder_norm
    )
    self.output = nn.Linear(config["hidden_size"], vocab_size)
    self.mem_mask = config["mem_mask"]
    if config.get("rename", False):
        self.mem_mask = "none"
    self.config: Dict = config
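# Construction sketch (not from the source): the config keys this __init__
# reads, gathered into one dict. The numeric values and file paths are made-up
# placeholders; in the project they come from the jsonnet config used by the
# evaluation script below.
decoder_config = {
    "vocab_file": "data/vocab.bpe10000",  # placeholder path
    "typelib_file": "data/typelib.json",  # written by the preprocessing main() below
    "target_embedding_size": 256,
    "hidden_size": 256,
    "num_heads": 1,
    "num_layers": 2,
    "dropout": 0.1,
    "mem_mask": "none",
    "rename": False,  # True switches the decoder to name prediction
}
decoder = XfmrDecoder(decoder_config)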
def from_json(cls, d) -> "Type": typ: TypeInfo = TypeLibCodec.decode(dumps(d["t"])) # type: ignore return cls(node_id=d["id"], typ=typ)
import json
from typing import Any, Dict

import _jsonnet
from csvnpm.binary.dire_types import TypeLibCodec
from torch.utils.data import DataLoader
from tqdm import tqdm

from dirty.utils.dataset import Dataset  # type: ignore

if __name__ == "__main__":
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset, num_workers=8, batch_size=None)
    with open(config["data"]["typelib_file"]) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
    # most common type string for each variable size
    most_common_for_size = {}
    types_model = dataset.vocab.types
    for size in typelib:
        freq, tp = typelib[size][0]
        most_common_for_size[size] = str(tp)
    results: Dict[str, Dict[str, Any]] = {}
    for example in tqdm(dataloader):
        for src_name, src_type, tgt_var_mem in zip(
            example.src_var_names,
            example.src_var_types_str,
            example.tgt_var_src_mems,
        ):
            results.setdefault(example.binary, {}).setdefault(example.name, {})[
                src_name[2:-2]
            ] = (
def main(args):
    np.random.seed(1234)
    random.seed(1992)
    tgt_folder = args["TARGET_FOLDER"]
    input_folder = args["INPUT_FOLDER"]
    input_fnames_file = args["INPUT_FNAMES"]
    input_fnames = []
    max_files = int(args["--max"])
    with open(input_fnames_file) as f:
        for s in f:
            s = s.strip()
            if s.endswith(".gz"):
                input_fnames.append(s)
            if len(input_fnames) >= max_files:
                break
    shard_size = int(args["--shard-size"])
    if os.path.exists(tgt_folder):
        op = input(f"{tgt_folder} exists. remove? (y/n) ")
        if op == "y":
            shutil.rmtree(tgt_folder)
    os.system(f"mkdir -p {tgt_folder}")
    os.system(f"mkdir -p {tgt_folder}/files")
    os.system(f"mkdir -p {tgt_folder}/types")
    num_workers = 16
    valid_example_count = 0
    all_functions = dict()  # canonical code of each function, indexed by binary file
    print("loading examples")
    with multiprocessing.Pool(num_workers) as pool:
        json_iter = pool.imap(
            json_line_reader,
            ((input_folder, fname) for fname in input_fnames),
            chunksize=64,
        )
        example_iter = pool.imap(example_generator, json_iter, chunksize=64)
        for examples in tqdm(example_iter):
            if not examples:
                continue
            json_file_name = examples[0].binary_file["file_name"].split("/")[-1]
            with open(os.path.join(tgt_folder, "files/", json_file_name), "w") as f:
                for example in examples:
                    f.write(dumps(example.to_json()) + "\n")
                    all_functions.setdefault(json_file_name, dict())[
                        example.name
                    ] = example.canonical_code
            valid_example_count += len(examples)
    print("valid examples: ", valid_example_count)
    cur_dir = os.getcwd()
    all_files = glob.glob(os.path.join(tgt_folder, "files/*.jsonl"))
    file_prefix = os.path.join(tgt_folder, "files/")
    all_files = sorted(all_files)  # sort all files by names
    file_num = len(all_files)
    print("Total valid binary file num: ", file_num)
    test_file = args["--test-file"]
    if test_file:
        print(f"using test file {test_file}")
        with tarfile.open(test_file, "r") as f:
            test_files = [
                os.path.join(file_prefix, x.name.split("/")[-1])
                for x in f.getmembers()
                if x.name.endswith(".jsonl")
            ]
        dev_file_num = 0
    else:
        print("randomly sampling test files")
        test_file_num = int(file_num * 0.1)
        dev_file_num = int(file_num * 0.1)
        test_files = list(
            np.random.choice(all_files, size=test_file_num, replace=False)
        )
    test_files_set = set(test_files)
    train_files = [fname for fname in all_files if fname not in test_files_set]
    if dev_file_num == 0:
        dev_file_num = int(len(train_files) * 0.1)
    np.random.shuffle(train_files)
    dev_files = train_files[-dev_file_num:]
    train_files = train_files[:-dev_file_num]
    # Create types from filtered training set
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            type_dumper,
            ((tgt_folder, fname) for fname in train_files),
            chunksize=64,
        )
    print("reading typelib")
    typelib = TypeLib()
    for fname in tqdm(train_files):
        fname = os.path.basename(fname)
        fname = fname[: fname.index(".")] + ".jsonl"
        typelib.add_json_file(os.path.join(tgt_folder, "types", fname))
    typelib.prune(5)
    typelib.sort()
    print("dumping typelib")
    with open(os.path.join(tgt_folder, "typelib.json"), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)
    train_functions = dict()
    for train_file in train_files:
        file_name = train_file.split("/")[-1]
        for func_name, func in all_functions[file_name].items():
            train_functions.setdefault(func_name, set()).add(func)
    print(
        f"number training: {len(train_files)}",
        f"number dev: {len(dev_files)}",
        f"number test: {len(test_files)}",
        sep=", ",
    )
    print("dump training files")
    shards = [
        train_files[i : i + shard_size]
        for i in range(0, len(train_files), shard_size)
    ]
    for shard_id, shard_files in enumerate(shards):
        print(f"Preparing shard {shard_id}, {len(shard_files)} files: ")
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in shard_files:
                f.write(file_name.split("/")[-1] + "\n")
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../train-shard-{shard_id}.tar -T ../file_list.txt")
        os.chdir(cur_dir)

    def _dump_dev_file(tgt_file_name, file_names):
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in file_names:
                last_file_name = file_name.split("/")[-1]
                f.write(last_file_name + "\n")
                with open(file_name) as fr:
                    all_lines = fr.readlines()
                replace_lines = []
                for line in all_lines:
                    json_dict = json.loads(line.strip())
                    func_name = json_dict["name"]
                    canonical_code = all_functions[last_file_name][func_name]
                    func_name_in_train = False
                    func_body_in_train = False
                    if func_name in train_functions:
                        func_name_in_train = True
                        if canonical_code in train_functions[func_name]:
                            func_body_in_train = True
                    json_dict["test_meta"] = dict(
                        function_name_in_train=func_name_in_train,
                        function_body_in_train=func_body_in_train,
                    )
                    new_json_str = json.dumps(json_dict)
                    replace_lines.append(new_json_str.strip())
                with open(file_name, "w") as fw:
                    for line in replace_lines:
                        fw.write(line + "\n")
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../{tgt_file_name} -T ../file_list.txt")
        os.chdir(cur_dir)

    print("dump dev files")
    _dump_dev_file("dev.tar", dev_files)
    print("dump test files")
    _dump_dev_file("test.tar", test_files)
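# Invocation sketch (not from the source): main() expects a docopt-style args
# dict; the keys below are exactly the ones it reads. The paths and numeric
# values are placeholders.
if __name__ == "__main__":
    main(
        {
            "TARGET_FOLDER": "data/preprocessed",
            "INPUT_FOLDER": "data/raw",
            "INPUT_FNAMES": "fnames.txt",
            "--max": "1000000",
            "--shard-size": "3000",
            "--test-file": None,
        }
    )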
@classmethod
def from_json(cls, d):
    typ = TypeLibCodec.decode(dumps(d["t"]))
    return cls(typ=typ, name=d["n"], user=d["u"])
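# Round-trip sketch (not from the source): the encode/decode pair used by the
# snippets above. TypeLibCodec.encode turns a TypeLib into a JSON string and
# TypeLibCodec.decode parses it back; decode is also applied to single
# re-serialized type fields via dumps(d["t"]). The TypeLib import path is an
# assumption based on the TypeLibCodec import in the evaluation script above.
from csvnpm.binary.dire_types import TypeLib, TypeLibCodec

lib = TypeLib()
encoded = TypeLibCodec.encode(lib)
restored = TypeLibCodec.decode(encoded)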