def type_dumper(args):
    """Collect the types of all target variables in one .jsonl example file
    and dump them as a per-file type library."""
    tgt_folder, fname = args
    typelib = TypeLib()
    with open(fname, "r") as f:
        for line in f:
            e = Example.from_json(json.loads(line))
            for var in e.target.values():
                typelib.add(var.typ)
    typelib.sort()
    with open(os.path.join(tgt_folder, "types", fname.split("/")[-1]), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)
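
# Usage sketch (illustrative paths, not taken from the pipeline below):
# type_dumper packs both arguments into a single tuple so it can be mapped over
# a multiprocessing pool, but it can also be called directly, e.g.
#
#   type_dumper(("dataset-out", "dataset-out/files/binary0.jsonl"))
#
# which writes the per-file type library to dataset-out/types/binary0.jsonl.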

def __init__(self):
    # Load the type library
    self.type_lib_file_name = os.path.join(
        os.environ["OUTPUT_DIR"],
        "types",
        os.environ["PREFIX"] + ".json.gz",
    )
    try:
        with gzip.open(self.type_lib_file_name, "rt") as type_lib_file:
            self.type_lib = TypeLibCodec.decode(type_lib_file.read())
    except Exception as e:
        print(e)
        print("Could not find type library, creating a new one")
        self.type_lib = TypeLib()
    super().__init__()
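
# Usage sketch (assumption: this __init__ belongs to a collector/dumper class
# defined elsewhere in this file; "Collector" below is only an illustrative
# name). The calling harness is expected to export both environment variables
# before the class is instantiated:
#
#   os.environ["OUTPUT_DIR"] = "/path/to/dataset-out"
#   os.environ["PREFIX"] = "binary0"
#   collector = Collector()  # loads /path/to/dataset-out/types/binary0.json.gz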

def main(args):
    np.random.seed(1234)
    random.seed(1992)
    tgt_folder = args["TARGET_FOLDER"]
    input_folder = args["INPUT_FOLDER"]
    input_fnames_file = args["INPUT_FNAMES"]
    input_fnames = []
    max_files = int(args["--max"])
    with open(input_fnames_file) as f:
        for s in f:
            s = s.strip()
            if s.endswith(".gz"):
                input_fnames.append(s)
            if len(input_fnames) >= max_files:
                break
    shard_size = int(args["--shard-size"])
    if os.path.exists(tgt_folder):
        op = input(f"{tgt_folder} exists. remove? (y/n) ")
        if op == "y":
            shutil.rmtree(tgt_folder)
    os.system(f"mkdir -p {tgt_folder}")
    os.system(f"mkdir -p {tgt_folder}/files")
    os.system(f"mkdir -p {tgt_folder}/types")
    num_workers = 16
    valid_example_count = 0

    # Parse the raw per-binary dumps into Example objects and write one
    # .jsonl file per binary under <tgt_folder>/files/.
    print("loading examples")
    with multiprocessing.Pool(num_workers) as pool:
        json_iter = pool.imap(
            json_line_reader,
            ((input_folder, fname) for fname in input_fnames),
            chunksize=64,
        )
        example_iter = pool.imap(example_generator, json_iter, chunksize=64)
        for examples in tqdm(example_iter):
            if not examples:
                continue
            json_file_name = examples[0].binary_file["file_name"].split("/")[-1]
            with open(os.path.join(tgt_folder, "files/", json_file_name), "w") as f:
                for example in examples:
                    f.write(dumps(example.to_json()) + "\n")
                    all_functions.setdefault(json_file_name, dict())[
                        example.name
                    ] = example.canonical_code
            valid_example_count += len(examples)
    print("valid examples: ", valid_example_count)

    cur_dir = os.getcwd()
    all_files = glob.glob(os.path.join(tgt_folder, "files/*.jsonl"))
    file_prefix = os.path.join(tgt_folder, "files/")
    all_files = sorted(all_files)  # sort all files by name
    file_num = len(all_files)
    print("Total valid binary file num: ", file_num)

    # Split into train/dev/test, either from a pre-built test tarball or by
    # random sampling.
    test_file = args["--test-file"]
    if test_file:
        print(f"using test file {test_file}")
        with tarfile.open(test_file, "r") as f:
            test_files = [
                os.path.join(file_prefix, x.name.split("/")[-1])
                for x in f.getmembers()
                if x.name.endswith(".jsonl")
            ]
        dev_file_num = 0
    else:
        print("randomly sampling test files")
        test_file_num = int(file_num * 0.1)
        dev_file_num = int(file_num * 0.1)
        test_files = list(
            np.random.choice(all_files, size=test_file_num, replace=False)
        )
    test_files_set = set(test_files)
    train_files = [fname for fname in all_files if fname not in test_files_set]
    if dev_file_num == 0:
        dev_file_num = int(len(train_files) * 0.1)
    np.random.shuffle(train_files)
    dev_files = train_files[-dev_file_num:]
    train_files = train_files[:-dev_file_num]

    # Create types from filtered training set
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            type_dumper,
            ((tgt_folder, fname) for fname in train_files),
            chunksize=64,
        )
    print("reading typelib")
    typelib = TypeLib()
    for fname in tqdm(train_files):
        fname = os.path.basename(fname)
        fname = fname[: fname.index(".")] + ".jsonl"
        typelib.add_json_file(os.path.join(tgt_folder, "types", fname))
    typelib.prune(5)
    typelib.sort()
    print("dumping typelib")
    with open(os.path.join(tgt_folder, "typelib.json"), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)

    # Index canonicalized training function bodies by name so dev/test examples
    # can later be flagged when their name or body also appears in training.
    train_functions = dict()
    for train_file in train_files:
        file_name = train_file.split("/")[-1]
        for func_name, func in all_functions[file_name].items():
            train_functions.setdefault(func_name, set()).add(func)
    print(
        f"number training: {len(train_files)}",
        f"number dev: {len(dev_files)}",
        f"number test: {len(test_files)}",
        sep=", ",
    )

    print("dump training files")
    shards = [
        train_files[i : i + shard_size]
        for i in range(0, len(train_files), shard_size)
    ]
    for shard_id, shard_files in enumerate(shards):
        print(f"Preparing shard {shard_id}, {len(shard_files)} files: ")
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in shard_files:
                f.write(file_name.split("/")[-1] + "\n")
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../train-shard-{shard_id}.tar -T ../file_list.txt")
        os.chdir(cur_dir)

    def _dump_dev_file(tgt_file_name, file_names):
        # Annotate every dev/test example with whether its function name and
        # canonical body also occur in the training set, then tar the files.
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in file_names:
                last_file_name = file_name.split("/")[-1]
                f.write(last_file_name + "\n")
                with open(file_name) as fr:
                    all_lines = fr.readlines()
                replace_lines = []
                for line in all_lines:
                    json_dict = json.loads(line.strip())
                    func_name = json_dict["name"]
                    canonical_code = all_functions[last_file_name][func_name]
                    func_name_in_train = False
                    func_body_in_train = False
                    if func_name in train_functions:
                        func_name_in_train = True
                        if canonical_code in train_functions[func_name]:
                            func_body_in_train = True
                    json_dict["test_meta"] = dict(
                        function_name_in_train=func_name_in_train,
                        function_body_in_train=func_body_in_train,
                    )
                    new_json_str = json.dumps(json_dict)
                    replace_lines.append(new_json_str.strip())
                with open(file_name, "w") as fw:
                    for line in replace_lines:
                        fw.write(line + "\n")
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../{tgt_file_name} -T ../file_list.txt")
        os.chdir(cur_dir)

    print("dump dev files")
    _dump_dev_file("dev.tar", dev_files)
    print("dump test files")
    _dump_dev_file("test.tar", test_files)
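
# A minimal entry-point sketch, not part of the original pipeline: the docopt
# usage string below is an assumption reconstructed from the keys read out of
# `args` in main() ("TARGET_FOLDER", "INPUT_FOLDER", "INPUT_FNAMES", "--max",
# "--shard-size", "--test-file"); the script name and default values are
# placeholders, docopt is an external dependency, and this module may already
# define its own usage string elsewhere, in which case that one applies.
if __name__ == "__main__":
    from docopt import docopt

    usage = """
Usage:
    generate.py [options] INPUT_FOLDER INPUT_FNAMES TARGET_FOLDER

Options:
    --max=<n>         Maximum number of input file names to read [default: 100000]
    --shard-size=<n>  Number of .jsonl files per training shard [default: 3000]
    --test-file=<f>   Optional tar of .jsonl files fixing the test split
"""
    main(docopt(usage))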