def split_graphs(data_dir, out_dir, train_ratio=0.7, valid_ratio=0.1):
    num_train, num_valid, num_test = 0, 0, 0
    # Deterministically assign each graph to train/valid/test by filename and
    # stream it into the matching chunked output folder.
    with ChunkWriter(os.path.join(out_dir, 'train'), file_prefix='graph-',
                     max_chunk_size=1000, file_suffix='.jsonl.gz') as train_w, \
         ChunkWriter(os.path.join(out_dir, 'valid'), file_prefix='graph-',
                     max_chunk_size=1000, file_suffix='.jsonl.gz') as valid_w, \
         ChunkWriter(os.path.join(out_dir, 'test'), file_prefix='graph-',
                     max_chunk_size=1000, file_suffix='.jsonl.gz') as test_w:
        for f in tqdm(iglob(os.path.join(data_dir, "*.jsonl.gz"))):
            for ex in load_jsonl_gz(f):
                partition = get_fold(ex["filename"], train_ratio, valid_ratio)
                if partition == "train":
                    train_w.add(ex)
                    num_train += 1
                elif partition == "valid":
                    valid_w.add(ex)
                    num_valid += 1
                else:
                    test_w.add(ex)
                    num_test += 1

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s",
    )
    logging.info("Train: %d", num_train)
    logging.info("Valid: %d", num_valid)
    logging.info("Test: %d", num_test)
    logging.info("Total: %d", num_train + num_valid + num_test)
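# `get_fold` is called above but not defined in this snippet. The sketch below
# is a hypothetical illustration of such a helper, assuming a deterministic
# hash-based split on the filename; it is not the project's actual
# implementation, and the name `get_fold_sketch` is ours.
import hashlib

def get_fold_sketch(filename: str, train_ratio: float, valid_ratio: float) -> str:
    # Hash the filename so the same file always lands in the same partition.
    bucket = (int(hashlib.md5(filename.encode('utf-8')).hexdigest(), 16) % 100) / 100.0
    if bucket < train_ratio:
        return "train"
    elif bucket < train_ratio + valid_ratio:
        return "valid"
    return "test"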
def run(args: Namespace):
    filenames = get_files(args.input)
    lattice = TypeLatticeGenerator(args.type_rules)
    skipped_files, errored_files = [], []
    graph_generator = generate_graphs_for_files(filenames, skipped_files, errored_files, lattice)

    os.makedirs(args.output, exist_ok=True)
    if args.format == "dot":
        for i, (generator, graph, fname) in tqdm(enumerate(graph_generator)):
            generator.to_dot(os.path.join(args.output, f"graph_{i}.dot"), initial_comment=fname)
    elif args.format == "jsonl_gz":
        with ChunkWriter(out_folder=args.output, file_prefix='all-graphs',
                         max_chunk_size=5000, file_suffix='.jsonl.gz') as writer:
            for i, (generator, graph, fname) in tqdm(enumerate(graph_generator)):
                graph['filename'] = fname
                writer.add(graph)
    else:
        raise ValueError(f'File format {args.format} is not supported')

    print(f'Finished parsing. Skipped {len(skipped_files)}, failed on {len(errored_files)}')
    if len(skipped_files) > 0:
        print('Skipped:')
        for fname in skipped_files:
            print(fname)
    if len(errored_files) > 0:
        print('Failed on:')
        for fname, e in errored_files:
            print(fname)
            if args.print_errors:
                print(str(e))
def test_write_read_msgpack_sequential(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p, file_prefix='test', max_chunk_size=123,
                              file_suffix='-test.msgpack.l.gz'),
        suffix='.msgpack.l.gz')
def test_write_read_jsonl_parallel(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p, file_prefix='test', max_chunk_size=123,
                              file_suffix='-test.jsonl.gz', parallel_writers=5))
def test_write_read_msgpack_parallel(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p, file_prefix='test', max_chunk_size=123,
                              file_suffix='-test.msgpack.l.gz', parallel_writers=5),
        suffix='.msgpack.l.gz')
def main(arguments):
    try:
        start_time = time.time()
        print("Exploring folders ...")
        walk_dir = arguments['SOURCE_FOLDER']
        monitoring = Monitoring()
        type_lattice = TypeLatticeGenerator(arguments['TYPING_RULES'])

        with open(arguments['DUPLICATES_JSON'], errors='ignore') as f:
            duplicates = json.load(f)
        all_to_remove = set()  # type: Set[str]
        for duplicate_cluster in duplicates:
            # Keep the first element; everything else in the cluster is ignored.
            all_to_remove.update(duplicate_cluster[1:])

        # Extract graphs
        outputs = explore_files(walk_dir, all_to_remove, monitoring, type_lattice)

        # Save results
        with ChunkWriter(out_folder=arguments['SAVE_FOLDER'], file_prefix='all-graphs',
                         max_chunk_size=5000, file_suffix='.jsonl.gz') as writer:
            for graph in outputs:
                writer.add(graph)
    except bdb.BdbQuit:
        return
    except Exception as e:
        print("e: ", e)
        print(monitoring.current_repo)
        print(monitoring.file)

    print("Building and saving the type graph...")
    type_lattice.build_graph()
    save_jsonl_gz([type_lattice.return_json()],
                  os.path.join(arguments['SAVE_FOLDER'], "_type_lattice.json.gz"))
    print("Done.")

    print("Generated %d graphs out of %d snippets"
          % (monitoring.count - len(monitoring.errors), monitoring.count))

    with open(os.path.join(arguments['SAVE_FOLDER'], 'logs_graph_generator.txt'), 'w') as f:
        for item in monitoring.errors:
            try:
                f.write("%s\n" % item)
            except Exception:
                pass

    print("\nExecution in: ", time.time() - start_time, " seconds")
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)

    with ChunkWriter(output_folder, file_prefix='codedata',
                     max_chunk_size=500, file_suffix='.jsonl.gz') as chunked_writer:
        for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
            for line in file.read_by_file_suffix():
                tokens = line['code_tokens']
                chunked_writer.add(
                    dict(filename='%s:%s:%s' % (line['repo'], line['path'], line['lineno']),
                         tokens=tokens))
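# A short sketch of how the 'codedata' chunks written above could be iterated
# later. It reuses only the RichPath calls already used in this function
# (`iterate_filtered_files_in_dir`, `read_by_file_suffix`) and assumes
# `output_folder` is a RichPath as constructed above; the exact chunk-file
# naming is an assumption, so a broad glob pattern is used.
def iterate_code_data(output_folder):
    for chunk_file in output_folder.iterate_filtered_files_in_dir('codedata*.jsonl.gz'):
        for sample in chunk_file.read_by_file_suffix():
            yield sample['filename'], sample['tokens']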
def extract_graphs(root_dir, typing_rules_path, files_to_extract: Set[str], target_folder):
    start_time = time.time()
    print("Traversing folders ...")
    monitoring = Monitoring()
    type_lattice = TypeLatticeGenerator(typing_rules_path)

    # Extract graphs
    outputs = explore_files(root_dir, files_to_extract, monitoring, type_lattice)

    # Save results
    with ChunkWriter(
        out_folder=target_folder,
        file_prefix="all-graphs",
        max_chunk_size=5000,
        file_suffix=".jsonl.gz",
    ) as writer:
        for graph in outputs:
            writer.add(graph)

    print("Building and saving the type graph...")
    type_lattice.build_graph()
    save_jsonl_gz(
        [type_lattice.return_json()],
        os.path.join(target_folder, "_type_lattice.json.gz"),
    )
    print("Done.")

    print(
        "Generated %d graphs out of %d snippets"
        % (monitoring.count - len(monitoring.errors), monitoring.count)
    )

    with open(os.path.join(target_folder, "logs_graph_generator.txt"), "w") as f:
        for item in monitoring.errors:
            try:
                f.write("%s\n" % item)
            except Exception:
                pass

    print("\nGraph Execution in: ", time.time() - start_time, " seconds")
parser.add_argument(
    "-train-pct",
    type=float,
    default=0.7,
    help="ratio of training set to whole corpus",
)
parser.add_argument(
    "-valid-pct",
    type=float,
    default=0.1,
    help="ratio of validation set to whole corpus",
)
args = parser.parse_args()

num_train, num_valid, num_test = 0, 0, 0
with ChunkWriter(os.path.join(args.out_dir, 'train'), file_prefix='graph-',
                 max_chunk_size=1000, file_suffix='.jsonl.gz') as train_w, \
     ChunkWriter(os.path.join(args.out_dir, 'valid'), file_prefix='graph-',
                 max_chunk_size=1000, file_suffix='.jsonl.gz') as valid_w, \
     ChunkWriter(os.path.join(args.out_dir, 'test'), file_prefix='graph-',
                 max_chunk_size=1000, file_suffix='.jsonl.gz') as test_w:
    for f in tqdm(iglob(os.path.join(args.data_dir, "*.jsonl.gz"))):
        for ex in load_jsonl_gz(f):
            partition = get_fold(ex["filename"], args.train_pct, args.valid_pct)
            if partition == "train":
                train_w.add(ex)
                num_train += 1
            elif partition == "valid":
                valid_w.add(ex)
                num_valid += 1
            else:
                test_w.add(ex)
                num_test += 1
def test_write_read_standard(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p, file_prefix='test', max_chunk_size=123,
                              file_suffix='-test.jsonl.gz'))
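# A minimal sketch of the round-trip the tests above exercise, assuming the
# `__test_write_read` helper (not shown here) writes a batch of records through
# the ChunkWriter built by the factory and then reads every chunk back.
# `ChunkWriter` and `load_jsonl_gz` are assumed to be imported as in the
# snippets above; the temporary directory, record contents, and chunk-file glob
# pattern are illustrative assumptions.
import os
import tempfile
from glob import glob

def round_trip_sketch():
    records = [{"idx": i} for i in range(500)]
    with tempfile.TemporaryDirectory() as tmp_dir:
        with ChunkWriter(tmp_dir, file_prefix='test', max_chunk_size=123,
                         file_suffix='-test.jsonl.gz') as writer:
            for record in records:
                writer.add(record)
        # With max_chunk_size=123, the 500 records should span several chunk files.
        read_back = []
        for chunk_path in sorted(glob(os.path.join(tmp_dir, '*-test.jsonl.gz'))):
            read_back.extend(load_jsonl_gz(chunk_path))
        assert len(read_back) == len(records)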