Example #1
def split_graphs(data_dir, out_dir, train_ratio=0.7, valid_ratio=0.1):
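    """Split the graph samples found under data_dir into train/valid/test
    partitions (chosen per example filename by get_fold) and write each
    partition as chunked .jsonl.gz files under out_dir."""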

    num_train, num_valid, num_test = 0, 0, 0
    with ChunkWriter(os.path.join(out_dir, 'train'), file_prefix='graph-', max_chunk_size=1000, file_suffix='.jsonl.gz') as train_w,\
        ChunkWriter(os.path.join(out_dir, 'valid'), file_prefix='graph-', max_chunk_size=1000, file_suffix='.jsonl.gz') as valid_w,\
        ChunkWriter(os.path.join(out_dir, 'test'), file_prefix='graph-', max_chunk_size=1000, file_suffix='.jsonl.gz') as test_w:
        for f in tqdm(iglob(os.path.join(data_dir, "*.jsonl.gz"))):
            for ex in load_jsonl_gz(f):
                partition = get_fold(ex["filename"], train_ratio, valid_ratio)
                if partition == "train":
                    train_w.add(ex)
                    num_train += 1
                elif partition == "valid":
                    valid_w.add(ex)
                    num_valid += 1
                else:
                    test_w.add(ex)
                    num_test += 1

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s",
    )
    logging.info("Train: %d", num_train)
    logging.info("Valid: %d", num_valid)
    logging.info("Test: %d", num_test)
    logging.info("Total: %d", num_train + num_valid + num_test)
Example #2
def run(args: Namespace):
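    """Generate graphs for the input files and write them in the format chosen
    by args.format: one .dot file per graph, or chunked .jsonl.gz files."""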
    filenames = get_files(args.input)
    lattice = TypeLatticeGenerator(args.type_rules)
    skipped_files, errored_files = [], []
    graph_generator = generate_graphs_for_files(filenames, skipped_files, errored_files, lattice)

    os.makedirs(args.output, exist_ok=True)

    if args.format == "dot":
        for i, (generator, graph, fname) in tqdm(enumerate(graph_generator)):
            generator.to_dot(os.path.join(args.output, f"graph_{i}.dot"), initial_comment=fname)
    elif args.format == "jsonl_gz":
        with ChunkWriter(out_folder=args.output, file_prefix='all-graphs',
                         max_chunk_size=5000, file_suffix='.jsonl.gz') as writer:
            for i, (generator, graph, fname) in tqdm(enumerate(graph_generator)):
                graph['filename'] = fname
                writer.add(graph)
    else:
        raise ValueError(f'File format {args.format} is not supported')
    print(f'Finished parsing. Skipped {len(skipped_files)}, failed on {len(errored_files)}')
    if len(skipped_files) > 0:
        print('Skipped:')
        for fname in skipped_files:
            print(fname)
    if len(errored_files) > 0:
        print('Failed on:')
        for fname, e in errored_files:
            print(fname)
            if args.print_errors:
                print(str(e))
Example #3
def test_write_read_msgpack_sequential(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p,
                              file_prefix='test',
                              max_chunk_size=123,
                              file_suffix='-test.msgpack.l.gz'),
        suffix='.msgpack.l.gz')
Example #4
def test_write_read_jsonl_parallel(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p,
                              file_prefix='test',
                              max_chunk_size=123,
                              file_suffix='-test.jsonl.gz',
                              parallel_writers=5))
Example #5
def test_write_read_msgpack_parallel(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p,
                              file_prefix='test',
                              max_chunk_size=123,
                              file_suffix='-test.msgpack.l.gz',
                              parallel_writers=5),
        suffix='.msgpack.l.gz')
Example #6
def main(arguments):
    try:
        start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
        print("Exploring folders ...")
        walk_dir = arguments['SOURCE_FOLDER']
        monitoring = Monitoring()
        type_lattice = TypeLatticeGenerator(arguments['TYPING_RULES'])

        with open(arguments['DUPLICATES_JSON'], errors='ignore') as f:
            duplicates = json.load(f)
            all_to_remove = set()  # type: Set[str]
            for duplicate_cluster in duplicates:
                # Keep the first element, everything else should be ignored
                all_to_remove.update(duplicate_cluster[1:])

        # Extract graphs
        outputs = explore_files(walk_dir, all_to_remove, monitoring,
                                type_lattice)

        # Save results
        with ChunkWriter(out_folder=arguments['SAVE_FOLDER'],
                         file_prefix='all-graphs',
                         max_chunk_size=5000,
                         file_suffix='.jsonl.gz') as writer:
            for graph in outputs:
                writer.add(graph)
    except bdb.BdbQuit:
        return
    except Exception as e:
        print("e: ", e)
        print(monitoring.current_repo)
        print(monitoring.file)

    print("Building and saving the type graph...")
    type_lattice.build_graph()
    save_jsonl_gz([type_lattice.return_json()],
                  os.path.join(arguments['SAVE_FOLDER'],
                               "_type_lattice.json.gz"))

    print("Done.")
    print("Generated %d graphs out of %d snippets" %
          (monitoring.count - len(monitoring.errors), monitoring.count))

    with open(
            os.path.join(arguments['SAVE_FOLDER'], 'logs_graph_generator.txt'),
            'w') as f:
        for item in monitoring.errors:
            try:
                f.write("%s\n" % item)
            except Exception:
                pass  # skip log entries that cannot be written

    print("\nExecution in: ", time.clock() - start_time, " seconds")
Example #7
def run(arguments):
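    """Re-chunk the code-token corpus into 500-record .jsonl.gz files, keying
    each record by its repo:path:lineno source location."""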
    azure_info_path = arguments.get('--azure-info', None)
    input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)

    with ChunkWriter(output_folder,
                     file_prefix='codedata',
                     max_chunk_size=500,
                     file_suffix='.jsonl.gz') as chunked_writer:
        for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
            for line in file.read_by_file_suffix():
                tokens = line['code_tokens']
                chunked_writer.add(
                    dict(filename='%s:%s:%s' %
                         (line['repo'], line['path'], line['lineno']),
                         tokens=tokens))
Example #8
def extract_graphs(root_dir, typing_rules_path, files_to_extract: Set[str], target_folder):
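    """Generate graphs for the given files, write them to target_folder as
    chunked .jsonl.gz files, and save the serialized type lattice alongside
    them as _type_lattice.json.gz."""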
    start_time = time.time()
    print("Traversing folders ...")
    monitoring = Monitoring()
    type_lattice = TypeLatticeGenerator(typing_rules_path)

    # Extract graphs
    outputs = explore_files(root_dir, files_to_extract, monitoring, type_lattice)

    # Save results
    with ChunkWriter(
        out_folder=target_folder,
        file_prefix="all-graphs",
        max_chunk_size=5000,
        file_suffix=".jsonl.gz",
    ) as writer:
        for graph in outputs:
            writer.add(graph)

    print("Building and saving the type graph...")
    type_lattice.build_graph()
    save_jsonl_gz(
        [type_lattice.return_json()], os.path.join(target_folder, "_type_lattice.json.gz"),
    )

    print("Done.")
    print(
        "Generated %d graphs out of %d snippets"
        % (monitoring.count - len(monitoring.errors), monitoring.count)
    )

    with open(os.path.join(target_folder, "logs_graph_generator.txt"), "w") as f:
        for item in monitoring.errors:
            try:
                f.write("%s\n" % item)
            except Exception:
                pass  # skip log entries that cannot be written

    print("\nGraph Execution in: ", time.time() - start_time, " seconds")
Example #9
    parser.add_argument(
        "-train-pct",
        type=float,
        default=0.7,
        help="ratio of training set to whole corpus",
    )
    parser.add_argument(
        "-valid-pct",
        type=float,
        default=0.1,
        help="ratio of validation set to whole corpus",
    )
    args = parser.parse_args()

    num_train, num_valid, num_test = 0, 0, 0
    with ChunkWriter(os.path.join(args.out_dir, 'train'), file_prefix='graph-', max_chunk_size=1000, file_suffix='.jsonl.gz') as train_w,\
        ChunkWriter(os.path.join(args.out_dir, 'valid'), file_prefix='graph-', max_chunk_size=1000, file_suffix='.jsonl.gz') as valid_w,\
        ChunkWriter(os.path.join(args.out_dir, 'test'), file_prefix='graph-', max_chunk_size=1000, file_suffix='.jsonl.gz') as test_w:
        for f in tqdm(iglob(os.path.join(args.data_dir, "*.jsonl.gz"))):
            for ex in load_jsonl_gz(f):
                partition = get_fold(
                    ex["filename"], args.train_pct, args.valid_pct
                )
                if partition == "train":
                    train_w.add(ex)
                    num_train += 1
                elif partition == "valid":
                    valid_w.add(ex)
                    num_valid += 1
                else:
                    test_w.add(ex)
                    num_test += 1
Example #10
def test_write_read_standard(self):
    self.__test_write_read(
        lambda p: ChunkWriter(p, file_prefix='test', max_chunk_size=123, file_suffix='-test.jsonl.gz'))
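All of the examples above follow the same pattern: open a ChunkWriter as a context manager, add() one JSON-serializable record at a time, and let the writer split the stream into fixed-size chunk files. The sketch below distills that round trip; it assumes ChunkWriter and load_jsonl_gz come from dpu_utils (the imports are not shown in the examples), and the folder name and record contents are illustrative only.

import os
from glob import iglob

# Assumed import location: the examples above do not show their imports.
from dpu_utils.utils import ChunkWriter, load_jsonl_gz

out_dir = "./chunks"  # hypothetical output folder for this sketch
os.makedirs(out_dir, exist_ok=True)

# Write: added records are buffered and flushed to gzipped JSON-lines chunk
# files (named with the given prefix/suffix) whenever max_chunk_size records
# have accumulated; leaving the context manager flushes the final partial chunk.
with ChunkWriter(out_dir, file_prefix='data-', max_chunk_size=1000,
                 file_suffix='.jsonl.gz') as writer:
    for i in range(2500):
        writer.add({'idx': i})

# Read back: each chunk is an ordinary .jsonl.gz file, so load_jsonl_gz
# (used the same way in Examples #1 and #9) can iterate over its records.
total = sum(1 for chunk in iglob(os.path.join(out_dir, 'data-*.jsonl.gz'))
            for _ in load_jsonl_gz(chunk))
print('Read back %d records' % total)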