def main():
    args = cmd_args()
    ray.init()
    # Stream the input file through verify_variable_dataset in parallel;
    # the chunks are only checked, so the receive callback discards them.
    chunked_pmap(args.i, verify_variable_dataset, on_receive_chunk=lambda x: None, ray=ray)
    ray.shutdown()
def create_bpe_training_file(dataset, targets_file_path, subtokens_file_path):
    with open(targets_file_path, "w") as f_t, open(subtokens_file_path, "w") as f_s:

        def write_to_file(chunks):
            # Each entry carries a "targets" and a "subtokens" list of strings;
            # write one token per line into the two training files.
            for entry in flatten(chunks):
                f_t.write("\n".join(entry["targets"]) + "\n")
                f_s.write("\n".join(entry["subtokens"]) + "\n")

        chunked_pmap(
            dataset,
            mapper=extract_targets_and_subtokens,
            on_receive_chunk=write_to_file,
        )
def process_file(
    input_file,
    output_file,
    n_workers,
):
    with open(output_file, "w") as f:

        def write_chunk(chunk):
            # chunk is a list of line lists; write every line to the output file.
            for lines in chunk:
                for line in lines:
                    f.write(line + "\n")

        chunked_pmap(
            input_file,
            process_chunk,
            on_receive_chunk=write_chunk,
            file_chunk_size=2**28,
            n_futures=n_workers,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--i", help="The c2s file for which to construct the vocabulary")
    parser.add_argument("--o", help="The output folder")
    parser.add_argument("--variables", action="store_true")
    parser.add_argument("--chunk-size", default=2**29, type=int)
    parser.add_argument("--n-workers", default=12, type=int)
    parser.add_argument("--prefix", help="prefix for token frequency files", default=None)
    args = parser.parse_args()

    prefix = f"{args.prefix}." if args.prefix else ""
    node_counts_file = f"{prefix}node_counts.txt"
    subtoken_counts_file = f"{prefix}subtoken_counts.txt"
    target_counts_file = f"{prefix}target_counts.txt"
    base_dir = args.o if args.o else os.path.dirname(args.i)

    # Count tokens per file chunk in parallel, then merge the per-chunk counts.
    f = process_variables if args.variables else process_chunk
    results = chunked_pmap(args.i, f, reducer=True, file_chunk_size=2**27, n_futures=args.n_workers)

    target_count = {}
    subtoken_count = {}
    node_count = {}
    for entry in results:
        add_counts(target_count, entry[0])
        add_counts(subtoken_count, entry[1])
        add_counts(node_count, entry[2])

    print("-" * 60)
    print("\tDictionary sizes:")
    print(f"\tsubtokens: {len(subtoken_count)}")
    print(f"\ttargets: {len(target_count)}")

    write_dict(target_count, os.path.join(base_dir, target_counts_file))
    write_dict(subtoken_count, os.path.join(base_dir, subtoken_counts_file))
    write_dict(node_count, os.path.join(base_dir, node_counts_file))
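# add_counts and write_dict are not shown in this section. A minimal sketch,
# assuming add_counts merges a {token: count} mapping into a running
# accumulator and write_dict writes the counts out one "token count" pair per
# line (both names come from the calls above; the exact on-disk format is an
# assumption):


def add_counts(accumulator, counts):
    # Merge one chunk's counts into the running totals.
    for token, count in counts.items():
        accumulator[token] = accumulator.get(token, 0) + count


def write_dict(counts, path):
    # One "token count" pair per line, most frequent tokens first.
    with open(path, "w") as f:
        for token, count in sorted(counts.items(), key=lambda kv: -kv[1]):
            f.write(f"{token} {count}\n")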
def process_file(infile, outfile):
    with open(outfile, "w") as out_f:
        # write_chunks builds the callback that appends each processed chunk to out_f.
        on_receive = write_chunks(out_f)
        chunked_pmap(infile, process_lines, on_receive_chunk=on_receive)
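# chunked_pmap itself is defined elsewhere in the repository; only its call
# sites appear here. The sketch below is an assumption about the interface
# those calls imply: split the input file into pieces of roughly
# file_chunk_size bytes, map each piece with `mapper` across n_futures
# workers, hand each wave of finished results to on_receive_chunk, and return
# the collected results when reducer=True. It is an illustration built on
# concurrent.futures, not the project's implementation, and the optional
# `ray` argument is accepted but ignored.
from concurrent.futures import ProcessPoolExecutor


def _read_chunks(path, file_chunk_size):
    # Yield lists of lines whose combined size is roughly file_chunk_size bytes.
    chunk, size = [], 0
    with open(path) as f:
        for line in f:
            chunk.append(line.rstrip("\n"))
            size += len(line)
            if size >= file_chunk_size:
                yield chunk
                chunk, size = [], 0
    if chunk:
        yield chunk


def chunked_pmap(path, mapper, on_receive_chunk=None, reducer=False,
                 file_chunk_size=2**28, n_futures=12, ray=None):
    def emit(wave, results):
        if on_receive_chunk is not None:
            on_receive_chunk(wave)
        if reducer:
            results.extend(wave)

    results = []
    with ProcessPoolExecutor(max_workers=n_futures) as pool:
        wave = []
        for mapped in pool.map(mapper, _read_chunks(path, file_chunk_size)):
            wave.append(mapped)
            if len(wave) == n_futures:
                emit(wave, results)
                wave = []
        if wave:
            emit(wave, results)
    return results if reducer else None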