import argparse
import os

import ray


def main():
    args = cmd_args()

    ray.init()
    # Verification runs purely for its side effects, so each chunk's result
    # is discarded as it arrives.
    chunked_pmap(args.i,
                 verify_variable_dataset,
                 on_receive_chunk=lambda x: None,
                 ray=ray)
    ray.shutdown()
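
# chunked_pmap itself is not shown in these examples. Below is a minimal
# sketch of the contract they rely on, assuming it splits a text file into
# line-aligned chunks of roughly file_chunk_size bytes, maps them in
# parallel, and either streams each batch of results to on_receive_chunk or
# collects them when reducer is set. The batching details and the optional
# ray backend are guesses, not the confirmed implementation.
from concurrent.futures import ProcessPoolExecutor


def chunked_pmap(path, mapper, on_receive_chunk=None, reducer=False,
                 file_chunk_size=2**27, n_futures=8, **backend_kwargs):
    def read_chunks():
        chunk, size = [], 0
        with open(path) as f:
            for line in f:
                chunk.append(line.rstrip("\n"))
                size += len(line)
                if size >= file_chunk_size:
                    yield chunk
                    chunk, size = [], 0
        if chunk:
            yield chunk

    results = []
    with ProcessPoolExecutor(max_workers=n_futures) as pool:
        for mapped in pool.map(mapper, read_chunks()):
            if on_receive_chunk is not None:
                # Callbacks in the examples iterate over a batch of results.
                on_receive_chunk([mapped])
            if reducer:
                results.append(mapped)
    return results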
def create_bpe_training_file(dataset, targets_file_path, subtokens_file_path):
    with open(targets_file_path, "w") as f_t, open(subtokens_file_path,
                                                   "w") as f_s:

        def write_to_file(chunks):
            # Stream every extracted entry to disk, one token per line. The
            # trailing newline keeps consecutive entries from running together
            # on a single line.
            for entry in flatten(chunks):
                f_t.write("\n".join(entry["targets"]) + "\n")
                f_s.write("\n".join(entry["subtokens"]) + "\n")

        chunked_pmap(
            dataset,
            mapper=extract_targets_and_subtokens,
            on_receive_chunk=write_to_file,
        )
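
# flatten and extract_targets_and_subtokens are assumed helpers. A sketch
# follows; the c2s line layout ("<target> <left,path,right> ..." with
# '|'-separated name subtokens) is an assumption based on the code2seq
# format, not confirmed by these snippets.
from itertools import chain


def flatten(chunks):
    # Collapse a batch of per-future result lists into one stream of entries.
    return chain.from_iterable(chunks)


def extract_targets_and_subtokens(lines):
    entries = []
    for line in lines:
        target, *contexts = line.strip().split(" ")
        subtokens = []
        for context in contexts:
            left, _path, right = context.split(",")
            subtokens.extend(left.split("|"))
            subtokens.extend(right.split("|"))
        entries.append({"targets": [target], "subtokens": subtokens})
    return entries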
def process_file(
    input_file,
    output_file,
    n_workers,
):
    with open(output_file, "w") as f:

        def write_chunk(chunk):
            # Each element of the batch is one mapped result: a list of lines.
            for lines in chunk:
                for line in lines:
                    f.write(line + "\n")

        chunked_pmap(
            input_file,
            process_chunk,
            on_receive_chunk=write_chunk,
            file_chunk_size=2**28,  # 256 MiB of input per chunk
            n_futures=n_workers,
        )
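
# process_chunk is not shown; write_chunk above expects every mapped result
# to be a list of output lines. A local stand-in that just normalizes
# whitespace (the real transformation is an assumption; note that the
# vocabulary-building main below uses a process_chunk with a different
# contract, returning count dictionaries):
def process_chunk(lines):
    return [" ".join(line.split()) for line in lines]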
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--i", help="The c2s file for which to construct the vocabulary")
    parser.add_argument("--o", help="The output folder")
    parser.add_argument("--variables", action="store_true")
    parser.add_argument("--chunk-size", default=2**29)
    parser.add_argument("--n-workers", default=12, type=int)
    parser.add_argument("--prefix",
                        help="prefix for token frequency files",
                        default=None)

    args = parser.parse_args()

    prefix = f"{args.prefix}." if args.prefix else ""

    node_counts_file = f"{prefix}node_counts.txt"
    subtoken_counts_file = f"{prefix}subtoken_counts.txt"
    target_counts_file = f"{prefix}target_counts.txt"

    # Default the output folder to the directory of the input file.
    base_dir = args.o if args.o else os.path.dirname(args.i)
    f = process_variables if args.variables else process_chunk
    results = chunked_pmap(args.i,
                           f,
                           reducer=True,
                           file_chunk_size=args.chunk_size,
                           n_futures=args.n_workers)

    target_count = {}
    subtoken_count = {}
    node_count = {}

    # Merge the per-chunk count dictionaries into global totals.
    for entry in results:
        add_counts(target_count, entry[0])
        add_counts(subtoken_count, entry[1])
        add_counts(node_count, entry[2])
    print("-" * 60)
    print(f"\tDictionary sizes:")
    print(f"\tsubtokens: {len(subtoken_count)}")
    print(f"\tTarget: {len(target_count)}")

    write_dict(target_count, os.path.join(base_dir, target_counts_file))
    write_dict(subtoken_count, os.path.join(base_dir, subtoken_counts_file))
    write_dict(node_count, os.path.join(base_dir, node_counts_file))
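
# add_counts and write_dict are assumed helpers: one merges a chunk's
# {token: count} mapping into a running total, the other writes the totals
# to disk. The "token count" line format and frequency ordering below are
# assumptions, not the confirmed output format:
def add_counts(total, counts):
    for token, count in counts.items():
        total[token] = total.get(token, 0) + count


def write_dict(counts, path):
    with open(path, "w") as f:
        for token, count in sorted(counts.items(), key=lambda kv: -kv[1]):
            f.write(f"{token} {count}\n")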
def process_file(infile, outfile):
    with open(outfile, "w") as out_f:
        # write_chunks binds the output handle and returns the callback that
        # streams each mapped chunk to disk.
        on_receive = write_chunks(out_f)
        chunked_pmap(infile, process_lines, on_receive_chunk=on_receive)
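
# write_chunks is assumed to be a closure factory that binds the open output
# handle and returns an on_receive_chunk callback, mirroring the inline
# writers in the earlier examples:
def write_chunks(out_f):
    def on_receive(chunk):
        # Each element of the batch is one mapped result: a list of lines.
        for lines in chunk:
            for line in lines:
                out_f.write(line + "\n")

    return on_receive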