    # Text-dump variant: binarizes the input and writes each sequence to
    # output_text_file as a JSON list of token ids instead of building an
    # indexed binary dataset (output_prefix and num_workers are unused here).
    def make_binary_sent_doc_dataset(input_prefix, output_prefix, lang,
                                     num_workers, output_lang,
                                     output_text_file):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")

        # Single-pass binarization, collecting every sequence tensor in memory.
        ds = []
        merge_result(
            Binarizer.binarize_sent_doc(input_file, lambda t: ds.append(t)))

        import json
        with open(output_text_file, 'w') as f:
            for line in ds:
                f.write(json.dumps(line.numpy().tolist()) + '\n')

        print("| [{}] {}: {} sents, {} tokens".format(
            output_lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
        ))
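
    # Hedged sketch, not part of the original script: each line of the
    # output_text_file written above is one binarized sequence serialized as a
    # JSON list of token ids, e.g. "[34, 1207, 88, 2]" (ids invented for
    # illustration). The helper below, whose name is illustrative, reads such
    # a dump back into plain Python lists.
    def load_sent_doc_text_dump(path):
        import json
        with open(path) as f:
            return [json.loads(line) for line in f]
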
    # Binary-dataset variant: binarizes the input with num_workers processes,
    # each writing a temporary shard, then merges the shards into a single
    # indexed .bin/.idx dataset.
    def make_binary_sent_doc_dataset(input_prefix, output_prefix, lang,
                                     num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # Workers 1..num_workers-1 each binarize one chunk into their own
            # temporary shard; the parent process handles chunk 0 below.
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize_sent_doc,
                                 (args, input_file, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl)
        merge_result(
            Binarizer.binarize_sent_doc(input_file,
                                        lambda t: ds.add_item(t),
                                        offset=0,
                                        end=offsets[1]))
        if num_workers > 1:
            pool.join()
            # Fold each worker's shard into the parent builder and delete the
            # temporary files.
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print("| [{}] {}: {} sents, {} tokens".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
        ))


# Worker entry point: binarizes the [offset, end) slice of filename into its
# own temporary .bin/.idx shard, which the parent process merges afterwards.
def binarize_sent_doc(args,
                      filename,
                      output_prefix,
                      lang,
                      offset,
                      end,
                      append_eos=True):
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl)

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_sent_doc(filename,
                                      consumer,
                                      append_eos=append_eos,
                                      offset=offset,
                                      end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
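

# Hedged, self-contained sketch (not from the original script): the same
# dispatch-and-merge pattern used by make_binary_sent_doc_dataset above, shown
# with the standard library only. Worker processes each handle one chunk and
# return per-chunk statistics; the parent merges them through the apply_async
# callback and processes chunk 0 itself before joining the pool. All names
# below are illustrative.
def _demo_count_chunk(items):
    # Stand-in for binarize_sent_doc: "process" one chunk and report its stats.
    return {"nseq": len(items), "ntok": sum(len(x.split()) for x in items)}


def _demo_parallel_count(lines, num_workers):
    from multiprocessing import Pool

    totals = [0, 0]  # [nseq, ntok], mirroring n_seq_tok above

    def merge_result(worker_result):
        totals[0] += worker_result["nseq"]
        totals[1] += worker_result["ntok"]

    # Split the input into contiguous chunks (the role Binarizer.find_offsets
    # plays for the real byte-offset based splitting).
    size = max(1, len(lines) // num_workers)
    chunks = [lines[i:i + size] for i in range(0, len(lines), size)]

    if num_workers > 1 and len(chunks) > 1:
        pool = Pool(processes=num_workers - 1)
        for chunk in chunks[1:]:
            pool.apply_async(_demo_count_chunk, (chunk,), callback=merge_result)
        pool.close()
        merge_result(_demo_count_chunk(chunks[0]))  # parent handles chunk 0
        pool.join()
    else:
        for chunk in chunks:
            merge_result(_demo_count_chunk(chunk))
    return totals


# Example: _demo_parallel_count(["a b", "c", "d e f"], num_workers=2) -> [3, 6]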