Example #1
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts out-of-vocabulary tokens replaced by the unknown symbol

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the input file into num_workers byte ranges aligned to line boundaries;
        # with multi-processing, workers p1..p(N-1) handle chunks 1..N-1 while the parent handles chunk 0,
        # e.g. 10 workers: p0 -> bytes [0, 100), p1 -> [100, 200), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers p1..p(N-1) each write a temporary (data, idx) pair under their own prefix
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the parent process (p0) handles the first chunk [0, offsets[1]);
        # without multi-processing it handles the whole file
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize_bpe(input_file,
                                   vocab,
                                   lambda t: ds.add_item(t),
                                   offset=0,
                                   end=offsets[1]))
        if num_workers > 1:
            # wait for workers p1..p(N-1) to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
            ))
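The control flow above — split the input at line-aligned byte offsets, dispatch chunks 1..N-1 to a worker pool with a merge callback, process chunk 0 in the parent, then join and merge — can be reduced to a self-contained sketch. Everything below (find_line_offsets, count_chunk, count_file) is a hypothetical stand-in that only counts sentences and tokens per chunk; it is not the Binarizer / indexed_dataset API used in the example.

import os
from multiprocessing import Pool


def find_line_offsets(path, num_chunks):
    """Split a text file into num_chunks byte ranges aligned to line breaks."""
    size = os.path.getsize(path)
    step = size // num_chunks
    offsets = [0]
    with open(path, "rb") as f:
        for i in range(1, num_chunks):
            f.seek(step * i)
            f.readline()  # advance to the next line boundary
            offsets.append(f.tell())
    offsets.append(size)
    return offsets


def count_chunk(path, offset, end):
    """Worker stand-in: count sentences and whitespace tokens in [offset, end)."""
    nseq, ntok = 0, 0
    with open(path, "rb") as f:
        f.seek(offset)
        while f.tell() < end:
            line = f.readline()
            if not line:
                break
            nseq += 1
            ntok += len(line.decode("utf-8").split())
    return {"nseq": nseq, "ntok": ntok}


def count_file(path, num_workers=4):
    """Illustrates the dispatch/merge pattern of make_binary_dataset above."""
    n_seq_tok = [0, 0]

    def merge_result(worker_result):
        # callbacks run in the parent process, so mutating the closure is safe
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    offsets = find_line_offsets(path, num_workers)
    pool = None
    if num_workers > 1:
        # workers p1..p(N-1) handle chunks 1..N-1
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            pool.apply_async(count_chunk,
                             (path, offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()
    # the parent process handles chunk 0
    merge_result(count_chunk(path, 0, offsets[1]))
    if num_workers > 1:
        pool.join()
    return tuple(n_seq_tok)

On platforms that use the spawn start method (Windows, recent macOS), the Pool must be created under an if __name__ == "__main__": guard.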
Example #2
def binarize(args: Dict,
             filename: str,
             dict: Dictionary,
             in_file: str,
             offset: int,
             end: int,
             append_eos: bool = True):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename,
                             dict,
                             consumer,
                             tokenize=tokenization.json_tokenizer,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)
    ds.finalize('{}.idx'.format(in_file))
    return res
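In this worker, offset and end describe a half-open byte range of the input file, produced by an offsets helper such as Binarizer.find_offsets. Assuming those offsets fall on line boundaries, the loop inside Binarizer.binarize can be pictured roughly as the sketch below; iter_chunk_lines, binarize_chunk, and encode_line are illustrative names of my own, not the library's actual implementation.

def iter_chunk_lines(path, offset, end):
    """Yield the lines whose first byte lies in [offset, end); end < 0 means read to EOF."""
    with open(path, "rb") as f:
        f.seek(offset)
        while end < 0 or f.tell() < end:
            line = f.readline()
            if not line:
                break
            yield line.decode("utf-8")


def binarize_chunk(path, offset, end, encode_line, consumer):
    """Hypothetical worker loop: encode each line and hand the ids to the consumer."""
    nseq, ntok = 0, 0
    for line in iter_chunk_lines(path, offset, end):
        ids = encode_line(line)   # e.g. tokenize + vocab lookup + optional EOS
        consumer(ids)             # e.g. ds.add_item(torch.IntTensor(ids))
        nseq += 1
        ntok += len(ids)
    return {"nseq": nseq, "ntok": ntok}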
Example #3
def binarize(args, filename: str, vocab, aux_dict, in_file: str, lang, tokenize, max_path_num: int,
             offset: int, end: int, append_eos: bool = False):
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if lang == 'path':
        sz_ds_file = '{}.sz.mmap'.format(in_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                             vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    if sz_ds is None:
        res = Binarizer.binarize(filename, vocab, consumer, tokenize=tokenize,
                                 append_eos=append_eos, offset=offset, end=end, )
        ds.finalize('{}.idx'.format(in_file))
    else:
        res = PathSummarizationBinarizer.path_binarizer(filename, vocab, consumer, tokenize=tokenize,
                                                        append_eos=append_eos, offset=offset, end=end,
                                                        type_dict=aux_dict, max_path_num=max_path_num, )
        ds.finalize('{}.idx'.format(in_file))
        sz_ds.finalize('{}.sz.idx'.format(in_file))
    return res
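The consumer closure here fans out to two builders when lang == 'path': every tensor goes to ds, and whenever the binarizer also reports a per-example size it goes to the parallel size dataset. A minimal stand-alone illustration of that optional-size pattern, using plain lists instead of dataset builders and names of my own choosing:

def make_consumer(items, sizes=None):
    """Mirror the optional-size consumer pattern with plain lists (illustration only)."""
    def consumer(tensor, size=None):
        items.append(tensor)
        if size is not None and sizes is not None:
            sizes.append(size)
    return consumer


items, sizes = [], []
consumer = make_consumer(items, sizes)
consumer([1, 2, 3])          # token ids only
consumer([4, 5], size=2)     # token ids plus a recorded size
assert items == [[1, 2, 3], [4, 5]] and sizes == [2]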
Example #4
    def make_graph_bin_dataset(dict: Dictionary, input_file, output_file,
                               num_workers):
        offsets = Binarizer.find_offsets(input_file, num_workers)
        if num_workers > 1:
            # every chunk p0..p(N-1) is dispatched to a pool worker, each writing its own data/idx pair
            pool = Pool(processes=num_workers)
            for worker_id in range(num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize_dgl,
                    (args, input_file, dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()
        else:
            prefix = "{}0".format(output_file)
            binarize_dgl(args, input_file, dict, prefix, 0, -1)
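Unlike the previous dispatchers, this variant hands every chunk to the pool with apply_async and never inspects the results, so an exception raised inside binarize_dgl would go unnoticed, and pool.join() is left to the caller. A common hardening, sketched below with a hypothetical run_workers helper, is to keep the AsyncResult handles and call .get() after joining, which re-raises worker exceptions in the parent:

from multiprocessing import Pool


def run_workers(worker_fn, jobs, num_workers):
    """Dispatch argument tuples to a pool and surface any worker exception."""
    pool = Pool(processes=num_workers)
    handles = [pool.apply_async(worker_fn, job) for job in jobs]
    pool.close()
    pool.join()
    # .get() returns the worker's result, or re-raises the exception it hit
    return [h.get() for h in handles]

Here jobs would be the tuples (args, input_file, dict, prefix, offsets[worker_id], offsets[worker_id + 1]) built in the loop above.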
Example #5
def binarize(args: Dict, filename: str, dict: Dictionary, out_file_prefix: str,
             attr: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(out_file_prefix)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_bpe(filename,
                                 dict,
                                 consumer,
                                 offset=offset,
                                 end=end)
    ds.finalize('{}.idx'.format(out_file_prefix))
    return res
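All of these workers write through a builder returned by indexed_dataset.make_builder: add_item appends one encoded example, merge_file_ appends another worker's temporary files, and finalize writes the index. The toy builder below sketches that contract with a JSON data file and a flat offset index; it is an illustration of the idea only, not the real mmap format or API.

import json
import struct


class ToyDatasetBuilder:
    """Append-only data file plus an index of item byte offsets (illustration only)."""

    def __init__(self, data_path):
        self.data_path = data_path
        self.data = open(data_path, "wb")
        self.offsets = [0]   # offsets[i]..offsets[i+1] delimit item i

    def add_item(self, item):
        blob = json.dumps(item).encode("utf-8")
        self.data.write(blob)
        self.offsets.append(self.offsets[-1] + len(blob))

    def merge_file_(self, prefix):
        # append another (already finalized) builder's data and shift its offsets
        with open(prefix + ".data", "rb") as f:
            self.data.write(f.read())
        with open(prefix + ".idx", "rb") as f:
            (count,) = struct.unpack("<Q", f.read(8))
            other = struct.unpack("<%dQ" % count, f.read(8 * count))
        base = self.offsets[-1]
        self.offsets.extend(base + off for off in other[1:])

    def finalize(self, index_path):
        self.data.close()
        with open(index_path, "wb") as f:
            f.write(struct.pack("<Q", len(self.offsets)))
            f.write(struct.pack("<%dQ" % len(self.offsets), *self.offsets))

A worker would create ToyDatasetBuilder(prefix + ".data") and finalize prefix + ".idx"; the parent then calls merge_file_(prefix) for every worker prefix before its own finalize, mirroring the merge loops in the examples above.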
Example #6
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        aux_dict,
                        prefix,
                        lang,
                        tokenize,
                        max_path_num,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False, type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )
        if num_workers > 1:
            # wait for workers p1..p(N-1) to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )
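The closing log line reports what fraction of all tokens the dictionary replaced with its unknown symbol: replaced is a Counter keyed by the out-of-vocabulary token strings, so the percentage is the sum of its counts over the total token count. A small worked example with made-up numbers:

from collections import Counter

replaced = Counter({"foo_bar": 3, "qux": 1})   # OOV tokens seen while binarizing (made up)
n_seq_tok = [120, 2000]                        # 120 sentences, 2000 tokens (made up)

pct = 100 * sum(replaced.values()) / n_seq_tok[1]
print("{:.3}% replaced".format(pct))           # -> 0.2% replaced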