Example #1
def binarize(args,
             filename,
             vocab,
             output_prefix,
             lang,
             offset,
             end,
             append_eos=True):
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab),
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename,
                             vocab,
                             consumer,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
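The pattern above is the whole lifecycle of one shard: make_builder opens the .bin file, each consumer call appends a tensor, and finalize writes the companion .idx file. As a quick sanity check, the finished pair can be loaded back through the same indexed_dataset module; a minimal sketch, assuming fairseq's indexed_dataset.make_dataset helper and the dataset_dest_prefix helper used elsewhere on this page (both taken as given, not shown here):

from fairseq.data import indexed_dataset

def inspect_shard(args, output_prefix, lang, vocab, n=3):
    # dataset_dest_prefix is assumed to return the same path as
    # dataset_dest_file(...) minus the ".bin"/".idx" extension.
    prefix = dataset_dest_prefix(args, output_prefix, lang)
    ds = indexed_dataset.make_dataset(prefix, impl=args.dataset_impl)
    for i in range(min(n, len(ds))):
        ids = ds[i]                   # LongTensor of token ids, EOS included
        print(vocab.string(ids))      # decode back to text for eyeballing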
Example #2
def binarize(args,
             filename,
             vocab,
             output_prefix,
             lang,
             offset,
             end,
             append_eos=True,
             copy_from=None):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    words_list = []  # TODO: currently this cannot be passed back to the caller

    def consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    res = Binarizer.binarize(filename,
                             vocab,
                             consumer,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
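The TODO above marks a real gap: words_list is filled inside the closure, but only res is returned, so the collected words never reach the caller (Example #12 below works around this at the make_binary_dataset level instead). A minimal local fix, offered as a sketch rather than the original author's intent, is to return both values; note that the multi-worker call sites on this page feed the return value to merge_result, which expects the plain stats dict, so they would need the same adjustment.

    # Hypothetical variant of the tail of Example #2: return the collected
    # words together with the binarization stats.
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res, words_list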
Example #3
def binarize(args,
             filename,
             vocab,
             output_prefix,
             lang,
             offset,
             end,
             append_eos=True):
    ds = indexed_dataset.make_builder(dataset_dest_file(
        args, output_prefix, lang, "bin"),
                                      impl=args.dataset_impl,
                                      vocab_size=len(vocab))

    def consumer(tensor):
        ds.add_item(tensor)

    tk = tokenize_smiles if args.file_format == 'smiles' else tokenize_line
    res = Binarizer.binarize(filename,
                             vocab,
                             consumer,
                             tokenize=tk,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
Example #4
    def make_binary_dataset(vocab, input_prefix, output_prefix, src_lang,
                            tgt_lang, lang, num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}.{}-{}.{}".format(input_prefix, src_lang, tgt_lang,
                                          lang)
        if args.model:
            input_file += ".tok"
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = multiprocessing.Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, src_lang, tgt_lang, lang,
                     offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result)
            pool.close()

        ds = indexed_dataset.make_builder(dataset_dest_file(
            args, output_prefix, src_lang, tgt_lang, lang, "bin"),
                                          impl=args.dataset_impl,
                                          vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(input_file,
                               vocab,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, src_lang,
                                                     tgt_lang, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(
            dataset_dest_file(args, output_prefix, src_lang, tgt_lang, lang,
                              "idx"))

        print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
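Every multi-worker variant on this page leans on Binarizer.find_offsets to cut the input into byte ranges that end on line boundaries, so each worker binarizes whole lines and the main process handles the first chunk itself. A rough sketch of that splitting idea (an illustration under stated assumptions, not fairseq's actual implementation):

import os

def find_line_offsets(filename, num_chunks):
    # Split a text file into num_chunks byte ranges aligned to line breaks.
    # Returns num_chunks + 1 offsets; chunk i covers [offsets[i], offsets[i+1]).
    size = os.path.getsize(filename)
    offsets = [0] * (num_chunks + 1)
    offsets[num_chunks] = size
    with open(filename, "rb") as f:
        for i in range(1, num_chunks):
            f.seek((size // num_chunks) * i)  # jump near the ideal split point
            f.readline()                      # advance to the start of the next full line
            offsets[i] = f.tell()
    return offsets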
Example #5
def binarize(args, filename, dict, output_prefix, lang, offset, end):

    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(args, output_prefix, lang, 'bin'))
    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
Example #6
def binarize(args, filename, vocab, output_prefix, lang, offset, end,
             tgt_dict):
    append_eos = True
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename,
                             vocab,
                             consumer,
                             append_eos=append_eos,
                             reverse_order=args.reverse_order,
                             offset=offset,
                             end=end,
                             tgt_dict=tgt_dict)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
Example #7
    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers-1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize, (args, input_file, dict, prefix, lang,
                                            offsets[worker_id],
                                            offsets[worker_id + 1]), callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(args, output_prefix, lang, 'bin'))
        merge_result(Binarizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                                        offset=0, end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
Example #8
def binarize(args,
             filename,
             dict,
             output_prefix,
             lang,
             offset,
             end,
             append_eos=True):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename,
                             dict,
                             consumer,
                             offset=offset,
                             end=end,
                             append_eos=append_eos)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
Example #9
def binarize(args,
             filename,
             vocab,
             output_prefix,
             lang,
             offset,
             end,
             append_eos=True):
    # dataset_impl=mmap -> ds is an MMapIndexedDatasetBuilder
    # dataset_impl=lazy -> ds is an IndexedDatasetBuilder
    ds = indexed_dataset.make_builder(dataset_dest_file(
        args, output_prefix, lang, "bin"),
                                      impl=args.dataset_impl,
                                      vocab_size=len(vocab))

    def consumer(tensor):
        # The input tensor is just the text line converted to an id sequence via the dictionary.
        # dataset_impl=mmap: MMapIndexedDatasetBuilder.add_item writes the tensor straight to the file.
        # dataset_impl=lazy: IndexedDatasetBuilder.add_item writes the tensor to the file and updates sizes, data_offsets and dim_offsets.
        ds.add_item(tensor)

    # Read the part of filename between offset and end, convert each text line to an id
    # sequence with the dictionary, and write it into ds through the consumer callback.
    res = Binarizer.binarize(filename,
                             vocab,
                             consumer,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)

    # Persist the data written into ds to a temporary file at the corresponding path; output_prefix
    # contains the worker_id, which keeps the temporary files of different workers apart.
    # mmap: ds.finalize calls MMapIndexedDataset.write to store three arrays:
    #       the number of training examples, the size of each example tensor, and the pointer to each example.
    # lazy: ds.finalize (IndexedDatasetBuilder.finalize) directly writes dim_offsets, data_offsets and sizes:
    #       data_offsets: the end position of each tensor in the binary file (each tensor starts where the previous one ends)
    #       sizes: the individual dims of each tensor's shape
    #       dim_offsets: the end position of each tensor's shape within sizes (each shape starts where the previous one ends)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
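The comments above describe the builder lifecycle that every snippet on this page follows: create a builder for the .bin file, feed it one id tensor per line, then call finalize to write the companion .idx file. A stripped-down illustration of that flow on its own, assuming fairseq's Dictionary.encode_line and indexed_dataset.make_builder behave as described (paths and inputs are made up):

from fairseq.data import indexed_dataset

def write_toy_shard(lines, vocab, out_prefix, impl="mmap"):
    # Same three-step lifecycle as binarize() above: build, add items, finalize.
    builder = indexed_dataset.make_builder(out_prefix + ".bin",
                                           impl=impl,
                                           vocab_size=len(vocab))
    for line in lines:
        # Dictionary.encode_line turns a text line into an id tensor,
        # appending EOS just as the Binarizer does by default.
        ids = vocab.encode_line(line, add_if_not_exist=False, append_eos=True)
        builder.add_item(ids)                # append the tensor to the .bin stream
    builder.finalize(out_prefix + ".idx")    # write sizes/pointers into the .idx file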
Example #10
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        logger.info("[{}] Dictionary: {} types".format(lang, len(vocab)))
        output_prefix += '.bert' if isinstance(vocab, BertTokenizer) else ''
        input_prefix += '.bert' if isinstance(vocab, BertTokenizer) else ''
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
            vocab_size=len(vocab),
        )
        merge_result(
            Binarizer.binarize(input_file,
                               vocab,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        logger.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))
Example #11
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, avoid_tokenize=False):
        if vocab is not None:
            print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        else:
            print('| Using None Dictionary and only string split is performed.')

        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                # TODO: worker > 1 is not working for map dataset
                if args.input_mapping is True:
                    raise NotImplementedError("Worker > 1 is not implemented for map dataset yet.")
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                        avoid_tokenize,
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
            vocab_size=len(vocab) if vocab is not None else -1,
        )
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1], avoid_tokenize=avoid_tokenize,
            )
        )
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        if vocab is not None:
            unk = vocab.unk_word if hasattr(vocab, 'unk_word') else vocab.unk_token
        else:
            unk = ""
        logger.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                unk,
            )
        )
Example #12
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()
        copyied = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            copyied.update(worker_result["copied"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:  # TODO: copy is not supported with multiple workers
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                    callback=merge_result  
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )
        words_list = []

        def binarize_consumer(ids, words):
            ds.add_item(ids)
            words_list.append(words)

        merge_result(
            Binarizer.binarize(
                input_file, vocab, binarize_consumer,
                offset=0, end=offsets[1], copy_ext_dict=args.copy_ext_dict, copy_src_words=copy_src_words
            )
        )
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print(
            "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
                100 * sum(copyied.values()) / n_seq_tok[1]
            )
        )

        return words_list
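Unlike Example #2, this variant does hand words_list back by returning it from make_binary_dataset itself; the apparent intent is to binarize the source side first and pass its words into the target side through copy_src_words, so <unk> targets can be copied from the source. A hedged sketch of such a call site (the surrounding names are assumptions, not code from this page):

# Assumed driver: collect source-side words, then let the target side copy
# <unk> tokens from them (single worker, since copy is flagged above as
# unsupported with num_workers > 1).
src_words = make_binary_dataset(src_dict, input_prefix, output_prefix,
                                lang=args.source_lang, num_workers=1)
make_binary_dataset(tgt_dict, input_prefix, output_prefix,
                    lang=args.target_lang, num_workers=1,
                    copy_src_words=src_words)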