Example #1
    def make_binary_dataset(input_prefix, output_prefix, lang, guess):
        print('aaa')
        dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        if not guess:
            ds = indexed_dataset.IndexedDatasetBuilder(
                '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix,
                                            args.source_lang, args.target_lang,
                                            lang))
            input_file = '{}.{}'.format(input_prefix, lang)
        else:
            ds = indexed_dataset.IndexedDatasetBuilder(
                '{}/{}.{}-{}.{}.guess.bin'.format(args.destdir, output_prefix,
                                                  args.source_lang,
                                                  args.target_lang, lang))
            input_file = '{}.{}.guess'.format(input_prefix, lang)

        def consumer(tensor):
            ds.add_item(tensor)

        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix,
                                                args.source_lang,
                                                args.target_lang, lang))
Example #2
def binarize(filename, dict, fn_without_ext, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(f"{fn_without_ext}.bin")

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(f"{fn_without_ext}.idx")
    return res
Example #3
    def make_binary_dataset(input_prefix, output_prefix, lng_pair, lang,
                            num_workers):
        if not args.joined_dictionary and lang != 'en':
            dict = dictionary.Dictionary.load(tgt_dict_path)
        else:
            dict = dictionary.Dictionary.load(dict_path)

        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = f'{input_prefix}.{lng_pair}.{lang}.tok.bpe'
        if not os.path.exists(input_file):
            input_file = f'{input_prefix}.{lng_pair}.{lang}'
            if not os.path.exists(input_file):
                print("| {} not found".format(input_file))
                return
        if args.expert:
            input_file = input_file + '.e'
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                fn_without_ext = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
                pool.apply_async(binarize,
                                 (input_file, dict, fn_without_ext,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            f"{output_prefix}.{lng_pair}.{lang}.bin")
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(f"{output_prefix}.{lng_pair}.{lang}.idx")

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
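A note on the multi-worker examples: Tokenizer.find_offsets splits the input file into line-aligned byte ranges, the main process binarizes the range [0, offsets[1]) itself, and workers 1..num_workers-1 write temporary .bin/.idx shards that are merged back with merge_file_ and then deleted. Below is a minimal, self-contained sketch of such an offset finder; find_line_offsets is a hypothetical helper for illustration, not fairseq's actual Tokenizer.find_offsets.

import os

def find_line_offsets(filename, num_chunks):
    # Return num_chunks + 1 byte offsets that split `filename` on line
    # boundaries, so each worker can binarize its own slice of the file.
    size = os.path.getsize(filename)
    chunk_size = size // num_chunks
    offsets = [0] * (num_chunks + 1)
    with open(filename, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # skip the partial line we landed in
            offsets[i] = f.tell()
    offsets[num_chunks] = size
    return offsets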
Example #4
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
Example #5
    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
Example #6
    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        print("offsets", offsets)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, dict, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
Example #7
def make_binary_dataset(input_prefix, output_prefix, lang, src_ids=None):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict)))
    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    res, ids = Tokenizer.binarize(input_file, dict, consumer, src_ids=src_ids)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% replaced by copy'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word, 100 * res['ncopied'] / res['ntok']))
    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    return ids
Example #8
def binarize(args, filename, dict, output_prefix, lang, offset, end, append_eos=False):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(
        filename,
        dict,
        consumer,
        offset=offset,
        end=end,
        append_eos=append_eos
    )
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
Example #9
    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang,
                                        args.target_lang, lang)
        )

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(
            args.destdir, output_prefix,
            args.source_lang, args.target_lang, lang))
Example #10
def binarize(args, filename, dict, output_prefix, lang, offset, end):

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    # {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced}
    to_print = ['nseq', 'nunk', 'ntok']
    debug_data = {}
    for k, v in res.items():
        if k in to_print:
            debug_data[k] = v
    debug_data['offset'] = offset
    debug_data['end'] = end

    print(debug_data)

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
Example #11
    def make_binary_dataset(input_prefix, output_prefix, lang, append_eos=False):
        if lang == args.target_lang:
            dict = flexible_dictionary.FlexibleDictionary.load(dict_path(lang))
        else:
            # dict = bert_dictionary.BertDictionary.load(dict_path(lang))
            dict = gpt2_dictionary.GPT2Dictionary.load(dict_path(lang))

        print('| [{}] Dictionary: {} types | {} types (for real)'.format(lang, len(dict) - 1, len(dict)))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        if lang == args.target_lang:
            res = Tokenizer.binarize(input_file, dict, consumer, append_eos=append_eos)
            print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
                lang, input_file, res['nseq'], res['ntok'],
                100 * res['nunk'] / res['ntok'], dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
        else:
            # read article
            # from pytorch_pretrained_bert.tokenization import BertTokenizer
            # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
            from pytorch_transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            def penn_token2orig_token(sent):
                # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
                '''
                penn2orig = {"``":'"', "''": '"',
                             "-LRB-": '(', "-RRB-": ')',
                             "-LSB-":'[', "-RSB-":']',
                             "-LCB-":'{', "-RCB-":'}'}
                '''
                penn2orig = {"-LRB-": '(', "-RRB-": ')',
                             "-LSB-": '[', "-RSB-": ']',
                             "-LCB-": '{', "-RCB-": '}',
                             "-lrb-": '(', "-rrb-": ')',
                             "-lsb-": '[', "-rsb-": ']',
                             "-lcb-": '{', "-rcb-": '}',}
                words = sent.strip().split()
                words = [penn2orig.get(wd, wd) for wd in words]
                return ' '.join(words)

            num_token, num_unk_token = 0, 0
            num_seq = 0
            skip_line = 0
            for line in open(input_file, encoding='utf8'):
                sents = line.strip().split('<S_SEP>')
                sents = sents[0:args.max_num_sentences]
                sents = [' '.join(sent.strip().split()[0:args.max_num_words]) for sent in sents]
                # print(sents)
                sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
                article_wids = []
                for i, sent in enumerate(sents):
                    # sometimes there are too many tokens
                    MAXLEN = 500
                    if len(sent) > MAXLEN:
                        # sent = sent[0:MAXLEN]
                        print(' '.join(sent))
                        skip_line += 1
                        print(skip_line)
                        continue
                    if i != 0:
                        article_wids.append(dict.sep_index)
                    wids = tokenizer.convert_tokens_to_ids(sent)
                    # wids_vocab = [dict.index(word) for word in sent]
                    # assert wids == wids_vocab, 'word indices should be the same!'
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1

                num_seq += 1
                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)

            print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
                lang, input_file, num_seq, num_token,
                100 * num_unk_token / num_token, dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))

        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
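Example #11 bypasses Tokenizer.binarize on the article side: PTB bracket escapes are mapped back to real brackets, each <S_SEP>-separated sentence is tokenized with RobertaTokenizer, and the per-sentence id lists are joined with dict.sep_index before being added to the dataset builder. Below is a minimal, self-contained sketch of that core path (without the sentence/word truncation and the long-sentence skip); the sample line and the SEP_INDEX stand-in for dict.sep_index are assumptions for illustration.

from pytorch_transformers import RobertaTokenizer

PENN2ORIG = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']',
             '-LCB-': '{', '-RCB-': '}'}

def unescape_penn(sent):
    # Map PTB bracket tokens back to the characters they stand for.
    return ' '.join(PENN2ORIG.get(w, w) for w in sent.strip().split())

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
SEP_INDEX = 2  # stand-in for dict.sep_index

line = 'the model -LRB- see above -RRB- works <S_SEP> it splits sentences'
article_wids = []
for i, sent in enumerate(line.split('<S_SEP>')):
    if i != 0:
        article_wids.append(SEP_INDEX)
    tokens = tokenizer.tokenize(unescape_penn(sent))
    article_wids.extend(tokenizer.convert_tokens_to_ids(tokens))
print(article_wids)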