Пример #1
0
from tqdm import tqdm

from nmtg.data import Dictionary

# Command-line interface. Arguments are registered in the same order as they
# must appear in the usage string.
parser = argparse.ArgumentParser()

# Two required positionals, followed by an optional one that falls back to '-'.
for positional_name in ('input', 'other_language_dict'):
    parser.add_argument(positional_name)
parser.add_argument('output', nargs='?', default='-')

# Numeric options given as (flag, converter, default).
for option_flag, converter, fallback in (
    ('-threshold', int, 0),
    ('-prob', float, 0.1),
    ('-num_variants', int, 1),
):
    parser.add_argument(option_flag, type=converter, default=fallback)

args = parser.parse_args()

# Vocabulary of the input text: keep only the entries after the first
# `nspecial` ones (presumably the reserved special symbols -- confirm against
# Dictionary), then drop the Dictionary object itself.
with open(args.input) as input_file:
    input_vocab = Dictionary.infer_from_text(input_file)
main_symbols = input_vocab.symbols[input_vocab.nspecial:]
del input_vocab

# Same extraction for the other language's saved dictionary. finalize() is
# only invoked when a non-zero -threshold was given.
other_vocab = Dictionary.load(args.other_language_dict)
if args.threshold:
    other_vocab.finalize(threshold=args.threshold)
symbols = other_vocab.symbols[other_vocab.nspecial:]
del other_vocab


def get_nearest(pool, symbol):
    """Pair ``symbol`` with the entry of ``pool`` closest to it.

    Closeness is whatever ``editdistance.eval(candidate, symbol)`` returns;
    the candidate with the smallest value wins, and on ties the first such
    candidate encountered in ``pool`` is kept (the behaviour of ``min``).

    Args:
        pool: Iterable of candidates to search.
        symbol: The item to find a neighbour for.

    Returns:
        A ``(symbol, nearest_candidate)`` tuple.

    Raises:
        ValueError: If ``pool`` is empty.
    """
    def distance_to_symbol(candidate):
        return editdistance.eval(candidate, symbol)

    closest = min(pool, key=distance_to_symbol)
    return symbol, closest


# Pre-bind the other-language symbol list as get_nearest's `pool`, leaving a
# one-argument callable that takes only the symbol to look up.
# NOTE(review): the consumer of this callable is not visible in this excerpt.
partial = functools.partial(get_nearest, symbols)
Пример #2
0
import argparse

from nmtg.data import Dictionary
from nmtg.data.noisy_text import NoisyTextDataset
from nmtg.data.text_lookup_dataset import TextLookupDataset
from nmtg.tasks.denoising_text_task import DenoisingTextTask

# The task class registers its own command-line options on the parser.
parser = argparse.ArgumentParser()
DenoisingTextTask.add_options(parser)
args = parser.parse_args()

task = DenoisingTextTask.setup_task(args)
# The vocabulary is inferred from the task's target dataset.
dictionary = Dictionary.infer_from_text(task.tgt_dataset)

# Wrap the source dataset in a TextLookupDataset first. The positional flags
# are passed exactly as before; their meaning is defined by TextLookupDataset
# and is not visible in this script.
lookup_dataset = TextLookupDataset(
    task.src_dataset, dictionary, True, args.lower, False, False, False
)
# Then layer the noise settings from the command line on top of it.
noisy_text = NoisyTextDataset(
    lookup_dataset,
    args.word_shuffle,
    args.noise_word_dropout,
    args.word_blank,
    args.bpe_symbol,
)

# Interactive preview: for every index of the noisy dataset, print the target
# entry, then the noisy entry rendered through dictionary.string(), and block
# until the user presses Enter.
for index in range(len(noisy_text)):
    target_entry = task.tgt_dataset[index]
    print(target_entry)
    noisy_entry = noisy_text[index]
    print(dictionary.string(noisy_entry))
    input()  # pause before showing the next pair