def setUp(self) -> None:
    """Load a pre-trained model from ./test/trained_model and build the Translator under test."""
    path_to_model = os.path.expanduser(os.path.join(".", "test", "trained_model"))
    print(path_to_model)
    use_cuda = False  # tests always run on CPU
    device = "cuda" if use_cuda else "cpu"
    MAX_LEN = 30
    BEAM_SIZE = 1  # single source of truth for the beam width (logged and used below)
    path_to_exp = os.path.join(path_to_model)
    path_to_model = os.path.join(path_to_exp, "model.pkl")
    print(os.path.join(path_to_exp, "experiment.pkl"))
    # Restore the experiment configuration the model was trained with
    experiment = torch.load(os.path.join(path_to_exp, "experiment.pkl"))
    experiment = Experiment(experiment["args"])
    experiment.cuda = use_cuda
    # Pickled torchtext fields; tokenize is re-attached because it is not picklable
    SRC_vocab = torch.load(os.path.join(path_to_exp, "src.pkl"))
    TRG_vocab = torch.load(os.path.join(path_to_exp, "trg.pkl"))
    src_tokenizer = get_custom_tokenizer(experiment.get_src_lang(), "w", prepro=True)
    trg_tokenizer = get_custom_tokenizer(experiment.get_trg_lang(), "w", prepro=True)
    SRC_vocab.tokenize = src_tokenizer.tokenize
    TRG_vocab.tokenize = trg_tokenizer.tokenize
    tokens_bos_eos_pad_unk = [
        TRG_vocab.vocab.stoi[SOS_TOKEN], TRG_vocab.vocab.stoi[EOS_TOKEN],
        TRG_vocab.vocab.stoi[PAD_TOKEN], TRG_vocab.vocab.stoi[UNK_TOKEN]
    ]
    experiment.src_vocab_size = len(SRC_vocab.vocab)
    experiment.trg_vocab_size = len(TRG_vocab.vocab)
    model = get_nmt_model(experiment, tokens_bos_eos_pad_unk)
    model.load_state_dict(torch.load(path_to_model))
    model = model.to(device)
    logger = Logger(path_to_exp, "live_transl.log")
    logger.log("Live translation: {}".format(
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")), stdout=False)
    # BUG FIX: previously logged the hard-coded value 5 while the Translator
    # was constructed with beam_size=1; log the beam width actually used.
    logger.log("Beam width: {}".format(BEAM_SIZE))
    self.translator = Translator(model, SRC_vocab, TRG_vocab, logger,
                                 src_tokenizer, device,
                                 beam_size=BEAM_SIZE, max_len=MAX_LEN)
def test_false_language(self):
    """An unknown language code must still yield a usable word-level tokenizer."""
    tok = get_custom_tokenizer("adagawe", prepro=True, mode="w")
    self.assertEqual(tok.lang, "adagawe")
    try:
        # Preferred outcome: spaCy falls back to its multi-language pipeline.
        self.assertIsInstance(tok, SpacyTokenizer)
        self.assertIsInstance(tok.nlp, spacy.lang.xx.MultiLanguage)
    except AssertionError:
        # Otherwise the factory must have produced the plain fast tokenizer.
        self.assertIsInstance(tok, FastTokenizer)
    self.assertIs(tok.only_tokenize, True)
def _tokenize_lines(lines, tokenizer, logger, side):
    """Tokenize *lines* with *tokenizer*, logging every tokenized sentence.

    :param lines: list of raw sentence strings
    :param tokenizer: a SpacyTokenizer or FastTokenizer instance
    :param logger: Logger that persists the tokenized sentences to file
    :param side: "source" or "target"; used only in the progress message
    :return: list of whitespace-joined token strings, one per input line
    """
    tokenized = []
    if isinstance(tokenizer, SpacyTokenizer):
        print("Tokenization for {} sequences is performed with spaCy".format(side))
        # Only tokenization is needed, so the NER pipe is disabled for speed
        with tokenizer.nlp.disable_pipes('ner'):
            for i, doc in enumerate(tokenizer.nlp.pipe(lines, batch_size=1000)):
                tok_doc = ' '.join([tok.text for tok in doc])
                tokenized.append(tok_doc)
                logger.log(tok_doc, stdout=True if i % 100000 == 0 else False)
    else:
        print("Tokenization for {} sequences is performed with FastTokenizer".format(side))
        for i, sent in enumerate(lines):
            tok_sent = ' '.join(tokenizer.tokenize(sent))
            tokenized.append(tok_sent)
            logger.log(tok_sent, stdout=True if i % 100000 == 0 else False)
    return tokenized


def raw_preprocess(parser):
    """Download, convert, tokenize, filter and split the Europarl corpus.

    Pipeline: fetch the tmx archive for <lang_code>-en, convert it to parallel
    text, tokenize both sides, keep sentence pairs whose lengths fall within
    [MIN_LEN, MAX_LEN] tokens, then persist train/val/test (and samples) splits.

    :param parser: parsed CLI arguments; only ``parser.lang_code`` is used
    :raises SystemExit: on a missing/invalid language code or a failed download
    """
    # configurations
    CORPUS_NAME = "europarl"
    lang_code = parser.lang_code.lower()
    if lang_code == "en":
        raise SystemExit(
            "English is the default language. Please provide second language!")
    if not lang_code:
        raise SystemExit("Empty language not allowed!")
    # Download the raw tmx file
    try:
        print("Trying to download the file ...")
        maybe_download_and_extract_europarl(language_code=lang_code, tmx=True)
    except urllib.error.HTTPError as e:
        print(e)
        raise SystemExit(
            "Please download the parallel corpus manually from: http://opus.nlpl.eu/ | Europarl > Statistics and TMX/Moses Download "
            "\nby selecting the data from the upper-right triangle (e.g. en > de])")
    path_to_raw_file = os.path.join(DATA_DIR_RAW, CORPUS_NAME, lang_code)
    MAX_LEN, MIN_LEN = 30, 2  # min length is by default 2 tokens
    file_name = lang_code + "-" + "en" + ".tmx"
    COMPLETE_PATH = os.path.join(path_to_raw_file, file_name)
    print(COMPLETE_PATH)
    STORE_PATH = os.path.join(os.path.expanduser(DATA_DIR_PREPRO), CORPUS_NAME,
                              lang_code, "splits", str(MAX_LEN))
    os.makedirs(STORE_PATH, exist_ok=True)
    start = time.time()
    output_file_path = os.path.join(DATA_DIR_PREPRO, CORPUS_NAME, lang_code)

    # Conversion tmx > text
    converter = Converter(output=FileOutput(output_file_path))
    converter.convert([COMPLETE_PATH])
    print("Converted lines:", converter.output_lines)
    print("Extraction took {} minutes to complete.".format(
        convert_time_unit(time.time() - start)))

    target_file = "bitext.{}".format(lang_code)
    src_lines, trg_lines = [], []
    # Read converted lines for further preprocessing, dropping pairs where
    # either side is empty. (The handle is named trg_file so it no longer
    # shadows the target_file filename string.)
    with open(os.path.join(output_file_path, "bitext.en"), 'r', encoding="utf8") as src_file, \
            open(os.path.join(output_file_path, target_file), 'r', encoding="utf8") as trg_file:
        for src_line, trg_line in zip(src_file, trg_file):
            src_line = src_line.strip()
            trg_line = trg_line.strip()
            if src_line != "" and trg_line != "":
                src_lines.append(src_line)
                trg_lines.append(trg_line)

    ### tokenize lines ####
    assert len(src_lines) == len(trg_lines), "Lines should have the same lengths."
    TOKENIZATION_MODE = "w"
    PREPRO_PHASE = True
    # Get tokenizers
    src_tokenizer = get_custom_tokenizer("en", TOKENIZATION_MODE, prepro=PREPRO_PHASE)
    trg_tokenizer = get_custom_tokenizer(lang_code, TOKENIZATION_MODE, prepro=PREPRO_PHASE)
    # Creates loggers to log tokenized objects
    src_logger = Logger(output_file_path, file_name="bitext.tok.en")
    trg_logger = Logger(output_file_path, file_name="bitext.tok.{}".format(lang_code))
    # Start the tokenisation process
    temp_src_toks = _tokenize_lines(src_lines, src_tokenizer, src_logger, "source")
    # BUG FIX: in the FastTokenizer branch the target tokens were previously
    # appended to the *source* list and logged with the source logger, leaving
    # temp_trg_toks empty (and the zip below would then produce an empty corpus).
    temp_trg_toks = _tokenize_lines(trg_lines, trg_tokenizer, trg_logger, "target")

    # Reduce lines by max_len
    print("Reducing corpus to sequences of min length {} max length: {}".format(
        MIN_LEN, MAX_LEN))
    filtered_src_lines, filtered_trg_lines = [], []
    for src_l, trg_l in zip(temp_src_toks, temp_trg_toks):
        ### remove possible duplicate spaces
        src_l_s = re.sub(' +', ' ', src_l)
        trg_l_s = re.sub(' +', ' ', trg_l)
        if src_l_s != "" and trg_l_s != "":
            src_l_spl, trg_l_spl = src_l_s.split(" "), trg_l_s.split(" ")
            if len(src_l_spl) <= MAX_LEN and len(trg_l_spl) <= MAX_LEN:
                if len(src_l_spl) >= MIN_LEN and len(trg_l_spl) >= MIN_LEN:
                    filtered_src_lines.append(' '.join(src_l_spl))
                    filtered_trg_lines.append(' '.join(trg_l_spl))
    assert len(filtered_src_lines) == len(filtered_trg_lines)

    src_lines, trg_lines = filtered_src_lines, filtered_trg_lines
    print("Splitting files...")
    train_data, val_data, test_data, samples_data = split_data(src_lines, trg_lines)
    persist_txt(train_data, STORE_PATH, "train.tok", exts=(".en", "." + lang_code))
    persist_txt(val_data, STORE_PATH, "val.tok", exts=(".en", "." + lang_code))
    persist_txt(test_data, STORE_PATH, "test.tok", exts=(".en", "." + lang_code))
    if lang_code != "de":
        # for the German language, sample files are versioned with the program
        print("Generating samples files...")
        persist_txt(samples_data, STORE_PATH, file_name="samples.tok",
                    exts=(".en", "." + lang_code))
    print("Total time:", convert_time_unit(time.time() - start))
def translate(path="", predict_from_file="", beam_size=5):
    """Translate interactively from stdin (or from a file) with a trained model.

    At the prompt, entering ``#<n>`` changes the beam width for subsequent
    translations; ``q`` or ``quit`` exits the loop.

    :param path: path to the experiment directory (model.pkl, experiment.pkl, ...)
    :param predict_from_file: optional path to a text file to translate instead
                              of reading from stdin
    :param beam_size: initial beam width used during decoding
    :return: [experiment, model, SRC_vocab, TRG_vocab, src_tokenizer,
              trg_tokenizer, logger] on success, False on setup failure
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    FIXED_WORD_LEVEL_LEN = 30
    if not path:
        print("Please provide path to model!")
        return False
    path_to_exp = os.path.expanduser(path)
    print("Using experiment from: ", path_to_exp)
    path_to_model = os.path.join(path_to_exp, "model.pkl")
    try:
        experiment = torch.load(os.path.join(path_to_exp, "experiment.pkl"))
        experiment = Experiment(experiment["args"])
        experiment.cuda = use_cuda
    except FileNotFoundError as e:
        print("Wrong path. File not found: ", e)
        return False
    logger_file_name = experiment.rnn_type + "_live_translations.log"
    logger = Logger(path_to_exp, file_name=logger_file_name)
    try:
        SRC_vocab = torch.load(os.path.join(path_to_exp, "src.pkl"))
        TRG_vocab = torch.load(os.path.join(path_to_exp, "trg.pkl"))
    except Exception as e:
        # Fall back to rebuilding the vocabularies from the experiment config
        print(
            "Error while loading vocabularies: {}\nLoading vocabularies based on experiment configuration..."
            .format(e))
        train_prepos = get_vocabularies_and_iterators(experiment)
        SRC_vocab, TRG_vocab = train_prepos[0], train_prepos[1]
        logger.pickle_obj(SRC_vocab, "src")
        logger.pickle_obj(TRG_vocab, "trg")
    tok_level = "w"  # word-level tokenization
    # FIX: tok_level was previously defined but never used ("w" was hard-coded)
    src_tokenizer = get_custom_tokenizer(experiment.get_src_lang(), tok_level, prepro=True)
    trg_tokenizer = get_custom_tokenizer(experiment.get_trg_lang(), tok_level, prepro=True)
    MAX_LEN = FIXED_WORD_LEVEL_LEN
    SRC_vocab.tokenize = src_tokenizer.tokenize
    TRG_vocab.tokenize = trg_tokenizer.tokenize
    tokens_bos_eos_pad_unk = [
        TRG_vocab.vocab.stoi[SOS_TOKEN], TRG_vocab.vocab.stoi[EOS_TOKEN],
        TRG_vocab.vocab.stoi[PAD_TOKEN], TRG_vocab.vocab.stoi[UNK_TOKEN]
    ]
    experiment.src_vocab_size = len(SRC_vocab.vocab)
    experiment.trg_vocab_size = len(TRG_vocab.vocab)
    model = get_nmt_model(experiment, tokens_bos_eos_pad_unk)
    try:
        model.load_state_dict(torch.load(path_to_model))
    except FileNotFoundError as e:
        print("Wrong path. File not found: ", e)
        # FIX: return False for consistency with the other setup-failure paths
        return False
    except RuntimeError as err:
        # FIX: the exception was previously bound to `re`, shadowing the re module
        print("CUDA Error:", err)
        print("Loading model with CPU support...")
        model.load_state_dict(
            torch.load(path_to_model, map_location=torch.device("cpu")))
    model = model.to(device)
    logger.log("Live translation: {}".format(
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")), stdout=False)
    logger.log("Beam width: {}".format(beam_size))
    translator = Translator(model, SRC_vocab, TRG_vocab, logger, src_tokenizer,
                            device, beam_size, max_len=MAX_LEN)
    if predict_from_file:
        translator.predict_from_text(predict_from_file)
    else:
        input_sequence = ""
        while True:
            try:
                try:
                    input_sequence = input("SRC >>> ")
                    # "#<n>" updates the beam width, then prompts again
                    if input_sequence.lower().startswith("#"):
                        bs = input_sequence.split("#")[1]
                        try:
                            beam_size = int(bs)
                            logger.log("New Beam width: {}".format(beam_size))
                            input_sequence = input("SRC >>> ")
                        except ValueError:
                            input_sequence = input("SRC >>> ")
                except ValueError as e:
                    print("An error has occurred: {}. Please restart program!".
                          format(e))
                    return False
                # Check if it is quit case
                if input_sequence == 'q' or input_sequence == 'quit':
                    break
                translator.set_beam_size(beam_size)
                out = translator.predict_sentence(input_sequence)
                if out:
                    logger.log("-" * 35, stdout=True)
                else:
                    print("Error while translating!")
            except KeyError:
                # A token missing from the vocabulary surfaced during lookup
                print("Error: Encountered unknown word.")
    return [
        experiment, model, SRC_vocab, TRG_vocab, src_tokenizer, trg_tokenizer,
        logger
    ]
def get_vocabularies_and_iterators(experiment, data_dir=None, max_len=30):
    """ Creates vocabularies and iterators for the experiment

    :param experiment: the Experiment object including all settings about the experiment
    :param data_dir: the directory where data is stored in. If None, default is applied
    :param max_len: the max length, default is the sentence max length considered during the tokenization process
    :return: src vocabulary, trg vocabulary, datasets and iterators + sample iterator if dataset europarl is used
    """
    device = experiment.get_device()

    #### Create torchtext fields ####### SRC, TRG
    voc_limit = experiment.voc_limit
    min_freq = experiment.min_freq
    corpus = experiment.corpus
    language_code = experiment.lang_code
    reduce = experiment.reduce
    print("Vocabulary limit:", voc_limit)
    reverse_input = experiment.reverse_input
    print("Source reversed:", reverse_input)
    print("Required samples:")
    print(experiment.train_samples, experiment.val_samples,
          experiment.test_samples)

    # Europarl splits are pre-tokenized on disk; other corpora tokenize on the fly
    PREPRO = False if corpus == "europarl" else True
    MODE = "w"
    src_tokenizer, trg_tokenizer = get_custom_tokenizer(
        "en", mode=MODE, prepro=PREPRO), get_custom_tokenizer(language_code,
                                                              mode=MODE,
                                                              prepro=PREPRO)

    src_vocab = Field(tokenize=lambda s: src_tokenizer.tokenize(s),
                      include_lengths=False,
                      init_token=None,
                      eos_token=None,
                      pad_token=PAD_TOKEN,
                      unk_token=UNK_TOKEN,
                      lower=True)
    trg_vocab = Field(tokenize=lambda s: trg_tokenizer.tokenize(s),
                      include_lengths=False,
                      init_token=SOS_TOKEN,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      unk_token=UNK_TOKEN,
                      lower=True)
    print("Fields created!")

    ####### create splits ##########
    if corpus == "europarl":
        root = os.path.expanduser(DATA_DIR_PREPRO)
        if not data_dir:
            data_dir = os.path.join(root, corpus, language_code, "splits",
                                    str(max_len))  # local directory
        # check if files have been preprocessed
        try:
            files = os.listdir(data_dir)
            # NOTE(review): this branch only warns and falls through to loading,
            # which will then fail on the missing split — confirm whether an
            # exit(-1) is intended here as in the FileNotFoundError branch.
            if len(files) < 8:
                print(
                    "ERROR: Not enough training files found at {}!\nTraining the model on the Europarl dataset requires train, val, test and samples splits for each language!"
                    .format(data_dir))
                # BUG FIX (typo): "drerun" -> "rerun"
                print(
                    "Please rerun the script 'preprocess.py' for the given <lang_code>!"
                )
        except FileNotFoundError:
            print("ERROR: Training files not found at {}!".format(data_dir))
            print(
                "Please run the 'preprocess.py' script for the given <lang_code> before training the model!"
            )
            exit(-1)

        print("Loading data...")
        start = time.time()
        file_type = experiment.tok
        exts = ("." + experiment.get_src_lang(),
                "." + experiment.get_trg_lang())
        train, val, test = Seq2SeqDataset.splits(fields=(src_vocab, trg_vocab),
                                                 exts=exts,
                                                 train="train." + file_type,
                                                 validation="val." + file_type,
                                                 test="test." + file_type,
                                                 path=data_dir,
                                                 reduce=reduce,
                                                 truncate=experiment.truncate)
        ### samples is used to check translations during the training phase
        samples = Seq2SeqDataset.splits(fields=(src_vocab, trg_vocab),
                                        exts=exts,
                                        train="samples." + file_type,
                                        validation="",
                                        test="",
                                        path=data_dir)
        end = time.time()
        print("Duration: {}".format(convert_time_unit(end - start)))
        print("Total number of sentences: {}".format(
            (len(train) + len(val) + len(test))))
    else:
        #### Training on IWSLT torchtext corpus #####
        print("Loading data...")
        start = time.time()
        path = os.path.expanduser(os.path.join(DATA_DIR_PREPRO, "iwslt"))
        os.makedirs(path, exist_ok=True)
        exts = (".en", ".de") if experiment.get_src_lang() == "en" else (".de",
                                                                         ".en")
        ## see: https://lukemelas.github.io/machine-translation.html
        train, val, test = datasets.IWSLT.splits(
            root=path,
            exts=exts,
            fields=(src_vocab, trg_vocab),
            filter_pred=lambda x: max(len(vars(x)['src']), len(vars(x)['trg'])
                                      ) <= experiment.truncate)
        samples = None  # IWSLT has no dedicated samples split
        end = time.time()
        print("Duration: {}".format(convert_time_unit(end - start)))
        print("Total number of sentences: {}".format(
            (len(train) + len(val) + len(test))))

    # Build vocabularies from the training split, optionally capped at voc_limit
    if voc_limit > 0:
        src_vocab.build_vocab(train, min_freq=min_freq, max_size=voc_limit)
        trg_vocab.build_vocab(train, min_freq=min_freq, max_size=voc_limit)
        print("Vocabularies created!")
    else:
        src_vocab.build_vocab(train, min_freq=min_freq)
        trg_vocab.build_vocab(train, min_freq=min_freq)
        print("Vocabularies created!")

    #### Iterators #####
    # Create iterators to process text in batches of approx. the same length
    train_iter = data.BucketIterator(train,
                                     batch_size=experiment.batch_size,
                                     device=device,
                                     repeat=False,
                                     sort_key=lambda x:
                                     (len(x.src), len(x.trg)),
                                     shuffle=True)
    val_iter = data.BucketIterator(val,
                                   1,
                                   device=device,
                                   repeat=False,
                                   sort_key=lambda x: (len(x.src)),
                                   shuffle=True)
    test_iter = data.Iterator(test,
                              batch_size=1,
                              device=device,
                              repeat=False,
                              sort_key=lambda x: (len(x.src)),
                              shuffle=False)

    # BUG FIX: samples is None on the IWSLT path, so subscripting it
    # (samples[0].examples) raised TypeError; guard before indexing.
    if samples is not None and samples[0].examples:
        samples_iter = data.Iterator(samples[0],
                                     batch_size=1,
                                     device=device,
                                     repeat=False,
                                     shuffle=False,
                                     sort_key=lambda x: (len(x.src)))
    else:
        samples_iter = None

    return src_vocab, trg_vocab, train_iter, val_iter, test_iter, train, val, test, samples, samples_iter
def test_factory_spacy(self):
    """German ('de') must map to a spaCy-backed word tokenizer."""
    tok = get_custom_tokenizer("de", prepro=True, mode="w")
    self.assertIsInstance(tok, SpacyTokenizer)
    self.assertEqual(tok.lang, "de")
    self.assertIsInstance(tok.nlp, spacy.lang.de.German)
    self.assertIs(tok.only_tokenize, True)
def test_spacy_tokenizer(self):
    """A simple German sentence tokenizes into a list with one token per word."""
    tokenizer = get_custom_tokenizer("de", prepro=True, mode="w")
    self.assertIsInstance(tokenizer, SpacyTokenizer)
    test_string = "das ist ein Satz"
    # Tokenize once and reuse the result instead of tokenizing twice
    tokens = tokenizer.tokenize(test_string)
    self.assertIsInstance(tokens, list)
    # BUG FIX: assertIs compared the length by object identity, which only
    # passes thanks to CPython's small-int caching; assertEqual is correct.
    self.assertEqual(len(tokens), 4)
def test_char_mode(self):
    """Mode 'c' must produce a character-based tokenizer."""
    char_tok = get_custom_tokenizer("xx", prepro=False, mode="c")
    self.assertIsInstance(char_tok, CharBasedTokenizer)
def test_factory_split_xx(self):
    """Language 'xx' without preprocessing must fall back to a plain split tokenizer."""
    tok = get_custom_tokenizer("xx", prepro=False, mode="w")
    self.assertIsInstance(tok, SplitTokenizer)
    self.assertEqual(tok.lang, "xx")
    self.assertIs(tok.only_tokenize, True)
def test_factory_spacy_en(self):
    """English ('en') must map to a spaCy-backed word tokenizer."""
    tok = get_custom_tokenizer("en", prepro=True, mode="w")
    self.assertIsInstance(tok, SpacyTokenizer)
    self.assertEqual(tok.lang, "en")
    self.assertIsInstance(tok.nlp, spacy.lang.en.English)
    self.assertIs(tok.only_tokenize, True)