import mosestokenizer as mtok


def preprocess(data_dict, src_lang, trg_lang, min_freq=0, short_list=0,
               max_sent_len=-1):
    src_text = []
    trg_text = []
    src_tokenizer = mtok.MosesTokenizer(src_lang)
    trg_tokenizer = mtok.MosesTokenizer(trg_lang)

    # Tokenize data
    for doc in data_dict:
        for i, t in enumerate(data_dict[doc]["source"]):
            data_dict[doc]["source"][i] = ' '.join(src_tokenizer(t))
            src_text.append(data_dict[doc]["source"][i])
        for i, t in enumerate(data_dict[doc]["target"]):
            data_dict[doc]["target"][i] = ' '.join(trg_tokenizer(t))
            trg_text.append(data_dict[doc]["target"][i])
    src_tokenizer.close()
    trg_tokenizer.close()

    # Train subword model
    subword_model, subword_src_vocab, subword_trg_vocab, src_text, trg_text = \
        train_subword_model(src_text, trg_text, nb_symbols=short_list)

    # Re-create the vocabulary with the nmtpy format
    src_vocab = create_vocab(src_text, min_freq, short_list)
    trg_vocab = create_vocab(trg_text, min_freq, short_list)

    # NOTE: put the processed text back into the data dict, dropping sentence
    # pairs that exceed max_sent_len (a negative max_sent_len means no limit).
    # Iterate over a copy of the keys so documents can be deleted safely.
    for doc in list(data_dict):
        nbsent = len(data_dict[doc]["source"])
        data_dict[doc]['source'] = []
        data_dict[doc]['target'] = []
        for i in range(nbsent):
            src = src_text.pop(0)
            trg = trg_text.pop(0)
            if max_sent_len < 0 or (len(src.split(' ')) <= max_sent_len
                                    and len(trg.split(' ')) <= max_sent_len):
                data_dict[doc]['source'].append(src)
                data_dict[doc]['target'].append(trg)
        if len(data_dict[doc]['source']) == 0:
            print("Document {} contains only overly long sentences, "
                  "removing it...".format(doc))
            del data_dict[doc]

    assert len(src_text) == 0, "preprocess: something is wrong with source text"
    assert len(trg_text) == 0, "preprocess: something is wrong with target text"
    return (data_dict, src_vocab, trg_vocab, subword_model,
            subword_src_vocab, subword_trg_vocab)
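# A minimal usage sketch with hypothetical data: `data_dict` is assumed to map
# a document id to parallel "source"/"target" sentence lists, and the helpers
# `train_subword_model` / `create_vocab` are assumed to be defined elsewhere
# in the project.
data_dict = {
    "doc1": {
        "source": ["Hello world.", "How are you?"],
        "target": ["Hallo Welt.", "Wie geht es dir?"],
    }
}
data_dict, src_vocab, trg_vocab, sw_model, sw_src_vocab, sw_trg_vocab = \
    preprocess(data_dict, "en", "de", min_freq=1, short_list=1000,
               max_sent_len=50)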
def __init__(self, lang):
    try:
        self._moses_tokenizer = mosestokenizer.MosesTokenizer(lang)
    except NameError as err:
        # The optional `mosestokenizer` module was never imported, so
        # referencing it raises NameError.
        logger.error("Install mosestokenizer to support moses tokenization")
        raise err
def __init__(self, lang: str, bpe_model=None):
    self.lang = lang
    self.word_tokenize = mosestokenizer.MosesTokenizer(self.lang)
    self.word_detokenize = mosestokenizer.MosesDetokenizer(self.lang)
    self.splitsents = mosestokenizer.MosesSentenceSplitter(self.lang)
    self.normalize = mosestokenizer.MosesPunctuationNormalizer(self.lang)
    if bpe_model:
        codes = codecs.open(bpe_model)
def tokenize(text, language, join=True):
    """Tokenize text with mosestokenizer."""
    tokenize = mosestokenizer.MosesTokenizer(language)
    tokens = tokenize(text)
    tokenize.close()
    if join:
        tokens = ' '.join(tokens)
    return tokens
def tokenize(text, language, join=True):
    """Tokenize text with mosestokenizer, caching one tokenizer per language."""
    if language not in tokenizers:
        tokenizers[language] = mosestokenizer.MosesTokenizer(language)
    tokenizer = tokenizers[language]
    tokens = tokenizer(text)
    # The cached tokenizer is deliberately left open for reuse.
    if join:
        tokens = ' '.join(tokens)
    return tokens
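# Unlike the previous variant, this one avoids respawning the underlying Moses
# Perl process on every call. A minimal sketch, assuming the module-level
# cache the function references:
tokenizers = {}

print(tokenize("Hello, world!", "en"))         # "Hello , world !"
print(tokenize("Hello, world!", "en", False))  # ['Hello', ',', 'world', '!']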
# Assumes module-level: from collections import Counter
# and: import mosestokenizer as mos
def __init__(self, config):
    self.SOS_token = config["SOS_token"]
    self.EOS_token = config["EOS_token"]
    self.UNK_token = config["UNK_token"]
    self.mask_token = config["mask_token"]
    self.max_features = config["en_voc"]
    self.word2index = {"SOS": self.SOS_token,
                       "EOS": self.EOS_token,
                       "UNK": self.UNK_token,
                       "MASK": self.mask_token}
    self.index2word = {self.SOS_token: "SOS",
                       self.EOS_token: "EOS",
                       self.UNK_token: "UNK",
                       self.mask_token: "MASK"}
    self.word2count = Counter()
    self.added = set()
    self.n_words = 4  # the four special tokens defined above
    self.tokenizer = mos.MosesTokenizer("en")
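# A minimal config sketch (hypothetical values; only the keys read above are
# required). `en_voc` caps the vocabulary size:
config = {
    "SOS_token": 0,
    "EOS_token": 1,
    "UNK_token": 2,
    "mask_token": 3,
    "en_voc": 30000,
}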
from bs4 import BeautifulSoup
import mosestokenizer
import os

outdir = 'processed_target_extraction_dedup_noleak'

# Ensure the datasets appear in chronological order: repeated sentences with
# different labelling are taken from newer datasets during deduplication.
datasets = {'train_valid': ['ABSA-15_Restaurants_Train_Final.xml',
                            'ABSA16_Restaurants_Train_SB1_v2.xml',
                            'Restaurants_Train_v2.xml'],
            'test': ['ABSA15_Restaurants_Test.xml',
                     'Restaurants_Test_Data_phaseB.xml']}

sentences = []
labels = []
tokenize = mosestokenizer.MosesTokenizer('en')


def processSentence(sentence):
    text = sentence.text.strip('\n')
    opinions = sentence.findAll('opinion')
    target_str = 'target'
    if not opinions:
        # In this file, opinions are called 'aspectTerm's and targets 'term's.
        opinions = sentence.findAll('aspectterm')
        target_str = 'term'
    targets = []  # (target_str, beg, end)
    for op in opinions:
        if op[target_str] == 'NULL':
            continue
        else:
            beg = int(op['from'])  # text.find(op['target'])
from torchtext import data, datasets
import mosestokenizer
import torch.nn as nn
import torch.optim as optim
import torch
from torch import Tensor
from typing import Tuple

tokenizer_en = mosestokenizer.MosesTokenizer('en')
tokenizer_de = mosestokenizer.MosesTokenizer('de')

BOS = '<s>'
EOS = '</s>'
PAD = '<pad>'

src = data.Field(sequential=True,
                 use_vocab=True,
                 pad_token=PAD,
                 tokenize=tokenizer_en,
                 lower=True,
                 include_lengths=True)  # unk=0, pad=1

tgt = data.Field(sequential=True,
                 use_vocab=True,
                 pad_token=PAD,
                 tokenize=tokenizer_de,
                 lower=True,
                 init_token=BOS,
from torchtext.data import Field, BucketIterator, interleave_keys
from torchtext.datasets import TranslationDataset
from torchtext.data import Example
import mosestokenizer
import torch
from typing import Tuple
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

tokenizer_en = mosestokenizer.MosesTokenizer('en')
tokenizer_fr = mosestokenizer.MosesTokenizer('fr')

# src = Field(sequential=True,
#             use_vocab=True,
#             pad_token=PAD,
#             #tokenize=tokenizer_en,
#             lower=True,
#             batch_first=True)

BOS = '<s>'
EOS = '</s>'
PAD = '<pad>'

src = Field(sequential=True,
            use_vocab=True,
            pad_token=PAD,
            tokenize=tokenizer_en,
            lower=True,
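# Note on the pattern in the two (truncated) snippets above: a MosesTokenizer
# instance is itself callable (str -> list of tokens), which is why it can be
# passed directly as torchtext's `tokenize` argument. A quick check:
print(tokenizer_en("Hello, world!"))  # ['Hello', ',', 'world', '!']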
import re
from itertools import chain

import mosestokenizer
import pymorphy2


class MotherJokeGenerator:
    """Generator for "your mother" jokes in the Russian language."""

    morph = pymorphy2.MorphAnalyzer()
    tokenizer = mosestokenizer.MosesTokenizer()
    detokenizer = mosestokenizer.MosesDetokenizer()
    joke_beginning = "твоя мамка"
    duplicating_punctuation_remover = re.compile(r"(\W)\1+")
    punctuation_space_remover = re.compile(r"\s([.,:;?!](?:\s|$))")
    pair_punctuation_space_remover = re.compile(
        r'(["\[({<])\s*(.*?)\s*(["\])}>])')
    unsuitable_verbs = "быть бывать".split()
    suitable_verb_form = "буду".split()
    pronouns_p1 = "я меня мне меня мной мы нас нам нас нами".split()
    pronouns_p2 = "ты тебя тебе тебя тобой вы вас вам вас вами".split()
    prepositions_p1 = "обо со".split()
    prepositions_p2 = "о с".split()
    pronoun_flip_map = dict(
        chain(zip(pronouns_p1, pronouns_p2), zip(pronouns_p2, pronouns_p1)))
    preposition_flip_map = dict(
        chain(zip(prepositions_p1, prepositions_p2),
              zip(prepositions_p2, prepositions_p1)))
    preposition_flip_pronouns = set("мне тебе мной тобой".split())

    def __init__(
            self,
            min_token_count=3,
            max_token_count=15,
            min_verb_length=3,
            min_words_after_verb=1,
    ):
        """
        :param min_token_count: do not process sentences that have fewer
            than that number of tokens
        :param max_token_count: do not process sentences that have more
            than that number of tokens
        :param min_verb_length: limits the minimal verb length (saves some
            time by skipping short words)
        :param min_words_after_verb: minimal number of words after a verb
            (the joke is usually better with some trailing words)
        """
        super().__init__()
        self.min_token_count = min_token_count
        self.max_token_count = max_token_count
        self.min_word_length = min_verb_length
        self.min_words_after_verb = min_words_after_verb

    def get_joke(self, sentence):
        """
        Generate a joke from the sentence if it contains verbs.
        Morphological analysis is performed to find verbs and transform them.

        :returns: joke string, or None if unable to generate a joke
        """
        clean_sentence = self.duplicating_punctuation_remover.sub(
            r"\1", sentence.lower()).strip(",.!?;:")
        tokens = self.tokenizer(clean_sentence)
        tokens_count = len(tokens)
        if self.min_token_count > tokens_count or tokens_count > self.max_token_count:
            return None

        inflected_verb_index = -1
        inflected_verb = None
        # Find the last verb in the sentence and change its form.
        # Enumeration starts from 1 for regular-index calculation convenience.
        for reverse_index, word in enumerate(reversed(tokens), 1):
            if len(word) < self.min_word_length:
                continue
            # Take the most probable morphological analysis result.
            word_morph = self.morph.parse(word)[0]
            is_verb = word_morph.tag.POS == "VERB"
            if not is_verb:
                continue
            if (word_morph.normal_form in self.unsuitable_verbs
                    and word not in self.suitable_verb_form):
                continue
            if reverse_index <= self.min_words_after_verb:
                break
            inflected_verb_morph = self._inflect_verb(word_morph)
            if inflected_verb_morph:
                inflected_verb_index = len(tokens) - reverse_index
                inflected_verb = inflected_verb_morph.word
            # inflected_verb may still be None here if inflecting the verb
            # failed; the sentence is ignored in that case.
            break
        return (self._compile_joke(tokens[inflected_verb_index + 1:],
                                   inflected_verb)
                if inflected_verb else None)

    @staticmethod
    def _inflect_verb(verb_morph):
        """Change the form of the verb to agree with the joke's beginning."""
        tense = verb_morph.tag.tense
        inflected_verb_morph = (
            verb_morph.inflect({"VERB", "femn"}) if tense == "past"
            else verb_morph.inflect({"VERB", "sing", "3per", tense})
            if tense else None)
        return inflected_verb_morph

    def _flip_pronouns(self, rest_of_joke):
        """Swap first- and second-person pronouns; the joke usually makes
        more sense that way round."""
        for index, word in enumerate(rest_of_joke):
            if word in self.pronoun_flip_map:
                rest_of_joke[index] = self.pronoun_flip_map[word]
            prev_word = rest_of_joke[index - 1]
            if (index > 0 and word in self.preposition_flip_pronouns
                    and prev_word in self.preposition_flip_map):
                rest_of_joke[index - 1] = self.preposition_flip_map[prev_word]

    def _compile_joke(self, remaining_tokens, inflected_verb):
        """Final preparations and formatting."""
        self._flip_pronouns(remaining_tokens)
        joke_ending = self.detokenizer(remaining_tokens)
        joke = f"{self.joke_beginning} {inflected_verb} {joke_ending}"
        joke = self.punctuation_space_remover.sub(r"\1", joke)
        joke = self.pair_punctuation_space_remover.sub(r"\1\2\3", joke)
        return joke
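# A minimal usage sketch (the input sentence is hypothetical):
generator = MotherJokeGenerator()
joke = generator.get_joke("я читаю книгу каждый вечер")
if joke:
    print(joke)  # should print something like "твоя мамка читает книгу каждый вечер"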
import os
import numpy as np
import time
import re
import pickle as pkl
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import mosestokenizer

puncnormalize = mosestokenizer.MosesPunctuationNormalizer("en")
# NOTE: despite its name, `normalize` here is a tokenizer;
# `puncnormalize` above does the punctuation normalization.
normalize = mosestokenizer.MosesTokenizer("en")


def read_in_file(path, labeled=True):
    file = open(path, 'r', encoding='utf-8')
    qids = []
    questions = []
    labels = []
    line = file.readline()  # header line
    print(line)
    cnt = 0
    while True:
        line = file.readline()
        if line:
            try:
                qid, question = line.split(',', 1)
                if labeled: