def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                        feats.append([float(v) for v in features])
                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))
                        row = 0
                        for pred in prediction:
                            fileout.write("{}\n".format(str(pred[1])))
                            row += 1
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                if ojob:
                    output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def print_unrolled_stats(unrolled_data):
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment, counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
        print()
    print(sentiment_counter)
    return counter
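# Hypothetical usage sketch for print_unrolled_stats above; the field names
# ('aspect', 'sentiment', 'sentence') come from the function body, the example
# records themselves are made up.
toy_data = [
    {'aspect': 'food',    'sentiment': 'positive', 'sentence': 'The pasta was great.'},
    {'aspect': 'service', 'sentiment': 'negative', 'sentence': 'The waiter was rude.'},
]
stats = print_unrolled_stats(toy_data)  # prints per-aspect sentiment counts and returns the counter dict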
def worker_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as tokl, \
            MosesTokenizer(args.target_lang) as tokr:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {}".format(job.__repr__()))
                nblock, filein_name, label = job
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                    logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                    for i in filein:
                        features = feature_extract(i, tokl, tokr, args)
                        for j in features:
                            fileout.write("{}".format(j))
                            fileout.write("\t")
                        fileout.write("{}".format(label))
                        fileout.write("\n")
                    ojob = (nblock, fileout.name)
                    fileout.close()
                    filein.close()
                output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    temp_lines = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        parts = i.strip().split("\t")
                        line = ""
                        temp_lines.append(i)
                        if len(parts) == 7:
                            # Last two columns are the language pair
                            if parts[-2] == args.source_lang and parts[-1] == args.target_lang:
                                line = "{}\t{}\n".format(parts[1], parts[3])
                            elif parts[-1] == args.source_lang and parts[-2] == args.target_lang:
                                # reversed language pair: swap source and target columns
                                line = "{}\t{}\n".format(parts[3], parts[1])
                            features = feature_extract(line, source_tokenizer, target_tokenizer, args)
                            feats.append([float(v) for v in features])
                        else:
                            logging.debug("Line not included in process: {}".format(i))
                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))
                        row = 0
                        for pred in prediction:
                            while not temp_lines[row].startswith("<tu "):
                                fileout.write(temp_lines[row])
                                row += 1
                            fileout.write("{}\t{}\n".format(temp_lines[row].strip("\n"), str(pred[1])))
                            row += 1
                    else:
                        for l in temp_lines:
                            fileout.write(l)
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def __init__(self):
    from mosestokenizer import MosesTokenizer
    self.tokenizer = MosesTokenizer('ru')
    # disable Moses escaping and aggressive hyphen splitting
    self.tokenizer.argv.append('-no-escape')  # " -> &quot;
    self.tokenizer.argv.remove('-a')          # - -> @-@
    self.tokenizer.restart()
def own_bleu_score(predictions, references, max_order=4, smooth=False):
    '''
    reference_corpus = []
    prediction_corpus = []
    for instance_id, reference_sents in references.items():
        try:
            prediction_sent = predictions[instance_id]
        except KeyError:
            logging.error("Missing prediction for instance '%s'.", instance_id)
            sys.exit(EXIT_STATUS_PREDICTION_MISSING)
        del predictions[instance_id]
        prediction_corpus.append(prediction_sent)
        reference_corpus.append(reference_sents)
    if len(predictions) > 0:
        logging.error("Found %d extra predictions, for example: %s",
                      len(predictions), ", ".join(list(predictions.keys())[:3]))
        sys.exit(EXIT_STATUS_PREDICTIONS_EXTRA)
    reference_length = 0
    translation_length = 0
    scores = []
    counter = 0
    for (references, translation) in zip(reference_corpus, prediction_corpus):
        if counter <= 4:
            print("Reference: ", references, "\nPrediction: ", translation, "\n")
            counter += 1
        scores.append(sentence_bleu(references, translation, weights=(0, 0, 0, 1)))
    '''
    # to be able to load punkt tokenizer from local folder even if on cluster
    original_dir = os.getcwd()
    execution_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(execution_dir)
    '''
    compl_ref = ""
    for ref in references:
        compl_ref += ref + " "
    references = nltk.word_tokenize(compl_ref)
    '''
    # predictions = nltk.word_tokenize(predictions[0].strip('.'))
    tokenizer = MosesTokenizer('en')
    predictions = tokenizer.tokenize(predictions[0].lower())
    references = [tokenizer.tokenize(reference.lower()) for reference in references]
    # change directory back after nltk tokenizers have been applied
    os.chdir(original_dir)
    # original bleu score uses constant weights
    # print(references[0])
    # scores = corpus_bleu([references], [predictions])
    scores = sentence_bleu(references, predictions, weights=(0.33, 0.33, 0.33))
    return scores
def read_sentence14_target(file_path, max_offset_len=83):
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
    root = etree.fromstring(raw)
    for sentence in root:
        example = dict()
        example["sentence"] = sentence.find('text').text.lower()
        # for RAN
        tokens = tk.tokenize(example['sentence'])
        terms = sentence.find('aspectTerms')
        if terms is None:
            continue
        example["aspect_sentiment"] = []
        example["left_right"] = []
        example['offset'] = []
        for c in terms:
            target = c.attrib['term'].lower()
            example["aspect_sentiment"].append((target, c.attrib['polarity']))
            # for td lstm
            left_index = int(c.attrib['from'])
            right_index = int(c.attrib['to'])
            example["left_right"].append((example['sentence'][:right_index],
                                          example['sentence'][left_index:],
                                          c.attrib['polarity']))
            # for RAN
            left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
            right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
            token_index = list(range(len(tokens)))
            token_length = float(len(token_index))
            for i in range(len(tokens)):
                if i < left_word_offset:
                    token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                elif i >= right_word_offset:
                    token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                else:
                    token_index[i] = 0
            token_index += [-1.] * (max_offset_len - len(tokens))
            example['offset'].append((token_index, target, c.attrib['polarity']))
        yield example
def moses_tokenize(text):
    from mosestokenizer import MosesTokenizer
    global MOSES_TOK
    if not MOSES_TOK:
        MOSES_TOK = MosesTokenizer('ru')
        # disable Moses escaping and aggressive hyphen splitting
        MOSES_TOK.argv.append('-no-escape')  # " -> &quot;
        MOSES_TOK.argv.remove('-a')          # - -> @-@
        MOSES_TOK.restart()
    chunks = MOSES_TOK(text)
    return find_substrings(chunks, text)
class WordTokenizer(BaseTokenizer):
    def __init__(self):
        self.tokenizer = MosesTokenizer()

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer(text.strip())

    def detokenize(self, tokens: List[str]) -> str:
        text = " ".join(tokens).strip()
        return text

    def close(self):
        self.tokenizer.close()
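# Hypothetical usage sketch for the WordTokenizer wrapper above, assuming the
# mosestokenizer backend is installed; exact token output depends on the Moses rules.
tok = WordTokenizer()
tokens = tok.tokenize("Hello, world!")   # e.g. ['Hello', ',', 'world', '!']
print(tok.detokenize(tokens))            # plain space join: 'Hello , world !'
tok.close()                              # shuts down the background Moses process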
def print_unrolled_stats_atsa(unrolled_data):
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))
    return counter
class MosesTokenizer:
    label = 'mosestokenizer'

    def __init__(self):
        from mosestokenizer import MosesTokenizer
        self.tokenizer = MosesTokenizer('ru')
        # disable Moses escaping and aggressive hyphen splitting
        self.tokenizer.argv.append('-no-escape')  # " -> &quot;
        self.tokenizer.argv.remove('-a')          # - -> @-@
        self.tokenizer.restart()

    def __call__(self, text):
        chunks = self.tokenizer(text)
        return find_substrings(chunks, text)
def __init__(self, server, servable_name, t2t_usr_dir, problem, data_dir, timeout_secs):
    super(EnZhNmtClient, self).__init__()  # pass self so the parent initialiser actually runs
    tf.logging.set_verbosity(tf.logging.INFO)
    validate_flags(server, servable_name)
    usr_dir.import_usr_dir(t2t_usr_dir)
    self.problem = registry.problem(problem)
    self.hparams = tf.contrib.training.HParams(data_dir=os.path.expanduser(data_dir))
    self.problem.get_hparams(self.hparams)
    self.request_fn = make_request_fn(server, servable_name, timeout_secs)
    self.moses_tokenizer = MosesTokenizer('en')
    self.moses_detokenizer = MosesDetokenizer('zh')
    if problem.endswith("_rev"):
        fname = "targets"
    else:
        fname = "inputs" if self.problem.has_inputs else "targets"
    self.input_encoder = self.problem.feature_info[fname].encoder
    if problem.endswith("_rev"):
        self.output_decoder = self.problem.feature_info["inputs"].encoder
    else:
        self.output_decoder = self.problem.feature_info["targets"].encoder
def kazakh_lemma_tokenizer(sent):
    klt = KazakhLemmatizer()
    tokens = []
    with MosesTokenizer('kk') as tokenize:
        for token in tokenize(sent):
            tokens.append(klt.lemmatize(token))
    return tokens
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                            # print("SENTENCE PAIR: %%{}%%".format(i))
                            # print(Features(features))  # debug
                            feats.append([float(v) for v in features])
                    predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                    filein.seek(0)
                    piter = iter(predictions)
                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            p = next(piter)
                            fileout.write(i.strip())
                            fileout.write("\t")
                            fileout.write(str(p[1]))
                            fileout.write("\n")
                        else:
                            fileout.write(i.strip("\n"))
                            fileout.write("\t0\n")
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                if ojob:
                    output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
def __init__(self, language_code='pt', nltk_stop_words_package='portuguese'):
    self.tokenize = MosesTokenizer(language_code)
    nltk.download('wordnet', quiet=False)
    self.lemmatizer = nltk.stem.WordNetLemmatizer()
    nltk.download(info_or_id='stopwords', quiet=False)
    self.stop_words = nltk.corpus.stopwords.words(nltk_stop_words_package)
def worker_process(i, jobs_queue, output_queue, args):
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {}".format(job.__repr__()))
            nblock, filein_name, label = job
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                for i in filein:
                    srcsen, trgsen = i.split("\t")[:2]
                    trgsen = trgsen.strip()
                    # print(str(srcsen) + " --- " + str(trgsen))
                    features = feature_extract(srcsen, trgsen, source_tokeniser, target_tokeniser, args)
                    for j in features:
                        fileout.write("{}".format(j))
                        fileout.write("\t")
                    fileout.write("{}".format(label))
                    fileout.write("\n")
                ojob = (nblock, fileout.name)
                fileout.close()
                filein.close()
            output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            source_tokeniser.close()
            target_tokeniser.close()
            break
def __init__(self, user_dict, src_vacob_model, tgt_vocab_model, server, servable_name, timeout_secs):
    tf.logging.set_verbosity(tf.logging.INFO)
    self.src_encoder = SpmTextEncoder(src_vacob_model)
    self.tgt_encoder = SpmTextEncoder(tgt_vocab_model)
    self.en_tokenizer = MosesTokenizer('en')
    jieba.load_userdict(user_dict)
    self.request_fn = make_request_fn(server, servable_name, timeout_secs)
    super(EnZhBertAlignClient, self).__init__(src_vacob_model, tgt_vocab_model, server,
                                              servable_name, timeout_secs)
def __init__(self, src_vacob_model, tgt_vocab_model, server, servable_name, timeout_secs):
    tf.logging.set_verbosity(tf.logging.INFO)
    self.src_encoder = SpmTextEncoder(src_vacob_model)
    self.tgt_encoder = SpmTextEncoder(tgt_vocab_model)
    self.en_tokenizer = MosesTokenizer('en')
    self.mecab = MeCab.Tagger("-Owakati")
    self.request_fn = make_request_fn(server, servable_name, timeout_secs)
    super(EnJaBertAlignClient, self).__init__(src_vacob_model, tgt_vocab_model, server,
                                              servable_name, timeout_secs)
def __init__(self):
    tf.logging.set_verbosity(tf.logging.INFO)
    validate_flags()
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
    self.problem = registry.problem(FLAGS.problem)
    self.hparams = tf.contrib.training.HParams(data_dir=os.path.expanduser(FLAGS.data_dir))
    self.problem.get_hparams(self.hparams)
    self.request_fn = make_request_fn()
    self.tokenizer = MosesTokenizer('en')
    self.moses_detokenizer = MosesDetokenizer('zh')
    # raw string so the regex escapes are not interpreted by Python itself
    self.delimiter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
def preprocess(inputs):
    sentences, max_tokens, lang = inputs
    tokenizer = MosesTokenizer(lang)
    result = []
    for sent in sentences:
        words = tokenizer(sent.strip())
        if len(words) > max_tokens:
            continue
        else:
            result.append(" ".join(words) + "\n")
    return result
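# Hypothetical usage sketch for preprocess above: the argument is a single
# (sentences, max_tokens, lang) tuple, which is convenient for multiprocessing.Pool.map.
batch = (["This is a short sentence.", "Another one."], 50, "en")
for line in preprocess(batch):
    print(line, end="")   # one tokenized sentence per line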
def preproc_europarl(args):
    """
    - tokenization
    - lower case
    - sub digit with 0
    - remove all punctuations
    - remove redundant spaces and empty lines
    - (optional) cut long sentences to a reasonable length
    """
    langs = args.input[args.input.rfind('.') + 1:].strip().split('-')
    # only 2 languages
    assert (len(langs) == 2)
    lang1, lang2 = langs
    tokenizer1 = MosesTokenizer(lang1)
    tokenizer2 = MosesTokenizer(lang2)
    # read corpus
    with open(args.input + '.{}'.format(lang1), 'r') as fin1, \
            open(args.input + '.{}'.format(lang2), 'r') as fin2:
        text1 = fin1.readlines()
        text2 = fin2.readlines()
        assert (len(text1) == len(text2))
    with open(args.input + '.{}.preproc'.format(lang1), 'w') as fout1, \
            open(args.input + '.{}.preproc'.format(lang2), 'w') as fout2:
        for i, line1 in tqdm(enumerate(text1), total=len(text1)):
            # each line is a sentence
            line1 = line1.strip()
            line2 = text2[i].strip()
            line1 = preproc_text(line1, tokenizer1)
            line2 = preproc_text(line2, tokenizer2)
            # remove empty lines
            if not line1 or not line2:
                continue
            fout1.write(line1 + '\n')
            fout2.write(line2 + '\n')
def process_corpus(embeddings_dictionary, corpus, vectors, language):
    """
    Cleans corpus using the dictionary of embeddings.
    Any word without an associated embedding in the dictionary is ignored.
    Adds '__target-language' and '__source-language' at the end of the words
    according to their language.
    """
    clean_corpus, clean_vectors, keys = [], {}, []
    words_we_want = set(embeddings_dictionary)
    tokenize = MosesTokenizer(language)
    for key, doc in enumerate(corpus):
        clean_doc = []
        words = tokenize(doc)
        for word in words:
            if word in words_we_want:
                clean_doc.append(word + "__%s" % language)
                # np.float is removed in recent NumPy releases; plain float is equivalent here
                clean_vectors[word + "__%s" % language] = np.array(vectors[word].split()).astype(float)
        if len(clean_doc) > 3 and len(clean_doc) < 25:
            keys.append(key)
            clean_corpus.append(" ".join(clean_doc))
    tokenize.close()
    return np.array(clean_corpus), clean_vectors, keys
def clean_corpus_suffix(corpus, language):
    """
    Adds '__target-language' and '__source-language' at the end of the words
    """
    clean_corpus = []
    tokenize = MosesTokenizer(language)
    for definition in corpus:
        definition = sub(r"'", "", definition)
        definition = sub(r"[^\w]", " ", definition)
        clean_doc = []
        words = tokenize(definition)
        for word in words:
            clean_doc.append(word + "__%s" % language)
        clean_corpus.append(" ".join(clean_doc))
    return clean_corpus
def get_tokenizer(cmd, lang="en"): if cmd == "moses": return MosesTokenizer(lang) elif cmd == "mecab": tagger = MeCab.Tagger("-Owakati") def mecab(text): return tagger.parse(text).strip().split() return mecab else: proc = ExternalTextProcessor(cmd.split()) def external(text): return proc.process(text).strip().split() return external
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like &#91; with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        from pytorch_pretrained_bert import BertTokenizer
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name == "OpenAI.BPE":
        tokenizer = OpenAIBPETokenizer()
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None, sourcespm=None, targetspm=None):
    self.bpe_source = None
    self.bpe_target = None
    self.sp_processor_source = None
    self.sp_processor_target = None
    self.sentences = []
    # load BPE model for pre-processing
    if sourcebpe:
        # print("load BPE codes from " + sourcebpe, flush=True)
        BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
        self.bpe_source = BPE(BPEcodes)
    if targetbpe:
        # print("load BPE codes from " + targetbpe, flush=True)
        BPEcodes = open(targetbpe, 'r', encoding="utf-8")
        self.bpe_target = BPE(BPEcodes)
    # load SentencePiece model for pre-processing
    if sourcespm:
        # print("load sentence piece model from " + sourcespm, flush=True)
        self.sp_processor_source = sentencepiece.SentencePieceProcessor()
        self.sp_processor_source.Load(sourcespm)
    if targetspm:
        # print("load sentence piece model from " + targetspm, flush=True)
        self.sp_processor_target = sentencepiece.SentencePieceProcessor()
        self.sp_processor_target.Load(targetspm)
    # pre- and post-processing tools
    self.tokenizer = None
    self.detokenizer = None
    # TODO: should we have support for other sentence splitters?
    # print("start pre- and post-processing tools")
    self.sentence_splitter = MosesSentenceSplitter(srclang)
    self.normalizer = MosesPunctuationNormalizer(srclang)
    if self.bpe_source:
        self.tokenizer = MosesTokenizer(srclang)
    if self.bpe_source:
        self.detokenizer = MosesDetokenizer(targetlang)
def tokenizer_moses(text, column='comment_text'):  # column for extracting from csv
    '''
    A proper wrapper for moses text preprocessing utilities,
    because they can't handle newlines

    text: string
    out: list
    '''
    result = []
    with MosesPunctuationNormalizer() as punct, MosesTokenizer('en') as tok:
        if column:
            texts = list(filter(None, text[column].lower().split('\n')))
        else:
            texts = text
        for t in texts:
            if len(t.strip()):
                norm = punct(t)
                tokens = tok(norm)
                result.extend(tokens)
    return result
def build_vocabulary(cls, corpus: list = None, file_path: str = None, max_vocab_size=30000, lang='en'):
    vocab = cls(lang=lang, max_vocab_size=max_vocab_size)
    counter = Counter()
    tokenizer = MosesTokenizer(lang=lang)
    if file_path is not None:
        with open(file_path, "rt") as f:
            # TODO: Make preprocessor
            corpus = f.readlines()
    for sentence in tqdm(corpus, desc="Build vocabulary"):
        words = tokenizer(sentence.strip())
        counter.update(words)
    for index, (k, v) in enumerate(counter.most_common(max_vocab_size - len(basic_tokens))):
        vocab.dictionary[index + len(basic_tokens)] = k
    vocab.reversed_dictionary = dict(zip(vocab.dictionary.values(), vocab.dictionary.keys()))
    return vocab
def __init__(self, mosestokenizer_language_code="en", store_data=False, spell_checker_lang=None, n_jobs=1): self.mosestokenizer_language_code = mosestokenizer_language_code self.splitsents = MosesSentenceSplitter( self.mosestokenizer_language_code) self.tokenize = MosesTokenizer(self.mosestokenizer_language_code) nltk.download('wordnet', quiet=False) self.lemmatizer = nltk.stem.WordNetLemmatizer() self.stop = False self.store_data = store_data if spell_checker_lang is None: logger.info("The spell checker is disabled.") self.spell_checker = None else: logger.info("The spell checker is enabled for %s." % (spell_checker_lang)) self.spell_checker = SpellChecker(language=spell_checker_lang, n_jobs=n_jobs)