def __init__(self, lang: str = 'en', lower_case: bool = True,
             romanize: bool = False, descape: bool = False):
    assert lower_case, 'lower case is needed by all the models'

    if lang in ('cmn', 'wuu', 'yue'):
        lang = 'zh'
    if lang == 'jpn':
        lang = 'ja'

    if lang == 'zh':
        raise NotImplementedError('jieba is not yet implemented')
    if lang == 'ja':
        raise NotImplementedError('mecab is not yet implemented')
    if romanize:
        raise NotImplementedError('romanize is not yet implemented')

    self.lower_case = lower_case
    self.romanize = romanize
    self.descape = descape

    self.normalizer = MosesPunctNormalizer(lang=lang)
    self.tokenizer = MosesTokenizer(lang=lang)
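# Hedged usage sketch, not part of the original class: shows how the normalizer and
# tokenizer created above are typically chained (normalize punctuation, tokenize,
# lowercase). The standalone function name and argument names are illustrative only.
def _demo_laser_style_tokenize(text: str, lang: str = 'en') -> str:
    from sacremoses import MosesPunctNormalizer, MosesTokenizer
    normalizer = MosesPunctNormalizer(lang=lang)
    tokenizer = MosesTokenizer(lang=lang)
    normalized = normalizer.normalize(text)
    # the assert above requires lower_case=True for these models
    return tokenizer.tokenize(normalized, return_str=True, escape=False).lower()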
def generate(corpus: Optional[str] = None, test: Optional[str] = None):
    moses = MosesTokenizer(lang='fr')
    pos_tagged = loadCorpusFromStr(corpus)
    grammar_str = FormatGrammarAsCFG(InductGrammar(pos_tagged))
    grammar = nltk.CFG.fromstring(grammar_str.split('\n'))
    parsed = []
    valid = []
    not_valide = []
    for s in test.split('$'):
        try:
            tagged_sent = [
                token[1] for token in tagger.tag(moses.tokenize(s, escape=False))
            ]
            parsed = parse(tagged_sent, grammar)
            if parsed is not None:
                valid.append((s, str(parsed)))
            else:
                not_valide.append(s)
        except Exception:
            not_valide.append(s)
    return {
        "grammar": grammar_str,
        "test_results": {
            "valide": valid,
            "not_valide": not_valide
        }
    }
def build_vocab(num):
    vocab_file = f"vocab_{num}.txt"
    vocab_list = list()
    tokenizer = MosesTokenizer(lang='en')
    i = 0
    for file in os.listdir("dataset/train/body"):
        filename = os.fsdecode(file)
        with open(f"dataset/train/body/{filename}", 'r', encoding='utf-8') as in_file:
            corpus_lines = in_file.readlines()
            # MosesTokenizer.tokenize expects a single string, so tokenize line by line
            corpus_lines = [" ".join(tokenizer.tokenize(line)) for line in corpus_lines]
            for line in corpus_lines:
                for word in line.split():
                    if word.lower() not in vocab_list:
                        vocab_list.append(word.lower())
                        i += 1
                    if i >= 1000:
                        break
    with open(vocab_file, 'w', encoding='utf-8') as out_file:
        for word in vocab_list:
            out_file.write(f"{word}\n")
def get_tokenized_review_list():
    mt = MosesTokenizer()
    reviews = get_reviews()[0]
    tokenized_list = [
        mt.tokenize(review_text, escape=False) for review_text in reviews
    ]
    return (tokenized_list, reviews[1])
def __init__(self, lang_id: str):
    self.lang_id = lang_id
    self.moses_tokenizer = MosesTokenizer(lang=lang_id)
    self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
    self.normalizer = MosesPunctNormalizer(lang=lang_id,
                                           pre_replace_unicode_punct=True,
                                           post_remove_control_chars=True)
def __init__(self,
             lang: str = 'en',
             lower_case: bool = True,
             romanize: Optional[bool] = None,
             descape: bool = False):
    assert lower_case, 'lower case is needed by all the models'

    if lang in ('cmn', 'wuu', 'yue'):
        lang = 'zh'
    if lang == 'jpn':
        lang = 'ja'

    if lang == 'zh' and jieba is None:
        raise ModuleNotFoundError(
            '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
        )
    if lang == 'ja' and MeCab is None:
        raise ModuleNotFoundError(
            '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
        )

    self.lang = lang
    self.lower_case = lower_case
    self.romanize = romanize if romanize is not None else lang == 'el'
    self.descape = descape

    self.normalizer = MosesPunctNormalizer(lang=lang)
    self.tokenizer = MosesTokenizer(lang=lang)
    self.mecab_tokenizer = MeCab.Tagger(
        "-O wakati -b 50000") if lang == 'ja' else None
def score(path_to_segmentation: str, path_to_reference: str) -> None:
    path_to_segmentation = Path(path_to_segmentation)
    path_to_reference = Path(path_to_reference)

    # init tokenizer and detokenizer
    mt, md = MosesTokenizer(lang="de"), MosesDetokenizer(lang="de")

    # extract the reference sentences from the xml file
    reference = []
    with open(path_to_reference, "r", encoding="utf-8") as f:
        for line in f.read().splitlines():
            if line[:4] == "<seg":
                reference.append(line.split(">", maxsplit=1)[1].split("</seg>")[0])

    scores = {}
    for path_to_segmentation_file_i in path_to_segmentation.glob("*.xml"):
        # extract generated translations from the xml file
        segm_translation = load_segm_file(path_to_segmentation_file_i)

        # detokenize (have to tokenize first with the python implementation of Moses)
        segm_translation = [md.detokenize(mt.tokenize(s)) for s in segm_translation]
        assert len(reference) == len(segm_translation)

        # get bleu score
        bleu = sacrebleu.corpus_bleu(segm_translation, [reference])
        scores[path_to_segmentation_file_i.name] = bleu.score

    for n, s in scores.items():
        print(f"{n}: {s} BLEU")
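# Minimal self-contained sketch of the sacrebleu call used above, assuming sacrebleu
# is installed; the example sentences are made up. Hypotheses are plain detokenized
# strings, and the references are passed as a list of reference streams.
def _demo_corpus_bleu():
    import sacrebleu
    hypotheses = ["Das ist ein Test.", "Noch ein Satz."]
    references = ["Das ist ein Test.", "Noch ein Beispielsatz."]
    # note the extra list: corpus_bleu expects one list per reference set
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    print(f"{bleu.score:.2f} BLEU")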
def _moses_tokenize(text, lang):
    """
    Tokenize a given string using the Moses tokenizer.

    Tokenization: https://github.com/alvations/sacremoses
    """
    from sacremoses import MosesTokenizer
    mt = MosesTokenizer(lang)
    return [string_unescape(t) for t in mt.tokenize(text)]
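# The helper `string_unescape` above is defined elsewhere; a plausible reading is that
# it reverses the XML-style escaping sacremoses applies by default (&amp;, &apos;,
# &quot;, ...). Hedged sketch of that assumption; the example string is made up.
def _demo_moses_escaping(text="Don't \"quote\" me & others"):
    from sacremoses import MosesTokenizer
    mt = MosesTokenizer(lang='en')
    escaped = mt.tokenize(text)                  # e.g. contains '&apos;t', '&quot;', '&amp;'
    unescaped = mt.tokenize(text, escape=False)  # same tokens without escaping
    return escaped, unescaped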
def clean(
        l1="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples",
        l2="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples"):
    en_tok = MT(lang='en')
    it_tok = MT(lang='it')
    with open(l1, "r", encoding="utf-8") as en, open(l2, "r", encoding="utf-8") as it:
        en_text = en.readlines()
        it_text = it.readlines()
    with open("STOCAZZO.en", "w+", encoding="utf-8") as cl_en, \
            open("DAJE.it", "w+", encoding="utf-8") as cl_it:
        c = 0
        for line_en, line_it in zip(en_text, it_text):
            # undo the HTML-entity escaping applied by the Moses tokenizer
            line_en = " ".join(en_tok.tokenize(line_en)).lower().replace(
                "&apos;", "'").replace("&quot;", '"')
            line_it = " ".join(it_tok.tokenize(line_it)).lower().replace(
                "&apos;", "'").replace("&quot;", '"')
            cl_en.write(line_en + "\n")
            cl_it.write(line_it + "\n")
            c += 1
            if c % 500 == 0:
                print("Processed {} sentences".format(c))
def __init__(self, args):
    self.args = args
    try:
        from sacremoses import MosesTokenizer, MosesDetokenizer
        self.tok = MosesTokenizer(args.moses_source_lang)
        self.detok = MosesDetokenizer(args.moses_target_lang)
    except ImportError:
        raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
def tokenize_captions(captions, lang='en'):
    """Tokenizes captions list with Moses tokenizer."""
    tokenizer = MosesTokenizer(lang=lang)
    return [
        tokenizer.tokenize(caption, return_str=True) for caption in captions
    ]
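# Hedged usage example for tokenize_captions above; the caption strings are made up.
def _demo_tokenize_captions():
    captions = ["A dog runs on the beach.", "Two people are riding bikes."]
    # with return_str=True each caption comes back as a single space-joined string,
    # e.g. 'A dog runs on the beach .'
    return tokenize_captions(captions, lang='en')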
def __init__(self, lang='en', custom_nonbreaking_prefixes_file=None):
    try:
        from sacremoses import MosesTokenizer
    except ImportError:
        raise ImportError('Please install package `sacremoses`')
    self.tokenizer = MosesTokenizer(
        lang=lang,
        custom_nonbreaking_prefixes_file=custom_nonbreaking_prefixes_file
    )
def __init__(self):
    try:
        from sacremoses import MosesTokenizer
        self._tokenizer = MosesTokenizer()
    except (ImportError, TypeError) as err:
        print('sacremoses is not installed. '
              'To install sacremoses, use pip install -U sacremoses '
              'Now try NLTKMosesTokenizer using NLTK ...')
        raise
def __init__(self, command=None, l="en"):
    if command:
        self.tokenizer = ToolWrapper(command.split(' '))
        self.external = True
        self.spm = command.find('spm_encode') > -1
    else:
        self.tokenizer = MosesTokenizer(lang=l)
        self.external = False
        self.spm = False
def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    tokenizer = MosesTokenizer()
    lines = txt.split('\n')
    t = [tokenizer.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    else:
        return t
def __init__(self, mode):
    self.mode = mode
    if self.mode == 'moses':
        self.tokenizer = MosesTokenizer()
    elif self.mode == 'bert-base-cased':
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-cased', do_lower_case=False)
def __init__(self, filename, genia, gen_features, lowercase, replace_digits, to_filter):
    self.filename = filename
    self.basename = os.path.basename(filename)
    self.protocol_name = self.basename
    self.text_file = self.filename + '.txt'
    self.ann_file = self.filename + '.ann'
    with io.open(self.text_file, 'r', encoding='utf-8', newline='') as t_f, \
            io.open(self.ann_file, 'r', encoding='utf-8', newline='') as a_f:
        self.tokenizer = MosesTokenizer()
        self.lines = []
        for line in t_f.readlines():
            self.lines.append(html.unescape(line))
        self.text = "".join(self.lines)  # full text
        self.ann = a_f.readlines()
        self.status = self.__pretest()
        # self.status = True
        self.links = []
        if self.status:
            # generate list of list of words
            sents = [self.tokenizer.tokenize(line) for line in self.lines]
            self.heading = sents[0]
            self.sents = sents[1:]
            self.tags = self.__parse_tags()
            self.unique_tags = set([tag.tag_name for tag in self.tags])
            self.__std_index()
            self.__parse_links()
            self.tag_0_id = 'T0'
            self.tag_0_name = 'O'
            self.tokens2d = self.gen_tokens(labels_allowed=cfg.LABELS,
                                            lowercase=lowercase,
                                            replace_digits=replace_digits)
            self.tokens2d = [[self.clean_html_tag(token) for token in token1d]
                             for token1d in self.tokens2d]
            self.word_cnt = sum(len(tokens1d) for tokens1d in self.tokens2d)
            self.f_df = None
            if gen_features:
                if genia:
                    self.pos_tags = self.__gen_pos_genia(genia)
                else:
                    self.pos_tags = self.__gen_pos_stanford()
                self.conll_deps = self.__gen_dep()
                self.parse_trees = self.__gen_parse_trees()
            if to_filter:
                self.filter()
            self.relations = self.gen_relations()
def preprocess(source_lang, tcmodel, escape):
    mtok = MosesTokenizer(lang=source_lang)
    mtr = MosesTruecaser(tcmodel)
    sys.stderr.write("model loaded\n")
    for line in sys.stdin:
        tokenized = mtok.tokenize(line, escape=escape)
        truecased = mtr.truecase(" ".join(tokenized))
        sys.stderr.write("sentence processed\n")
        sys.stdout.buffer.write((" ".join(truecased) + "\n").encode("utf-8"))
        sys.stdout.flush()
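# Self-contained sketch of the tokenize + truecase pipeline above, assuming a trained
# truecase model file is available; "truecase-model.en" is a placeholder path.
def _demo_truecase(line="THIS IS AN EXAMPLE SENTENCE."):
    from sacremoses import MosesTokenizer, MosesTruecaser
    mtok = MosesTokenizer(lang='en')
    mtr = MosesTruecaser("truecase-model.en")  # placeholder model path
    tokens = mtok.tokenize(line, escape=False)
    # truecase() returns a list of tokens by default, hence the join
    return " ".join(mtr.truecase(" ".join(tokens)))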
def enable_moses(self, lang='en', tokenize=True, detokenize=True):
    if tokenize:
        self._moses_tok = MosesTokenizer(lang=lang)
    else:
        self._moses_tok = None
    if detokenize:
        self._moses_detok = MosesDetokenizer(lang=lang)
    else:
        self._moses_detok = None
def get_moses_tokenizer(lang):
    try:
        moses_tokenizer = MosesTokenizer(lang=lang)
    except Exception:
        print("WARNING: Moses doesn't have tokenizer for", lang)
        moses_tokenizer = MosesTokenizer(lang='en')
    # string IN -> string OUT
    tokenizer = lambda x: moses_tokenizer.tokenize(x, return_str=True)
    return tokenizer
def __init__(self, cfg: MosesTokenizerConfig):
    self.cfg = cfg
    try:
        from sacremoses import MosesTokenizer, MosesDetokenizer
        self.tok = MosesTokenizer(cfg.source_lang)
        self.detok = MosesDetokenizer(cfg.target_lang)
    except ImportError:
        raise ImportError(
            "Please install Moses tokenizer with: pip install sacremoses")
def main():
    tic = time.time()
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG)
    logging.debug(args)

    if args.tokenize:
        tokenizer = MosesTokenizer(lang=args.lang)

    lines = sys.stdin.readlines()
    all_tokens = []
    for line in lines:
        if args.tokenize:
            t = tokenizer.tokenize(line)
        else:
            t = line.split()
        all_tokens.append(t)

    flat_tokens = chain.from_iterable(all_tokens)
    counter = Counter(flat_tokens)
    # try to free up memory early
    del flat_tokens

    logging.debug("Vocabulary size before/after/max_allowed = %d/%d/%d" %
                  (len(counter.keys()),
                   min(args.vocab_size, len(counter.keys())),
                   args.vocab_size))

    vocabulary = [
        token for token, frequency in counter.most_common(args.vocab_size)
    ]

    for tokens in all_tokens:
        output_tokens = []
        for token in tokens:
            if token in vocabulary:
                output_tokens.append(token)
            else:
                output_tokens.append(args.unk_string)
        output_string = " ".join(output_tokens)
        sys.stdout.write(output_string + "\n")

    toc = time.time() - tic
    logging.debug("Time taken: %f seconds" % toc)
def __init__(self, bpe_codes_file: str, lang_src: str = 'en', lang_trg: str = 'de', separator='@@'):
    self.moses_tokenizer = MosesTokenizer(lang=lang_src)
    self.moses_detokenizer = MosesDetokenizer(lang=lang_trg)
    self.bpe_tokenizer = BPE(codes=codecs.open(bpe_codes_file, encoding='utf-8'),
                             merges=-1,
                             separator=separator,
                             vocab=None,
                             glossaries=None)
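# Hedged sketch of how the components initialized above are usually combined for NMT
# preprocessing, assuming the subword_nmt BPE class; "bpe.codes" is a placeholder for
# the learned merge file and the function name is illustrative only.
def _demo_bpe_preprocess(sentence="The weather is nice today."):
    import codecs
    from sacremoses import MosesTokenizer
    from subword_nmt.apply_bpe import BPE
    tokenizer = MosesTokenizer(lang='en')
    bpe = BPE(codes=codecs.open("bpe.codes", encoding='utf-8'), separator='@@')
    tokenized = tokenizer.tokenize(sentence, return_str=True)
    # segment() applies the learned merges to a whitespace-tokenized string
    return bpe.segment(tokenized)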
def _test_tokenize(self, test_file, language='en'):
    """
    Compares MosesTokenizer's output to the output of the original Perl script.
    """
    tokenizer = MosesTokenizer(lang=language)
    # Tokenize test file with original Perl script and given flags
    path_gold = self._create_gold(test_file, language)
    # Compare to output of original Perl script
    with open(test_file, encoding='utf-8') as u, open(path_gold, encoding='utf-8') as g:
        for text, gold in zip(u, g):
            tokenized = tokenizer.tokenize(text, return_str=True)
            self.assertEqual(tokenized.rstrip(), gold.rstrip())
def __init__(self, exp):
    self.exp = exp
    self.tokr = MosesTokenizer()
    self.detokr = MosesDetokenizer()
    self.punct_normr = MosesPunctNormalizer()
    # self.true_caser = MosesTruecaser()
    self.punct_normalize = True
    self.tokenize = True
    self.html_unesc = True
    self.drop_unks = True
    # self.truecase = True
    self.detokenize = True
def morph(self, premise, hypothesis, label_text, constrain_pos=True, conservative=False):
    assert label_text in self.labels
    label = self.labels.index(label_text)

    orig_prem_tokenized = MosesTokenizer(lang='en').tokenize(premise)
    orig_hypo_tokenized = MosesTokenizer(lang='en').tokenize(hypothesis)

    prem_pos_tagged = [
        (tagged[0], '.') if '&' in tagged[0] else tagged
        for tagged in nltk.pos_tag(orig_prem_tokenized, tagset='universal')
    ]
    hypo_pos_tagged = [
        (tagged[0], '.') if '&' in tagged[0] else tagged
        for tagged in nltk.pos_tag(orig_hypo_tokenized, tagset='universal')
    ]

    prem_token_inflections = super().get_inflections(
        orig_prem_tokenized, prem_pos_tagged, constrain_pos)
    hypo_token_inflections = super().get_inflections(
        orig_hypo_tokenized, hypo_pos_tagged, constrain_pos)

    original_loss, init_predicted = self.get_loss(premise, hypothesis, label)
    if init_predicted != label:
        return premise, hypothesis, label_text, 1

    forward_prem_perturbed, forward_hypo_perturbed, forward_loss, forward_predicted, num_queries_forward = self.search_nli(
        prem_token_inflections, hypo_token_inflections, orig_prem_tokenized,
        orig_hypo_tokenized, original_loss, label, conservative)
    if conservative and forward_predicted != label:
        return forward_prem_perturbed, forward_hypo_perturbed, self.labels[
            forward_predicted], num_queries_forward + 1

    backward_prem_perturbed, backward_hypo_perturbed, backward_loss, backward_predicted, num_queries_backward = self.search_nli(
        prem_token_inflections, hypo_token_inflections, orig_prem_tokenized,
        orig_hypo_tokenized, original_loss, label, conservative)

    num_queries = 1 + num_queries_forward + num_queries_backward
    if forward_loss > backward_loss:
        return forward_prem_perturbed, forward_hypo_perturbed, self.labels[
            forward_predicted], num_queries
    else:
        return backward_prem_perturbed, backward_hypo_perturbed, self.labels[
            backward_predicted], num_queries
def __init__(self, args):
    self.args = args
    if getattr(args, 'moses_source_lang', None) is None:
        args.moses_source_lang = getattr(args, 'source_lang', 'en')
    if getattr(args, 'moses_target_lang', None) is None:
        args.moses_target_lang = getattr(args, 'target_lang', 'en')
    try:
        from sacremoses import MosesTokenizer, MosesDetokenizer
        self.tok = MosesTokenizer(args.moses_source_lang)
        self.detok = MosesDetokenizer(args.moses_target_lang)
    except ImportError:
        raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
def __init__(self, pretokenizer='moses'):
    self.tagger = PerceptronTagger()
    self.pretok_type = pretokenizer
    if pretokenizer == 'bertpretokenizer':
        self.pretokenizer = BertPreTokenizer()
    elif pretokenizer == 'moses':
        self.pretokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
    elif pretokenizer == 'whitespace':
        pass
    else:
        raise ValueError(
            "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
        )
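# Hedged usage sketch for the 'moses' branch above: pretokenize with Moses, tag with
# NLTK's averaged-perceptron tagger, then detokenize. Standalone and illustrative only;
# assumes the NLTK tagger model has been downloaded.
def _demo_moses_pos_tagging(text="The quick brown fox jumps over the lazy dog."):
    from nltk.tag.perceptron import PerceptronTagger
    from sacremoses import MosesTokenizer, MosesDetokenizer
    tagger = PerceptronTagger()
    tokens = MosesTokenizer().tokenize(text, escape=False)
    tagged = tagger.tag(tokens)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]
    return tagged, MosesDetokenizer().detokenize(tokens)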
def translate(self, text: List[str], source_lang: str = None, target_lang: str = None) -> List[str]:
    """
    Translates a list of sentences from the source language to the target language.
    Expects regular text; this method performs its own tokenization/de-tokenization.

    Args:
        text: list of strings to translate
        source_lang: if not None, the corresponding MosesTokenizer and MosesPunctNormalizer will be run
        target_lang: if not None, the corresponding MosesDetokenizer will be run

    Returns:
        list of translated strings
    """
    mode = self.training
    if source_lang != "None":
        tokenizer = MosesTokenizer(lang=source_lang)
        normalizer = MosesPunctNormalizer(lang=source_lang)
    if target_lang != "None":
        detokenizer = MosesDetokenizer(lang=target_lang)
    try:
        self.eval()
        res = []
        for txt in text:
            if source_lang != "None":
                txt = normalizer.normalize(txt)
                txt = tokenizer.tokenize(txt, escape=False, return_str=True)
            ids = self.encoder_tokenizer.text_to_ids(txt)
            ids = [self.encoder_tokenizer.bos_id] + ids + [self.encoder_tokenizer.eos_id]
            src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
            src_mask = torch.ones_like(src)
            src_hiddens = self.encoder(input_ids=src, encoder_mask=src_mask)
            beam_results = self.beam_search(
                encoder_hidden_states=src_hiddens, encoder_input_mask=src_mask)
            beam_results = self.filter_predicted_ids(beam_results)
            translation_ids = beam_results.cpu()[0].numpy()
            translation = self.decoder_tokenizer.ids_to_text(translation_ids)
            if target_lang != "None":
                translation = detokenizer.detokenize(translation.split())
            res.append(translation)
    finally:
        self.train(mode=mode)
    return res
def __init__(self, device, cache_dir, state):
    # tokenize sents
    self.tokenizer = MosesTokenizer()
    self.preprocess = lambda sent: self.tokenizer.tokenize(sent.lower(), escape=False)
    self.elmo = ElmoEmbedder(
        options_file=os.path.join(
            cache_dir, 'elmo_2x4096_512_2048cnn_2xhighway_options.json'),
        weight_file=os.path.join(
            cache_dir, 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'),
        cuda_device=0 if device.type == 'cuda' else -1)
    self.device = device
    self.state = RandomState(state)
    self.name = 'ELMo'
    self.is_unk = lambda tok_id: False