Example #1
    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: bool = False,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh':
            raise NotImplementedError('jieba is not yet implemented')
        if lang == 'ja':
            raise NotImplementedError('mecab is not yet implemented')
        if romanize:
            raise NotImplementedError('romanize is not yet implemented')

        self.lower_case = lower_case
        self.romanize = romanize
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
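The constructor above only wires up the Moses components; the tokenization step itself is not part of this excerpt. A minimal standalone sketch of what that step usually looks like with these objects (the function below is an assumption for illustration, not the original class method):

from sacremoses import MosesPunctNormalizer, MosesTokenizer

normalizer = MosesPunctNormalizer(lang='en')
tokenizer = MosesTokenizer(lang='en')

def tokenize(text: str, lower_case: bool = True) -> str:
    # normalize punctuation first, then run the Moses tokenizer
    text = normalizer.normalize(text)
    tokenized = tokenizer.tokenize(text, return_str=True, escape=False)
    return tokenized.lower() if lower_case else tokenized

print(tokenize('Hello, "world"!'))  # hello , " world " !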
def generate(corpus: Optional[str] = None, test: Optional[str] = None):
    moses = MosesTokenizer(lang='fr')

    pos_tagged = loadCorpusFromStr(corpus)
    grammar_str = FormatGrammarAsCFG(InductGrammar(pos_tagged))

    grammar = nltk.CFG.fromstring(grammar_str.split('\n'))

    parsed = []
    valid = []
    not_valide = []

    for s in test.split('$'):
        try:
            tagged_sent = [
                token[1]
                for token in tagger.tag(moses.tokenize(s, escape=False))
            ]
            parsed = parse(tagged_sent, grammar)
            if parsed is not None:
                valid.append((s, str(parsed)))
            else:
                not_valide.append(s)
        except Exception:
            not_valide.append(s)

    return {
        "grammar": grammar_str,
        "test_results": {
            "valide": valid,
            "not_valide": not_valide
        }
    }
def build_vocab(num):
    vocab_file = f"vocab_{num}.txt"
    vocab_list = list()
    tokenizer = MosesTokenizer(lang='en')

    i = 0
    for file in os.listdir("dataset/train/body"):
        filename = os.fsdecode(file)

        with open(f"dataset/train/body/{filename}", 'r',
                  encoding='utf-8') as in_file:
            corpus_lines = in_file.readlines()

            # tokenize() expects a single string, so tokenize each line separately
            for line in corpus_lines:
                for word in tokenizer.tokenize(line):
                    if word.lower() not in vocab_list:
                        vocab_list.append(word.lower())

        i += 1
        if i >= 1000:
            break

    with open(vocab_file, 'w', encoding='utf-8') as out_file:
        for word in vocab_list:
            out_file.write(f"{word}\n")
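Because the check word.lower() not in vocab_list scans the whole list, the loop above gets slow on large corpora. A sketch of an equivalent set-backed variant with the same first-seen output order (assuming the same English Moses tokenizer):

from sacremoses import MosesTokenizer

def build_vocab_from_lines(lines):
    tokenizer = MosesTokenizer(lang='en')
    seen = set()   # O(1) membership checks
    vocab = []     # preserves first-seen order for the output file
    for line in lines:
        for word in tokenizer.tokenize(line):
            w = word.lower()
            if w not in seen:
                seen.add(w)
                vocab.append(w)
    return vocab

print(build_vocab_from_lines(["Hello world!", "Hello again."]))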
Example #4
def get_tokenized_review_list():
    mt = MosesTokenizer()
    reviews = get_reviews()[0]
    tokenized_list = [
        mt.tokenize(review_text, escape=False) for review_text in reviews
    ]
    return (tokenized_list, reviews[1])
Example #5
 def __init__(self, lang_id: str):
     self.lang_id = lang_id
     self.moses_tokenizer = MosesTokenizer(lang=lang_id)
     self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
     self.normalizer = MosesPunctNormalizer(lang=lang_id,
                                            pre_replace_unicode_punct=True,
                                            post_remove_control_chars=True)
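For reference, a minimal round trip through the three sacremoses helpers configured above, outside the surrounding class (a sketch):

from sacremoses import MosesTokenizer, MosesDetokenizer, MosesPunctNormalizer

normalizer = MosesPunctNormalizer(lang='en',
                                  pre_replace_unicode_punct=True,
                                  post_remove_control_chars=True)
tokenizer = MosesTokenizer(lang='en')
detokenizer = MosesDetokenizer(lang='en')

text = "It\u2019s a \u201ctest\u201d, isn\u2019t it?"
normalized = normalizer.normalize(text)            # unify unicode punctuation
tokens = tokenizer.tokenize(normalized, escape=False)
print(tokens)
print(detokenizer.detokenize(tokens))              # roughly reconstructs the sentence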
Example #6
    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
        self.mecab_tokenizer = MeCab.Tagger(
            "-O wakati -b 50000") if lang == 'ja' else None
Example #7
def score(path_to_segmentation: str, path_to_reference: str) -> None:

    path_to_segmentation = Path(path_to_segmentation)
    path_to_reference = Path(path_to_reference)

    # init tokenizer and detokenizer
    mt, md = MosesTokenizer(lang = "de"), MosesDetokenizer(lang = "de")

    # extract the reference sentences from the xml file
    reference = []
    with open(path_to_reference, "r", encoding = "utf-8") as f:
        for line in f.read().splitlines():
            if line[:4] == "<seg":
                reference.append(line.split(">", maxsplit = 1)[1].split("</seg>")[0])

    scores = {}
    for path_to_segmentation_file_i in path_to_segmentation.glob("*.xml"):

        # extract generated translations from the xml file
        segm_translation = load_segm_file(path_to_segmentation_file_i)

        # detokenize (have to tokenize first with the python implementation of Moses)
        segm_translation = [md.detokenize(mt.tokenize(s)) for s in segm_translation]

        assert len(reference) == len(segm_translation)

        # get bleu score
        bleu = sacrebleu.corpus_bleu(segm_translation, [reference])
        scores[path_to_segmentation_file_i.name] = bleu.score

    for n, s in scores.items():
        print(f"{n}: {s} BLEU")
Example #8
def _moses_tokenize(text, lang):
    """ Tokenize a given string using moses tokenizer
    Tokenization: https://github.com/alvations/sacremoses
    """
    from sacremoses import MosesTokenizer
    mt = MosesTokenizer(lang)
    return [string_unescape(t) for t in mt.tokenize(text)]
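The unescaping step is needed because MosesTokenizer escapes HTML-sensitive characters by default; passing escape=False avoids it altogether (a sketch, not the original project's helper):

from sacremoses import MosesTokenizer

mt = MosesTokenizer(lang='en')
# with escape=False, characters such as & and " are kept as-is
# instead of being emitted as &amp; / &quot;
print(mt.tokenize('Tom & Jerry say "hi"'))                # escaped entities
print(mt.tokenize('Tom & Jerry say "hi"', escape=False))  # plain characters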
Example #9
def clean(
    l1="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples",
    l2="C:/Users/azaninello/Desktop/experiments/nuovo/exp1/zing_phrases_examples"
):
    en_tok = MT(lang='en')
    it_tok = MT(lang='it')
    with open(l1, "r", encoding="utf-8") as en, open(l2, "r",
                                                     encoding="utf-8") as it:
        en_text = en.readlines()
        it_text = it.readlines()
    with open("STOCAZZO.en", "w+",
              encoding="utf-8") as cl_en, open("DAJE.it",
                                               "w+",
                                               encoding="utf-8") as cl_it:
        c = 0
        for line_en, line_it in zip(en_text, it_text):
            line_en = " ".join(en_tok.tokenize(line_en)).lower().replace(
                "&apos;", "'").replace("&quot;", '"')
            line_it = " ".join(it_tok.tokenize(line_it)).lower().replace(
                "&apos;", "'").replace("&quot;", '"')
            cl_en.write(line_en + "\n")
            cl_it.write(line_it + "\n")
            c += 1
            if c % 500 == 0:
                print("Processed {} sentences".format(c))
Example #10
 def __init__(self, args):
     self.args = args
     try:
         from sacremoses import MosesTokenizer, MosesDetokenizer
         self.tok = MosesTokenizer(args.moses_source_lang)
         self.detok = MosesDetokenizer(args.moses_target_lang)
     except ImportError:
         raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
def tokenize_captions(captions, lang='en'):
    """Tokenizes captions list with Moses tokenizer.
    """

    tokenizer = MosesTokenizer(lang=lang)
    return [
        tokenizer.tokenize(caption, return_str=True) for caption in captions
    ]
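A quick usage check of the helper above (the caption strings are made up, and the excerpt assumes MosesTokenizer has been imported from sacremoses):

captions = ["A dog runs on the beach.", "Two people ride bikes, smiling."]
print(tokenize_captions(captions))
# ['A dog runs on the beach .', 'Two people ride bikes , smiling .']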
Example #12
 def __init__(self, lang='en', custom_nonbreaking_prefixes_file=None):
     try:
         from sacremoses import MosesTokenizer
     except ImportError:
         raise ImportError('Please install package `sacremoses`')
     self.tokenizer = MosesTokenizer(
         lang=lang,
         custom_nonbreaking_prefixes_file=custom_nonbreaking_prefixes_file
     )
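custom_nonbreaking_prefixes_file points at a plain-text list of abbreviations, one per line, whose trailing period should stay attached to the word instead of being split off. A hedged sketch of how it might be used (the file name and its contents are hypothetical):

from sacremoses import MosesTokenizer

# my_prefixes.txt is a hypothetical file containing e.g. the lines "approx" and "dept"
tok = MosesTokenizer(lang='en',
                     custom_nonbreaking_prefixes_file='my_prefixes.txt')
# with those prefixes, "dept." and "approx." keep their trailing period
print(tok.tokenize("The dept. budget is approx. correct.", escape=False))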
Example #13
 def __init__(self):
     try:
         from sacremoses import MosesTokenizer
         self._tokenizer = MosesTokenizer()
     except (ImportError, TypeError) as err:
         print('sacremoses is not installed. '
               'To install sacremoses, use pip install -U sacremoses'
               ' Now try NLTKMosesTokenizer using NLTK ...')
         raise
Example #14
 def __init__(self, command=None, l="en"):
     if command:
         self.tokenizer = ToolWrapper(command.split(' '))
         self.external = True
         self.spm = command.find('spm_encode') > -1
     else:
         self.tokenizer = MosesTokenizer(lang=l)
         self.external = False
         self.spm = False
Example #15
 def tokenize(txt, to_lower=False):
     assert isinstance(txt, str)
     tokenizer = MosesTokenizer()
     lines = txt.split('\n')
     t = [tokenizer.tokenize(line) for line in lines]
     if to_lower:
         return [[word.lower() for word in line] for line in t]
     else:
         return t
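Constructing a MosesTokenizer on every call repeats its setup work (loading the language's nonbreaking-prefix data), so a reused module-level instance is usually preferable; a sketch of the same helper under that change (behavior otherwise assumed unchanged):

from sacremoses import MosesTokenizer

_TOKENIZER = MosesTokenizer(lang='en')

def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    lines = txt.split('\n')
    t = [_TOKENIZER.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    return t

print(tokenize("Hello there!\nBye now.", to_lower=True))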
Example #16
    def __init__(self, mode):
        self.mode = mode

        if self.mode == 'moses':
            self.tokenizer = MosesTokenizer()

        elif self.mode == 'bert-base-cased':
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased', do_lower_case=False)
Example #17
    def __init__(self, filename, genia, gen_features, lowercase,
                 replace_digits, to_filter):
        self.filename = filename
        self.basename = os.path.basename(filename)
        self.protocol_name = self.basename
        self.text_file = self.filename + '.txt'
        self.ann_file = self.filename + '.ann'

        with io.open(self.text_file, 'r', encoding='utf-8',
                     newline='') as t_f, io.open(self.ann_file,
                                                 'r',
                                                 encoding='utf-8',
                                                 newline='') as a_f:
            self.tokenizer = MosesTokenizer()
            self.lines = []
            for line in t_f.readlines():
                self.lines.append(html.unescape(line))

            self.text = "".join(self.lines)  # full text
            self.ann = a_f.readlines()
            self.status = self.__pretest()
            # self.status=True
            self.links = []

        if self.status:
            sents = [self.tokenizer.tokenize(line)
                     for line in self.lines]  # generate list of list of words
            self.heading = sents[0]
            self.sents = sents[1:]
            self.tags = self.__parse_tags()
            self.unique_tags = set([tag.tag_name for tag in self.tags])
            self.__std_index()
            self.__parse_links()
            self.tag_0_id = 'T0'
            self.tag_0_name = 'O'
            self.tokens2d = self.gen_tokens(labels_allowed=cfg.LABELS,
                                            lowercase=lowercase,
                                            replace_digits=replace_digits)
            self.tokens2d = [[self.clean_html_tag(token) for token in token1d]
                             for token1d in self.tokens2d]

            self.word_cnt = sum(len(tokens1d) for tokens1d in self.tokens2d)
            self.f_df = None
            if gen_features:
                if genia:
                    self.pos_tags = self.__gen_pos_genia(genia)
                else:
                    self.pos_tags = self.__gen_pos_stanford()

                self.conll_deps = self.__gen_dep()
                self.parse_trees = self.__gen_parse_trees()

            if to_filter:
                self.filter()

            self.relations = self.gen_relations()
Example #18
File: process.py Project: av1611/OPUS-CAT
def preprocess(source_lang,tcmodel,escape):
	mtok = MosesTokenizer(lang=source_lang)
	mtr = MosesTruecaser(tcmodel)
	sys.stderr.write("model loaded\n")
	for line in sys.stdin:
		tokenized = mtok.tokenize(line,escape=escape)
		truecased = mtr.truecase(" ".join(tokenized))
		sys.stderr.write("sentence processed\n")
		sys.stdout.buffer.write((" ".join(truecased) + "\n").encode("utf-8"))
		sys.stdout.flush()
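MosesTruecaser.truecase() returns a list of tokens by default, which is why the result is joined before writing. A standalone sketch of the same tokenize-then-truecase pipeline (the truecasing model path is hypothetical):

from sacremoses import MosesTokenizer, MosesTruecaser

mtok = MosesTokenizer(lang="en")
mtr = MosesTruecaser("model.truecase")   # hypothetical path to a trained truecasing model

line = "the european union met on tuesday."
tokens = mtok.tokenize(line, escape=False)
truecased = mtr.truecase(" ".join(tokens))   # list of truecased tokens
print(" ".join(truecased))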
Example #19
    def enable_moses(self, lang='en', tokenize=True, detokenize=True):
        if tokenize:
            self._moses_tok = MosesTokenizer(lang=lang)
        else:
            self._moses_tok = None

        if detokenize:
            self._moses_detok = MosesDetokenizer(lang=lang)
        else:
            self._moses_detok = None
Example #20
def get_moses_tokenizer(lang):
    try:
        moses_tokenizer = MosesTokenizer(lang=lang)
    except Exception:
        print("WARNING: Moses doesn't have tokenizer for", lang)
        moses_tokenizer = MosesTokenizer(lang='en')

    # string in -> string out
    tokenizer = lambda x: moses_tokenizer.tokenize(x, return_str=True)
    return tokenizer
Example #21
    def __init__(self, cfg: MosesTokenizerConfig):
        self.cfg = cfg

        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer

            self.tok = MosesTokenizer(cfg.source_lang)
            self.detok = MosesDetokenizer(cfg.target_lang)
        except ImportError:
            raise ImportError(
                "Please install Moses tokenizer with: pip install sacremoses")
Example #22
def main():

    tic = time.time()

    args = parse_args()

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(args)

    if args.tokenize:
        tokenizer = MosesTokenizer(lang=args.lang)

    lines = sys.stdin.readlines()

    all_tokens = []

    for line in lines:
        if args.tokenize:
            t = tokenizer.tokenize(line)
        else:
            t = line.split()
        all_tokens.append(t)

    flat_tokens = chain.from_iterable(all_tokens)

    counter = Counter(flat_tokens)

    # try to free up memory early

    del flat_tokens

    logging.debug("Vocabulary size before/after/max_allowed = %d/%d/%d" %
                  (len(counter.keys()),
                   min(args.vocab_size, len(counter.keys())), args.vocab_size))

    # a set gives O(1) membership checks in the loop below
    vocabulary = {
        token for token, frequency in counter.most_common(args.vocab_size)
    }

    for tokens in all_tokens:
        output_tokens = []
        for token in tokens:
            if token in vocabulary:
                output_tokens.append(token)
            else:
                output_tokens.append(args.unk_string)

        output_string = " ".join(output_tokens)
        sys.stdout.write(output_string + "\n")

    toc = time.time() - tic

    logging.debug("Time taken: %f seconds" % toc)
Example #23
File: utils.py Project: houj04/PaddleHub
 def __init__(self,
              bpe_codes_file: str,
              lang_src: str = 'en',
              lang_trg: str = 'de',
              separator='@@'):
     self.moses_tokenizer = MosesTokenizer(lang=lang_src)
     self.moses_detokenizer = MosesDetokenizer(lang=lang_trg)
     self.bpe_tokenizer = BPE(codes=codecs.open(bpe_codes_file,
                                                encoding='utf-8'),
                              merges=-1,
                              separator=separator,
                              vocab=None,
                              glossaries=None)
 def _test_tokenize(self, test_file, language='en'):
     """
     Compares MosesTokenizer's output to the output of the
     original Perl script.
     """
     tokenizer = MosesTokenizer(lang=language)
     # Tokenize the test file with the original Perl script and given flags
     path_gold = self._create_gold(test_file, language)
     # Compare to output of original Perl script
     with open(test_file, encoding='utf-8') as u, open(path_gold, encoding='utf-8') as g:
         for text, gold in zip(u, g):
             tokenized = tokenizer.tokenize(text, return_str=True)
             self.assertEqual(tokenized.rstrip(), gold.rstrip())
Example #25
    def __init__(self, exp):
        self.exp = exp
        self.tokr = MosesTokenizer()
        self.detokr = MosesDetokenizer()
        self.punct_normr = MosesPunctNormalizer()
        #self.true_caser = MosesTruecaser()

        self.punct_normalize = True
        self.tokenize = True
        self.html_unesc = True
        self.drop_unks = True
        #self.truecase = True
        self.detokenize = True
Example #26
    def morph(self,
              premise,
              hypothesis,
              label_text,
              constrain_pos=True,
              conservative=False):
        assert label_text in self.labels
        label = self.labels.index(label_text)
        orig_prem_tokenized = MosesTokenizer(lang='en').tokenize(premise)
        orig_hypo_tokenized = MosesTokenizer(lang='en').tokenize(hypothesis)

        prem_pos_tagged = [
            (tagged[0], '.') if '&' in tagged[0] else tagged
            for tagged in nltk.pos_tag(orig_prem_tokenized, tagset='universal')
        ]
        hypo_pos_tagged = [
            (tagged[0], '.') if '&' in tagged[0] else tagged
            for tagged in nltk.pos_tag(orig_hypo_tokenized, tagset='universal')
        ]

        prem_token_inflections = super().get_inflections(
            orig_prem_tokenized, prem_pos_tagged, constrain_pos)
        hypo_token_inflections = super().get_inflections(
            orig_hypo_tokenized, hypo_pos_tagged, constrain_pos)

        original_loss, init_predicted = self.get_loss(premise, hypothesis,
                                                      label)
        if init_predicted != label:
            return premise, hypothesis, label_text, 1

        forward_prem_perturbed, forward_hypo_perturbed, forward_loss, forward_predicted, num_queries_forward = self.search_nli(
            prem_token_inflections, hypo_token_inflections,
            orig_prem_tokenized, orig_hypo_tokenized, original_loss, label,
            conservative)

        if conservative and forward_predicted != label:
            return forward_prem_perturbed, forward_hypo_perturbed, self.labels[
                forward_predicted], num_queries_forward + 1

        backward_prem_perturbed, backward_hypo_perturbed, backward_loss, backward_predicted, num_queries_backward = self.search_nli(
            prem_token_inflections, hypo_token_inflections,
            orig_prem_tokenized, orig_hypo_tokenized, original_loss, label,
            conservative)

        num_queries = 1 + num_queries_forward + num_queries_backward
        if forward_loss > backward_loss:
            return forward_prem_perturbed, forward_hypo_perturbed, self.labels[
                forward_predicted], num_queries
        else:
            return backward_prem_perturbed, backward_hypo_perturbed, self.labels[
                backward_predicted], num_queries
Example #27
    def __init__(self, args):
        self.args = args

        if getattr(args, 'moses_source_lang', None) is None:
            args.moses_source_lang = getattr(args, 'source_lang', 'en')
        if getattr(args, 'moses_target_lang', None) is None:
            args.moses_target_lang = getattr(args, 'target_lang', 'en')

        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer
            self.tok = MosesTokenizer(args.moses_source_lang)
            self.detok = MosesDetokenizer(args.moses_target_lang)
        except ImportError:
            raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
Example #28
 def __init__(self, pretokenizer='moses'):
     self.tagger = PerceptronTagger()
     self.pretok_type = pretokenizer
     if pretokenizer == 'bertpretokenizer':
         self.pretokenizer = BertPreTokenizer()
     elif pretokenizer == 'moses':
         self.pretokenizer = MosesTokenizer()
         self.detokenizer = MosesDetokenizer()
     elif pretokenizer == 'whitespace':
         pass
     else:
         raise ValueError(
             "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
         )
Example #29
 def translate(self,
               text: List[str],
               source_lang: str = None,
               target_lang: str = None) -> List[str]:
     """
     Translates list of sentences from source language to target language.
     The input should be regular text; this method performs its own tokenization/detokenization.
     Args:
         text: list of strings to translate
         source_lang: if not None, corresponding MosesTokenizer and MosesPunctNormalizer will be run
         target_lang: if not None, corresponding MosesDetokenizer will be run
     Returns:
         list of translated strings
     """
     mode = self.training
     if source_lang != "None":
         tokenizer = MosesTokenizer(lang=source_lang)
         normalizer = MosesPunctNormalizer(lang=source_lang)
     if target_lang != "None":
         detokenizer = MosesDetokenizer(lang=target_lang)
     try:
         self.eval()
         res = []
         for txt in text:
             if source_lang != "None":
                 txt = normalizer.normalize(txt)
                 txt = tokenizer.tokenize(txt,
                                          escape=False,
                                          return_str=True)
             ids = self.encoder_tokenizer.text_to_ids(txt)
             ids = [self.encoder_tokenizer.bos_id
                    ] + ids + [self.encoder_tokenizer.eos_id]
             src = torch.Tensor(ids).long().to(self._device).unsqueeze(0)
             src_mask = torch.ones_like(src)
             src_hiddens = self.encoder(input_ids=src,
                                        encoder_mask=src_mask)
             beam_results = self.beam_search(
                 encoder_hidden_states=src_hiddens,
                 encoder_input_mask=src_mask)
             beam_results = self.filter_predicted_ids(beam_results)
             translation_ids = beam_results.cpu()[0].numpy()
             translation = self.decoder_tokenizer.ids_to_text(
                 translation_ids)
             if target_lang != "None":
                 translation = detokenizer.detokenize(translation.split())
             res.append(translation)
     finally:
         self.train(mode=mode)
     return res
Example #30
 def __init__(self, device, cache_dir, state):
     # tokenize sents
     self.tokenizer = MosesTokenizer()
     self.preprocess = lambda sent: self.tokenizer.tokenize(sent.lower(),
                                                            escape=False)
     self.elmo = ElmoEmbedder(
         options_file=os.path.join(
             cache_dir, 'elmo_2x4096_512_2048cnn_2xhighway_options.json'),
         weight_file=os.path.join(
             cache_dir, 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'),
         cuda_device=0 if device.type == 'cuda' else -1)
     self.device = device
     self.state = RandomState(state)
     self.name = 'ELMo'
     self.is_unk = lambda tok_id: False