Example #1
 def test_chinese_tokenization(self):
     tokenizer = MosesTokenizer(lang="zh")
     text = u"记者 应谦 美国"
     assert tokenizer.tokenize(text) == [u"记者", u"应谦", u"美国"]
Example #2
 def test_korean_tokenization(self):
     tokenizer = MosesTokenizer(lang="ko")
     detokenizer = MosesDetokenizer(lang="ko")
     text = u"세계 에서 가장 강력한."
     assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
     assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #3
    def test_opening_brackets(self):
        tokenizer = MosesTokenizer()
        detokenizer = MosesDetokenizer()

        text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #4
    def test_french_apostrophes(self):
        tokenizer = MosesTokenizer(lang="fr")
        detokenizer = MosesDetokenizer(lang="fr")

        text = u"L'amitié nous a fait forts d'esprit"
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #5
 def test_trailing_dot_apostrophe(self):
     moses = MosesTokenizer()
     text = "'Hello.'"
     expected_tokens = "'Hello . '".split()
     self.assertEqual(moses.tokenize(text), expected_tokens)
Example #6
    def test_detokenize_with_aggressive_split(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = "foo-bar"
        assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text
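A minimal sketch of the intermediate step, assuming the standard sacremoses API: with aggressive_dash_splits=True the hyphen is split out around an @-@ placeholder, which the detokenizer then folds back into a plain hyphen.

from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer()
md = MosesDetokenizer()

# Aggressive dash splitting separates the hyphen into an @-@ placeholder token.
tokens = mt.tokenize("foo-bar", aggressive_dash_splits=True)
print(tokens)  # expected: ['foo', '@-@', 'bar']

# The detokenizer recognizes @-@ and restores the plain hyphen.
print(md.detokenize(tokens))  # expected: foo-bar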
Example #7
import os
import re

from sacremoses import MosesPunctNormalizer, MosesTokenizer


# NOTE: cleanup() used below is assumed to be a project-local helper in scope.
def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """
    Pre-process the input files before post-editing:
    split at <br>, remove <i> tags and music symbols,
    and store the removed markup as codes in a codes file in output_dir.

    Args:
        src_file: source side of the translation to be preprocessed
        mt_file: MT system output file to be preprocessed
        output_dir: directory for the preprocessed files and the codes file
        tokenize_lang: language code for Moses tokenization (skipped if None)
    """

    punct_normalizer = MosesPunctNormalizer()

    # set tokenizer
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = os.path.join(output_dir, 'codes.' + os.path.basename(mt_file))
    src_out_file = os.path.join(output_dir, os.path.basename(src_file) + '.pre')
    mt_out_file = os.path.join(output_dir, os.path.basename(mt_file) + '.pre')
    with open(src_out_file, 'w') as fosrc, open(mt_out_file, 'w') as fomt, \
            open(code_file, 'w') as fcodes, open(src_file) as fsrc, open(mt_file) as fmt:
        idx = 0
        for src, mt in zip(fsrc, fmt):
            src, mt = src.strip(), mt.strip()
            idx += 1

            # standardize <br> tags
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)

            # if the number of <br> matches, split and save as multiple lines
            src_split = re.split(r'\s*<br>\s*', src)
            mt_split = re.split(r'\s*<br>\s*', mt)

            # if src and mt do not have the same number of <br>, do not split
            if len(src_split) != len(mt_split):
                src_split = [src]
                mt_split = [mt]

            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # check if they start with the hyphen
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()

                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # check if they contain a music symbol
                music_syms = ('♫', '♬', '♪')
                music_re = r'\s*[{}]\s*'.format(''.join(music_syms))
                has_music = False
                if re.search(music_re, src_part):
                    has_music = True
                    src_part = re.sub(music_re, '', src_part)

                if re.search(music_re, mt_part):
                    has_music = True
                    mt_part = re.sub(music_re, '', mt_part)

                # check for enclosing italics tags; otherwise leave the part as is
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True

                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True

                if has_hyphen:
                    code += 'HYPHENBEGIN\t'
                if has_music:
                    code += 'MUSIC\t'
                if has_itag:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip()+'\n')
                fomt.write(mt_part.strip()+'\n')
                fcodes.write("{}\n".format(code))
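A hedged usage sketch for the function above; the file paths are hypothetical, and cleanup() must be provided by the surrounding project:

# Hypothetical paths; adjust to the actual project layout.
preprocess(
    src_file="data/episode1.src.txt",
    mt_file="data/episode1.mt.txt",
    output_dir="data/preprocessed",
    tokenize_lang="fr",  # pass None to skip Moses tokenization
)
# Expected outputs in data/preprocessed:
#   episode1.src.txt.pre, episode1.mt.txt.pre and codes.episode1.mt.txt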
Example #8
    def test_moses_tokenize(self):
        moses = MosesTokenizer()

        # Tokenize a sentence.
        text = (
            u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
        )
        expected_tokens = u"This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
        tokenized_text = moses.tokenize(text, return_str=True)
        assert tokenized_text == expected_tokens

        # The nonbreaking prefixes should split off the final full stop.
        assert moses.tokenize("abc def.") == [u"abc", u"def", u"."]

        # The nonbreaking prefixes should handle the case where a numeric-only prefix is the last token.
        # In the example below, "pp" is the last element, and there is no digit after it.
        assert moses.tokenize("2016, pp.") == [u"2016", u",", u"pp", u"."]

        # Test escape_xml
        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens_with_xmlescape = [
            "This",
            "ain",
            "&apos;t",
            "funny",
            ".",
            "It",
            "&apos;s",
            "actually",
            "hillarious",
            ",",
            "yet",
            "double",
            "Ls",
            ".",
            "&#124;",
            "&#91;",
            "&#93;",
            "&lt;",
            "&gt;",
            "&#91;",
            "&#93;",
            "&amp;",
            "You",
            "&apos;re",
            "gonna",
            "shake",
            "it",
            "off",
            "?",
            "Don",
            "&apos;t",
            "?",
        ]
        expected_tokens_wo_xmlescape = [
            "This",
            "ain",
            "'t",
            "funny",
            ".",
            "It",
            "'s",
            "actually",
            "hillarious",
            ",",
            "yet",
            "double",
            "Ls",
            ".",
            "|",
            "[",
            "]",
            "<",
            ">",
            "[",
            "]",
            "&",
            "You",
            "'re",
            "gonna",
            "shake",
            "it",
            "off",
            "?",
            "Don",
            "'t",
            "?",
        ]
        assert moses.tokenize(text, escape=True) == expected_tokens_with_xmlescape
        assert moses.tokenize(text, escape=False) == expected_tokens_wo_xmlescape

        # Test to check https://github.com/alvations/sacremoses/issues/19
        text = "this 'is' the thing"
        expected_tokens = ["this", "&apos;", "is", "&apos;", "the", "thing"]
        assert moses.tokenize(text, escape=True) == expected_tokens
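As a follow-up, a small sketch assuming detokenize() keeps its default unescape=True, so the escaped tokens above fold back into plain text:

from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
# unescape=True (the default) maps &apos; / &amp; etc. back to raw characters
# before the detokenization rules reattach contractions.
print(md.detokenize(["It", "&apos;s", "actually", "hillarious"]))
# expected: It's actually hillarious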
Example #9
    def test_expected_num_only_prefixes(self):
        """Testing if the functionality of the NUMERIC_ONLY_PREFIXES parsing is the same without redos-able regex."""
        expected_prefixes = {
            'as': [],
            'bn': [],
            'ca': [],
            'cs': [],
            'de': [],
            'el': [],
            'en': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                   ('pp', 'pp #NUMERIC_ONLY#')],
            'es': [],
            'et': [],
            'fi': [],
            'fr': [],
            'ga': [('lch', 'lch #NUMERIC_ONLY#'),
                   ('lgh', 'lgh #NUMERIC_ONLY#'),
                   ('uimh', 'uimh #NUMERIC_ONLY#')],
            'gu': [],
            'hi': [],
            'hu':
            [('jan', 'jan #NUMERIC_ONLY#'), ('Jan', 'Jan #NUMERIC_ONLY#'),
             ('Feb', 'Feb #NUMERIC_ONLY#'), ('feb', 'feb #NUMERIC_ONLY#'),
             ('márc', 'márc #NUMERIC_ONLY#'), ('Márc', 'Márc #NUMERIC_ONLY#'),
             ('ápr', 'ápr #NUMERIC_ONLY#'), ('Ápr', 'Ápr #NUMERIC_ONLY#'),
             ('máj', 'máj #NUMERIC_ONLY#'), ('Máj', 'Máj #NUMERIC_ONLY#'),
             ('jún', 'jún #NUMERIC_ONLY#'), ('Jún', 'Jún #NUMERIC_ONLY#'),
             ('Júl', 'Júl #NUMERIC_ONLY#'), ('júl', 'júl #NUMERIC_ONLY#'),
             ('aug', 'aug #NUMERIC_ONLY#'), ('Aug', 'Aug #NUMERIC_ONLY#'),
             ('Szept', 'Szept #NUMERIC_ONLY#'),
             ('szept', 'szept #NUMERIC_ONLY#'), ('okt', 'okt #NUMERIC_ONLY#'),
             ('Okt', 'Okt #NUMERIC_ONLY#'), ('nov', 'nov #NUMERIC_ONLY#'),
             ('Nov', 'Nov #NUMERIC_ONLY#'), ('dec', 'dec #NUMERIC_ONLY#'),
             ('Dec', 'Dec #NUMERIC_ONLY#'), ('tel', 'tel #NUMERIC_ONLY#'),
             ('Tel', 'Tel #NUMERIC_ONLY#'), ('Fax', 'Fax #NUMERIC_ONLY#'),
             ('fax', 'fax #NUMERIC_ONLY#')],
            'is': [('no', 'no #NUMERIC_ONLY#'), ('No', 'No #NUMERIC_ONLY#'),
                   ('nr', 'nr #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#'),
                   ('nR', 'nR #NUMERIC_ONLY#'), ('NR', 'NR #NUMERIC_ONLY#')],
            'it': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                   ('pp', 'pp #NUMERIC_ONLY#')],
            'kn': [],
            'lt': [('No', 'No #NUMERIC_ONLY#')],
            'lv': [('Nr', 'Nr #NUMERIC_ONLY#')],
            'ml': [],
            'mni': [],
            'mr': [],
            'nl': [('Nr', 'Nr #NUMERIC_ONLY#'), ('nr', 'nr #NUMERIC_ONLY#')],
            'or': [],
            'pa': [],
            'pl':
            [('nr', 'nr #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#'),
             ('pkt', 'pkt #NUMERIC_ONLY#'), ('str', 'str #NUMERIC_ONLY#'),
             ('tab', 'tab #NUMERIC_ONLY#'), ('Tab', 'Tab #NUMERIC_ONLY#'),
             ('ust', 'ust #NUMERIC_ONLY#'), ('par', 'par #NUMERIC_ONLY#'),
             ('r', 'r #NUMERIC_ONLY#'), ('l', 'l #NUMERIC_ONLY#'),
             ('s', 's #NUMERIC_ONLY#')],
            'pt': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                   ('p', 'p #NUMERIC_ONLY#'), ('pp', 'pp #NUMERIC_ONLY#')],
            'ro': [],
            'ru': [],
            'sk': [],
            'sl': [('št', 'št #NUMERIC_ONLY#'), ('Št', 'Št #NUMERIC_ONLY#')],
            'sv': [],
            'ta': [],
            'te': [],
            'tdt': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                    ('p', 'p #NUMERIC_ONLY#'), ('pp', 'pp #NUMERIC_ONLY#')],
            'yue': [('No', 'No #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#')],
            'zh': [('No', 'No #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#')]
        }

        nonbreaking_prefixes = NonbreakingPrefixes()
        moses = MosesTokenizer()
        lang2numonlyprefix = defaultdict(list)

        for lang in nonbreaking_prefixes.available_langs.values():
            lang2numonlyprefix[lang] = [
                (w.rpartition(" ")[0], w)
                for w in nonbreaking_prefixes.words(lang)
                if moses.has_numeric_only(w)
            ]

        assert lang2numonlyprefix == expected_prefixes
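For context, a short sketch of the helper this test relies on: has_numeric_only() reports whether a nonbreaking-prefix entry carries the #NUMERIC_ONLY# marker, i.e. a prefix that only blocks a split when a digit follows.

from sacremoses import MosesTokenizer

moses = MosesTokenizer()
print(moses.has_numeric_only("No #NUMERIC_ONLY#"))  # expected: True
print(moses.has_numeric_only("etc"))                # expected: False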
Example #10
 def test_dot_splitting(self):
     moses = MosesTokenizer()
     text = "The meeting will take place at 11:00 a.m. Tuesday."
     expected_tokens = "The meeting will take place at 11 : 00 a.m. Tuesday .".split(
     )
     self.assertEqual(moses.tokenize(text), expected_tokens)
Example #11
import logging
import re

import numpy as np
import pandas as pd
from argopt import argopt
from helpers import pre_treat_text, tokenize_text_parallel
from nltk.tokenize.regexp import RegexpTokenizer
from sacremoses.tokenize import MosesTokenizer
from tqdm import tqdm

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

pattern = r"\@-\@|\w+['´`]|\w+|\S+"
regex_tokenizer = RegexpTokenizer(pattern, flags=re.UNICODE | re.IGNORECASE)

moses_tokenizer = MosesTokenizer(lang="fr")

MONTHS = [
    "janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août",
    "septembre", "octobre", "novembre", "décembre"
]

NAMES_TOKENIZER = re.compile(r"(?:\@-\@)|\s")


def _load_names(filter_n=10):
    df_names = pd.read_csv("resources/names/prenom.csv")
    df_last_names = pd.read_csv("resources/names/patronymes.csv")

    # Filter top filter_n names
    df_names = df_names[df_names["sum"] >= filter_n]
Example #12
 def test_japanese_tokenization(self):
     tokenizer = MosesTokenizer(lang="ja")
     text = u"電話でんわの邪魔じゃまをしないでください"
     assert tokenizer.tokenize(text) == [text]
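Note that MosesTokenizer performs no word segmentation for Japanese, so unspaced text comes back as a single token (as asserted above); a hedged sketch assuming already-spaced input splits on the existing whitespace:

from sacremoses import MosesTokenizer

tokenizer = MosesTokenizer(lang="ja")
# Whitespace-separated input splits on the existing spaces as usual.
print(tokenizer.tokenize(u"電話 を ください"))  # expected: ['電話', 'を', 'ください']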
Example #13
 def test_korean_tokenization(self):
     tokenizer = MosesTokenizer(lang="ko")
     detokenizer = MosesDetokenizer(lang="ko")
     text = u"세계 에서 가장 강력한."
     assert tokenizer.tokenize(text) == [u'세계', u'에서', u'가장', u'강력한', u'.']
     assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #14
 def test_chinese_tokenization(self):
     tokenizer = MosesTokenizer(lang="zh")
     text = u"记者 应谦 美国"
     assert tokenizer.tokenize(text) == [u'记者', u'应谦', u'美国']
Example #15
import re
import logging

import flask
from flask_cors import CORS
from sacremoses.tokenize import MosesTokenizer, MosesDetokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = flask.Flask(__name__)
CORS(app)

MODEL_PATH = None
PSEUDO_SERVICE_URL = True
MODEL = None
KEYWORD_PROCESSOR = None
TRAINING_TAGS = None
MOSES_TOKENIZER = MosesTokenizer(lang="fr")
MOSES_DETOKENIZER = MosesDetokenizer(lang="fr")


# pattern = r"\@-\@|\w+['´`]|\w+|\S+"
# regex_tokenizer = RegexpTokenizer(pattern, flags=re.UNICODE | re.IGNORECASE)


def load_names_processor():
    # :: Load vocabulary for is_name features ::

    global KEYWORD_PROCESSOR
    from flashtext import KeywordProcessor
    KEYWORD_PROCESSOR = KeywordProcessor()
    KEYWORD_PROCESSOR.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))
    logging.info("Loaded french proper names...")