Example #1
    def test_detokenize_with_aggressive_split(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = 'foo-bar'
        assert md.detokenize(mt.tokenize(text,
                                         aggressive_dash_splits=True)) == text
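
Note: with `aggressive_dash_splits=True` the tokenizer replaces intra-word hyphens with the `@-@` placeholder token, which is what lets the detokenizer reassemble the original string. A minimal round-trip sketch (the expected outputs in the comments assume standard sacremoses behaviour):

from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer()
md = MosesDetokenizer()

tokens = mt.tokenize("foo-bar", aggressive_dash_splits=True)
print(tokens)                 # expected: ['foo', '@-@', 'bar']
print(md.detokenize(tokens))  # expected: 'foo-bar'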
Example #2
    def test_opening_brackets(self):
        tokenizer = MosesTokenizer()
        detokenizer = MosesDetokenizer()

        text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."

        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #3
    def test_mixed_cjk_tokenization(self):
        tokenizer = MosesTokenizer()
        detokenizer = MosesDetokenizer()
        text = u"Japan is 日本 in Japanese."
        assert tokenizer.tokenize(text) == [
            u"Japan",
            u"is",
            u"日",
            u"本",
            u"in",
            u"Japanese",
            u".",
        ]
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #4
    def test_moses_detokenize(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = (
            u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
        )
        expected_tokens = mt.tokenize(text)
        expected_detokens = u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"

        assert md.detokenize(expected_tokens) == expected_detokens

        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens = [
            u"This",
            u"ain",
            u"&apos;t",
            u"funny",
            u".",
            u"It",
            u"&apos;s",
            u"actually",
            u"hillarious",
            u",",
            u"yet",
            u"double",
            u"Ls",
            u".",
            u"&#124;",
            u"&#91;",
            u"&#93;",
            u"&lt;",
            u"&gt;",
            u"&#91;",
            u"&#93;",
            u"&amp;",
            u"You",
            u"&apos;re",
            u"gonna",
            u"shake",
            u"it",
            u"off",
            u"?",
            u"Don",
            u"&apos;t",
            u"?",
        ]
        expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
        assert mt.tokenize(text) == expected_tokens
        assert md.detokenize(expected_tokens) == expected_detokens
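
Note: the `&apos;`, `&#91;`, `&lt;` tokens above come from the tokenizer's default XML escaping, and `detokenize` unescapes them again by default. If literal characters are wanted on both sides, the `escape`/`unescape` flags can be turned off; a minimal sketch assuming the default sacremoses signatures:

from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer()
md = MosesDetokenizer()

text = "You're gonna shake it off? Don't?"

# escape=False keeps apostrophes, brackets and ampersands as literal characters.
plain_tokens = mt.tokenize(text, escape=False)

# unescape=False leaves the tokens untouched while detokenizing.
restored = md.detokenize(plain_tokens, unescape=False)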
Example #5
    def test_moses_detokenize(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
        expected_tokens = mt.tokenize(text)
        expected_detokens = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
        assert md.detokenize(expected_tokens) == expected_detokens

        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens = [u'This', u'ain', u'&apos;t', u'funny', u'.', u'It', u'&apos;s', u'actually', u'hillarious', u',', u'yet', u'double', u'Ls', u'.', u'&#124;', u'&#91;', u'&#93;', u'&lt;', u'&gt;', u'&#91;', u'&#93;', u'&amp;', u'You', u'&apos;re', u'gonna', u'shake', u'it', u'off', u'?', u'Don', u'&apos;t', u'?']
        expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
        assert mt.tokenize(text) == expected_tokens
        assert md.detokenize(expected_tokens) == expected_detokens
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input-pe-file",
                        required=True,
                        help="output of the pe system to be post processed")
    parser.add_argument("-tlang",
                        "--target-language",
                        dest="target_language",
                        help="required for detokenization")
    parser.add_argument("-detok",
                        "--detokenize",
                        action="store_true",
                        help="do detokenization (-tlang is necessary)")
    parser.add_argument("-c",
                        "--codes-file",
                        required=True,
                        help="codes file used to post process the output")
    args = parser.parse_args()

    detokenizer = None
    if args.detokenize:
        assert (
            args.target_language
            is not None), "--target-language is required for detokenization"
        detokenizer = MosesDetokenizer(lang=args.target_language)

    postprocess(ape_file=args.input_pe_file,
                codes_file=args.codes_file,
                detokenizer=detokenizer)
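
Note: `postprocess` comes from elsewhere in the same project; the detokenizer is only built when `--detokenize` is passed. A hedged sketch of how such an optional detokenizer is typically applied line by line (the sample lines and target language are made up for illustration):

from sacremoses import MosesDetokenizer

detokenizer = MosesDetokenizer(lang="de")  # hypothetical target language
lines = ["Das ist ein Test .", "Noch eine Zeile ."]  # hypothetical post-edited output

for line in lines:
    print(detokenizer.detokenize(line.split()) if detokenizer else line)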
Example #7
File: cli.py Project: alvations/sacremoses
def detokenize_file(
    iterator,
    language,
    processes,
    quiet,
    xml_unescape,
):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    return parallel_or_not(
        list(map(str.split, iterator)), moses_detokenize, processes, quiet
    )
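
Note: `parallel_or_not` is a helper from the same CLI module that either maps the bound function directly or distributes it over worker processes; the underlying call is just the partial applied to a token list. A hedged single-call equivalent:

from functools import partial
from sacremoses import MosesDetokenizer

moses = MosesDetokenizer(lang="en")
moses_detokenize = partial(moses.detokenize, return_str=True, unescape=True)
print(moses_detokenize("Hello , world !".split()))  # expected: 'Hello, world!'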
Example #8
def detokenize_file(language, processes, xml_unescape, encoding):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=True
                ):
                    print(outline, end="\n", file=fout)
Example #9
    def test_korean_tokenization(self):
        tokenizer = MosesTokenizer(lang="ko")
        detokenizer = MosesDetokenizer(lang="ko")
        text = u"세계 에서 가장 강력한."
        assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #10
    def test_french_apostrophes(self):
        tokenizer = MosesTokenizer(lang="fr")
        detokenizer = MosesDetokenizer(lang="fr")

        text = u"L'amitié nous a fait forts d'esprit"
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #11
    def test_korean_tokenization(self):
        tokenizer = MosesTokenizer(lang="ko")
        detokenizer = MosesDetokenizer(lang="ko")
        text = u"세계 에서 가장 강력한."
        assert tokenizer.tokenize(text) == [u'세계', u'에서', u'가장', u'강력한', u'.']
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #12
import logging

import flask
from flask_cors import CORS

from sacremoses.tokenize import MosesTokenizer, MosesDetokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = flask.Flask(__name__)
CORS(app)

MODEL_PATH = None
PSEUDO_SERVICE_URL = True
MODEL = None
KEYWORD_PROCESSOR = None
TRAINING_TAGS = None
MOSES_TOKENIZER = MosesTokenizer(lang="fr")
MOSES_DETOKENIZER = MosesDetokenizer(lang="fr")


# pattern = r"\@-\@|\w+['´`]|\w+|\S+"
# regex_tokenizer = RegexpTokenizer(pattern, flags=re.UNICODE | re.IGNORECASE)


def load_names_processor():
    # :: Load vocabulary for is_name features ::

    global KEYWORD_PROCESSOR
    from flashtext import KeywordProcessor
    KEYWORD_PROCESSOR = KeywordProcessor()
    KEYWORD_PROCESSOR.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))
    logging.info("Loaded french proper names...")
    return KEYWORD_PROCESSOR
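
Note: flashtext's `KeywordProcessor` is used here to mark tokens that match a list of French first names (an `is_name`-style feature). A minimal sketch of how the loaded processor might be queried, with a made-up name list standing in for `load_names(FR_NAMES_PATH)`:

from flashtext import KeywordProcessor

processor = KeywordProcessor()
processor.add_keywords_from_list(["Marie", "Jean", "Camille"])  # hypothetical name list

tokens = ["Marie", "habite", "à", "Paris", "."]
is_name_features = [tok in processor for tok in tokens]
print(is_name_features)  # expected: [True, False, False, False, False]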