def test_protect_patterns(self):
    moses = MosesTokenizer()
    text = "this is a webpage https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl that kicks ass"
    expected_tokens = [
        "this",
        "is",
        "a",
        "webpage",
        "https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl",
        "that",
        "kicks",
        "ass",
    ]
    assert (
        moses.tokenize(text, protected_patterns=moses.BASIC_PROTECTED_PATTERNS)
        == expected_tokens
    )
    # Testing against pattern from https://github.com/alvations/sacremoses/issues/35
    noe_patterns = [
        r"(?:http|ftp)s?://"  # http:// or https://
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))"
        r"(?::\d+)?"  # optional port
        r"(?:/\w+)*"
        r"(?:(?:\.[a-z]+)|/?)"
    ]
    assert moses.tokenize(text, protected_patterns=noe_patterns) == expected_tokens

def test_aggressive_split(self):
    moses = MosesTokenizer()
    expected_tokens_wo_aggressive_dash_split = ['foo-bar']
    expected_tokens_with_aggressive_dash_split = ['foo', '@-@', 'bar']
    assert moses.tokenize('foo-bar') == expected_tokens_wo_aggressive_dash_split
    assert (
        moses.tokenize('foo-bar', aggressive_dash_splits=True)
        == expected_tokens_with_aggressive_dash_split
    )

def test_moses_detokenize(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()

    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = mt.tokenize(text)
    expected_detokens = u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    assert md.detokenize(expected_tokens) == expected_detokens

    text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
    expected_tokens = [
        u"This", u"ain", u"'t", u"funny", u".", u"It", u"'s", u"actually",
        u"hillarious", u",", u"yet", u"double", u"Ls", u".", u"|", u"[",
        u"]", u"<", u">", u"[", u"]", u"&", u"You", u"'re", u"gonna",
        u"shake", u"it", u"off", u"?", u"Don", u"'t", u"?",
    ]
    expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
    assert mt.tokenize(text) == expected_tokens
    assert md.detokenize(expected_tokens) == expected_detokens

def test_mixed_cjk_tokenization(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = u"Japan is 日本 in Japanese."
    assert tokenizer.tokenize(text) == [
        u"Japan",
        u"is",
        u"日",
        u"本",
        u"in",
        u"Japanese",
        u".",
    ]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_opening_brackets(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_detokenize_with_aggressive_split(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()
    text = 'foo-bar'
    assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text

def test_dot_splitting(self):
    moses = MosesTokenizer()
    text = "The meeting will take place at 11:00 a.m. Tuesday."
    expected_tokens = (
        "The meeting will take place at 11 : 00 a.m. Tuesday .".split()
    )
    self.assertEqual(moses.tokenize(text), expected_tokens)

def test_final_comma_split_after_number(self):
    moses = MosesTokenizer()
    text = u"Sie sollten vor dem Upgrade eine Sicherung dieser Daten erstellen (wie unter Abschnitt 4.1.1, „Sichern aller Daten und Konfigurationsinformationen“ beschrieben). "
    expected_tokens = [
        "Sie", "sollten", "vor", "dem", "Upgrade", "eine", "Sicherung",
        "dieser", "Daten", "erstellen", "(", "wie", "unter", "Abschnitt",
        "4.1.1", ",", u"„", "Sichern", "aller", "Daten", "und",
        "Konfigurationsinformationen", u"“", "beschrieben", ")", ".",
    ]
    self.assertEqual(moses.tokenize(text), expected_tokens)

def test_opening_brackets(self):
    moses = MosesTokenizer()
    text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
    # echo By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities). | perl mosesdecoder\scripts\tokenizer\tokenizer.perl en
    expected_tokens = "By the mid 1990s a version of the game became a Latvian television series ( with a parliamentary setting , and played by Latvian celebrities ) .".split()
    assert moses.tokenize(text) == expected_tokens

def test_final_dot_unconditionally(self):
    # Make sure that it works for examples on
    # https://github.com/moses-smt/mosesdecoder/pull/204
    moses = MosesTokenizer()
    text = "'So am I."
    expected_tokens = "'So am I .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

    moses = MosesTokenizer(lang="fr")
    text = "Des gens admirent une œuvre d'art."
    expected_tokens = "Des gens admirent une œuvre d' art .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

    moses = MosesTokenizer(lang="de")
    text = "...schwer wie ein iPhone 5."
    expected_tokens = "... schwer wie ein iPhone 5 .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

    moses = MosesTokenizer(lang="cz")
    text = "Dvě děti, které běží bez bot."
    expected_tokens = "Dvě děti , které běží bez bot .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

def test_japanese_tokenization(self):
    tokenizer = MosesTokenizer(lang="ja")
    text = u"電話でんわの邪魔じゃまをしないでください"
    assert tokenizer.tokenize(text) == [text]

def test_korean_tokenization(self):
    tokenizer = MosesTokenizer(lang="ko")
    detokenizer = MosesDetokenizer(lang="ko")
    text = u"세계 에서 가장 강력한."
    assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_chinese_tokenization(self):
    tokenizer = MosesTokenizer(lang="zh")
    text = u"记者 应谦 美国"
    assert tokenizer.tokenize(text) == [u"记者", u"应谦", u"美国"]

def test_french_apostrophes(self):
    tokenizer = MosesTokenizer(lang="fr")
    detokenizer = MosesDetokenizer(lang="fr")
    text = u"L'amitié nous a fait forts d'esprit"
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_trailing_dot_apostrophe(self):
    moses = MosesTokenizer()
    text = "'Hello.'"
    expected_tokens = "'Hello . '".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """
    Pre-process input files before post-editing:
    split at <br>, remove <i> tags and music symbols,
    and store everything in a codes file in output_dir.

    Args:
        src_file: source side of the translation to be preprocessed
        mt_file: output of the MT system to be preprocessed
        output_dir: directory for the preprocessed files and the codes file
    """
    punct_normalizer = MosesPunctNormalizer()

    # Set up the tokenizer, if a language was given.
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = output_dir + '/codes.' + os.path.basename(mt_file)
    src_out_file = output_dir + '/' + os.path.basename(src_file) + '.pre'
    mt_out_file = output_dir + '/' + os.path.basename(mt_file) + '.pre'

    with open(src_out_file, 'w') as fosrc, open(mt_out_file, 'w') as fomt, \
            open(code_file, 'w') as fcodes, open(src_file) as fsrc, open(mt_file) as fmt:
        idx = 0
        for src, mt in zip(fsrc, fmt):
            src, mt = src.strip(), mt.strip()
            idx += 1

            # Standardize <br> tags.
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)

            # If the number of <br> is the same, split and save as multiple lines.
            src_split = re.split(r'\s*<br>\s*', src)
            mt_split = re.split(r'\s*<br>\s*', mt)

            # If src and mt do not have the same number of <br>, do not split.
            if not (len(src_split) == len(mt_split)):
                src_split = [src]
                mt_split = [mt]

            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # Check if the segments start with a hyphen.
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()
                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # Check if the segments contain a music symbol.
                music_syms = ('♫', '♬', '♪')
                has_music = False
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), src_part):
                    has_music = True
                    src_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', src_part)
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), mt_part):
                    has_music = True
                    mt_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', mt_part)

                # Check for enclosing italics tags; otherwise leave the text as is.
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True
                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True

                if has_hyphen:
                    code += 'HYPHENBEGIN\t'
                if has_music:
                    code += 'MUSIC\t'
                if has_itag:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip() + '\n')
                fomt.write(mt_part.strip() + '\n')
                fcodes.write("{}\n".format(code))

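# A minimal usage sketch for preprocess() above. The file paths here are
# hypothetical, and `cleanup` is assumed to be defined elsewhere in the
# project (it is referenced but not shown in this snippet):
#
#   preprocess(
#       src_file="data/train.src",
#       mt_file="data/train.mt",
#       output_dir="out",
#       tokenize_lang="en",
#   )
#
# This writes out/train.src.pre, out/train.mt.pre, and out/codes.train.mt,
# with one code line (e.g. "3\tHYPHENBEGIN\tMUSIC\t") per <br>-split segment.
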
# write to file
for src, mt, pe in zip(src_split, mt_split, pe_split):
    src = cleanup(src, args)
    mt = cleanup(mt, args)
    pe = cleanup(pe, args)
    if len(src) < args.min_l or len(mt) < args.min_l or len(pe) < args.min_l:
        continue
    if len(src) > args.max_l or len(mt) > args.max_l or len(pe) > args.max_l:
        continue
    if tokenizer:
        src = " ".join(tokenizer.tokenize(src, escape=False))
        mt = " ".join(tokenizer.tokenize(mt, escape=False))
        pe = " ".join(tokenizer.tokenize(pe, escape=False))
    if not args.output_stdout:
        # remove all tags including <br> and write to file
        fos[0].write(src + '\n')
        fos[1].write(mt + '\n')
        fos[2].write(pe + '\n')
    else:
        print("{}\t{}\t{}".format(src, mt, pe))

if not args.output_stdout:
    for fo in fos:
        fo.close()

def test_moses_tokenize(self):
    moses = MosesTokenizer()

    # Tokenize a sentence.
    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = u"This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    tokenized_text = moses.tokenize(text, return_str=True)
    assert tokenized_text == expected_tokens

    # The nonbreaking prefixes should tokenize the final fullstop.
    assert moses.tokenize("abc def.") == [u"abc", u"def", u"."]

    # The nonbreaking prefixes should handle the situation where a
    # numeric-only prefix is the last token. In the example below, "pp" is
    # the last element, and there is no digit after it.
    assert moses.tokenize("2016, pp.") == [u"2016", u",", u"pp", u"."]

    # Test escape_xml.
    text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
    expected_tokens_with_xmlescape = [
        "This", "ain", "&apos;t", "funny", ".", "It", "&apos;s", "actually",
        "hillarious", ",", "yet", "double", "Ls", ".", "&#124;", "&#91;",
        "&#93;", "&lt;", "&gt;", "&#91;", "&#93;", "&amp;", "You",
        "&apos;re", "gonna", "shake", "it", "off", "?", "Don", "&apos;t", "?",
    ]
    expected_tokens_wo_xmlescape = [
        "This", "ain", "'t", "funny", ".", "It", "'s", "actually",
        "hillarious", ",", "yet", "double", "Ls", ".", "|", "[", "]", "<",
        ">", "[", "]", "&", "You", "'re", "gonna", "shake", "it", "off",
        "?", "Don", "'t", "?",
    ]
    assert moses.tokenize(text, escape=True) == expected_tokens_with_xmlescape
    assert moses.tokenize(text, escape=False) == expected_tokens_wo_xmlescape

    # Test to check https://github.com/alvations/sacremoses/issues/19
    text = "this 'is' the thing"
    expected_tokens = ["this", "&apos;", "is", "&apos;", "the", "thing"]
    assert moses.tokenize(text, escape=True) == expected_tokens
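
# For reference: escape=True applies Moses-style XML escaping to the output
# tokens. A sketch of the character-to-entity mapping exercised by the
# assertions above (as implemented in sacremoses):
#
#   &  -> &amp;     |  -> &#124;    <  -> &lt;     >  -> &gt;
#   '  -> &apos;    "  -> &quot;    [  -> &#91;    ]  -> &#93;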