Example #1
def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default'):
    if inputs and lang in main.settings_global['syl_tokenizers']:
        syls_tokens = []

        if syl_tokenizer == 'default':
            syl_tokenizer = main.settings_custom['syl_tokenization']['syl_tokenizers'][lang]

        wl_nlp_utils.init_syl_tokenizers(
            main,
            lang = lang,
            syl_tokenizer = syl_tokenizer
        )

        section_size = main.settings_custom['files']['misc']['read_files_in_chunks']

        # Untokenized input: split the text into sections and tokenize each section
        if isinstance(inputs, str):
            sections = wl_nlp_utils.split_into_chunks_text(inputs, section_size = section_size)

            for section in sections:
                syls_tokens.extend(wl_syl_tokenize_text(main, section, lang, syl_tokenizer))
        # Pre-tokenized input: process the token list in batches
        else:
            texts = wl_nlp_utils.to_sections_unequal(inputs, section_size = section_size * 50)

            for tokens in texts:
                syls_tokens.extend(wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer))
    else:
        # Syllable tokenization is not supported for the language: fall back to one "syllable" per token
        if isinstance(inputs, str):
            syls_tokens = [[token] for token in wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang)]
        else:
            syls_tokens = [[token] for token in inputs]

    return syls_tokens
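# A minimal, self-contained sketch of the chunked dispatch above, assuming that
# wl_nlp_utils.split_into_chunks_text sections text by whole lines and that
# wl_nlp_utils.to_sections_unequal slices token lists into fixed-size runs (both
# are assumptions for illustration; tokenize_text/tokenize_tokens are hypothetical
# stand-ins for wl_syl_tokenize_text/wl_syl_tokenize_tokens):
def syl_tokenize_chunked(inputs, section_size = 100):
    def tokenize_text(text):
        return [[token] for token in text.split()]

    def tokenize_tokens(tokens):
        return [[token] for token in tokens]

    syls_tokens = []

    if isinstance(inputs, str):
        # Section the text by whole lines
        lines = inputs.splitlines()

        for i in range(0, len(lines), section_size):
            syls_tokens.extend(tokenize_text('\n'.join(lines[i:i + section_size])))
    else:
        # Slice the token list into runs of section_size * 50 tokens
        for i in range(0, len(inputs), section_size * 50):
            syls_tokens.extend(tokenize_tokens(inputs[i:i + section_size * 50]))

    return syls_tokens

print(syl_tokenize_chunked('one two\nthree four'))
print(syl_tokenize_chunked(['one', 'two', 'three', 'four']))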
Example #2
    def add_tags_tokenization(self, text):
        if (text := text.strip()):
            tokens = wl_word_tokenization.wl_word_tokenize_flat(self.main,
                                                                text,
                                                                lang=self.lang)

            self.tags.extend([[] for _ in tokens])
Example #3
def test_word_detokenize(lang):
    lang_text = wl_conversion.to_lang_text(main, lang)

    print(f'{lang_text} ({lang}):')

    tokens = wl_word_tokenization.wl_word_tokenize_flat(
        main,
        text = getattr(wl_test_lang_examples, f"SENTENCE_{lang.upper() if lang != 'other' else 'ENG_GB'}"),
        lang = lang
    )
    text = wl_word_detokenization.wl_word_detokenize(
        main,
        tokens = tokens,
        lang = lang
    )

    print(text)

    if lang == 'zho_cn':
        assert text == '汉语,又称汉文、中文、中国话、中国语、华语、华文、唐话[2] ,或被视为一个语族,或被视为隶属于汉藏语系汉语族之一种语言。'
    elif lang == 'zho_tw':
        assert text == '漢語,又稱漢文、中文、中國話、中國語、華語、華文、唐話[2] ,或被視為一個語族,或被視為隸屬於漢藏語系漢語族之一種語言。'
    elif lang == 'eng_gb':
        assert text == 'English is a West Germanic language of the Indo - European language family, originally spoken by the inhabitants of early medieval England.[3][4][5]'
    elif lang == 'jpn':
        assert text == '日本語(にほんご、にっぽんご[注2]、英: Japanese )は、日本国内や、かつての日本領だった国、そして日本人同士の間で使用されている言語。'
    elif lang == 'tha':
        assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
    elif lang == 'bod':
        assert text == 'བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་དེའི་ཉེ་འཁོར་གྱི་ས་ཁུལ་ཏེ།'
    elif lang == 'other':
        assert text == 'English is a West Germanic language of the Indo - European language family, originally spoken by the inhabitants of early medieval England.[3][4][5]'
    else:
        raise Exception(f'Error: Tests for language "{lang}" are skipped!')
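# For illustration only: detokenization re-joins tokens while repairing the spacing
# around punctuation. A minimal sketch using sacremoses' MosesDetokenizer (a
# different detokenizer than wl_word_detokenize, shown only to demonstrate the
# operation):
from sacremoses import MosesDetokenizer

detokenizer = MosesDetokenizer(lang = 'en')

print(detokenizer.detokenize(['English', 'is', 'a', 'West', 'Germanic', 'language', '.']))
# -> English is a West Germanic language.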
Example #4
def wl_lemmatize(
    main, inputs, lang,
    tokenized = _tr('wl_lemmatize', 'No'),
    tagged = _tr('wl_lemmatize', 'No'),
    lemmatizer = 'default'
):
    if inputs and lang in main.settings_global['lemmatizers']:
        lemmas = []

        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        wl_nlp_utils.init_word_tokenizers(
            main,
            lang = lang
        )
        wl_nlp_utils.init_lemmatizers(
            main,
            lang = lang,
            lemmatizer = lemmatizer
        )

        section_size = main.settings_custom['files']['misc']['read_files_in_chunks']

        if isinstance(inputs, str):
            # The input of SudachiPy cannot exceed 49,149 bytes
            if lemmatizer in ['spacy_jpn', 'sudachipy_jpn'] and len(inputs) > 49149 // 4:
                # Assuming around 300 tokens per line, 4 characters per token, and 4 bytes per character (49149 / 4 / 4 / 300 ≈ 10 lines per section)
                sections = wl_nlp_utils.split_into_chunks_text(inputs, section_size = 10)
            else:
                sections = wl_nlp_utils.split_into_chunks_text(inputs, section_size = section_size)

            for section in sections:
                lemmas.extend(wl_lemmatize_text(main, section, lang, tokenized, tagged, lemmatizer))
        else:
            # The input of SudachiPy cannot exceed 49,149 bytes
            if lemmatizer in ['spacy_jpn', 'sudachipy_jpn'] and sum(len(token) for token in inputs) > 49149 // 4:
                # Assuming around 4 characters per token and 4 bytes per character (49149 / 4 / 4 ≈ 3071, rounded down to 3000 tokens per section)
                texts = wl_nlp_utils.to_sections_unequal(inputs, section_size = 3000)
            else:
                texts = wl_nlp_utils.to_sections_unequal(inputs, section_size = section_size * 50)

            for tokens in texts:
                lemmas.extend(wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer))
    else:
        # Lemmatization is not supported for the language: return the tokens unchanged
        if isinstance(inputs, str):
            lemmas = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang)
        else:
            lemmas = inputs.copy()

    return lemmas
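# A quick check of the arithmetic behind the chunk sizes above; the per-line,
# per-token, and per-character figures are the estimates stated in the comments:
MAX_BYTES = 49149        # SudachiPy's input limit in bytes
BYTES_PER_CHAR = 4       # assumed worst-case bytes per character
CHARS_PER_TOKEN = 4      # assumed average characters per token
TOKENS_PER_LINE = 300    # assumed average tokens per line

max_chars = MAX_BYTES // BYTES_PER_CHAR      # 12287, the guard on len(inputs)
max_tokens = max_chars // CHARS_PER_TOKEN    # 3071, rounded down to 3000 above
max_lines = max_tokens // TOKENS_PER_LINE    # 10, the section_size of 10 above

print(max_chars, max_tokens, max_lines)      # 12287 3071 10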
Example #5
def wl_syl_tokenize_text(main, text, lang, syl_tokenizer):
    syls_tokens = []

    tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

    for token in tokens:
        # Pyphen
        if syl_tokenizer.startswith('pyphen_'):
            pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']

            syls_tokens.append(re.split(r'-+', pyphen_syl_tokenizer.inserted(token)))
        # Thai
        elif syl_tokenizer == 'pythainlp_tha':
            syls_tokens.append(pythainlp.subword_tokenize(token, engine = 'dict'))
        elif syl_tokenizer == 'ssg_tha':
            # Delay the import of ssg as a temporary workaround for the encoding issue of python-crfsuite
            # See: https://github.com/scrapinghub/python-crfsuite/pull/121
            import ssg

            syls_tokens.append(ssg.syllable_tokenize(token))

    return syls_tokens
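# A minimal sketch of the Pyphen branch above, assuming the pyphen package and its
# bundled en_GB dictionary are installed; 'inserted' marks hyphenation points with
# hyphens, which the regular expression then splits on:
import re

import pyphen

pyphen_syl_tokenizer = pyphen.Pyphen(lang = 'en_GB')

for token in ['English', 'language', 'inhabitants']:
    # e.g. 'inhabitants' -> 'in-hab-it-ants' -> ['in', 'hab', 'it', 'ants']
    print(re.split(r'-+', pyphen_syl_tokenizer.inserted(token)))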
Example #6
def test_word_tokenize(lang, word_tokenizer):
    lang_text = wl_conversion.to_lang_text(main, lang)

    print(f'{lang_text} ({lang}) / {word_tokenizer}:')

    tokens = wl_word_tokenization.wl_word_tokenize_flat(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang,
        word_tokenizer = word_tokenizer
    )
    # Use only the digits 0-9, since nagisa splits multi-digit numerals into single digits (with some exceptions such as 24)
    tokens_long_text = wl_word_tokenization.wl_word_tokenize_flat(
        main,
        text = ''.join([f'{i % 10}\n' for i in range(101)]),
        lang = lang,
        word_tokenizer = word_tokenizer
    )

    print(tokens)

    # There should be more than one token
    assert len(tokens) > 1
    # There should be more tokens than space-separated segments in the source sentence
    assert len(tokens) > len(getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}').split())
    # Test long texts
    assert tokens_long_text == [str(i % 10) for i in range(101)]

    if lang == 'afr':
        assert tokens == ['Afrikaans', 'is', 'tipologies', 'beskou', "'", 'n', 'Indo', '-', 'Europese', ',', 'Wes', '-', 'Germaanse', ',', 'Nederfrankiese', 'taal,[2', ']', 'wat', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', 'ontstaan', 'het', '.']
    elif lang == 'sqi':
        assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjeshtë', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo', '-', 'evropiane', 'të', 'folur', 'nga', 'më', 'shumë', 'se', '6', 'milionë', 'njerëz[4', ']', ',', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Republikën', 'e', 'Maqedonisë', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Jugore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.']
    elif lang == 'amh':
        assert tokens == ['አማርኛ[1', ']', '፡', 'የኢትዮጵያ', '፡', 'መደበኛ', '፡', 'ቋንቋ', '፡', 'ነው', '።']
    elif lang == 'ara':
        assert tokens == ['اللُّغَة', 'العَرَبِيّة', 'هي', 'أكثر', 'اللغات', 'السامية', 'تحدثاً', 'وإحدى', 'أكثر', 'اللغات', 'انتشاراً', 'في', 'العالم', '،', 'يتحدثها', 'أكثر', 'من', '467', 'مليون', 'نسمة،(1', ')', 'ويتوزع', 'متحدثوها', 'في', 'الوطن', 'العربي', '،', 'بالإضافة', 'إلى', 'العديد', 'من', 'المناطق', 'الأخرى', 'المجاورة', 'كالأهواز', 'وتركيا', 'وتشاد', 'ومالي', 'والسنغال', 'وإرتيريا', 'وإثيوبيا', 'وجنوب', 'السودان', 'وإيران', '.']
    elif lang == 'hye':
        assert tokens == ['Հայերեն', '(', 'ավանդական՝', 'հայերէն', ')', ',', 'հնդեվրոպական', 'լեզվաընտանիքի', 'առանձին', 'ճյուղ', 'հանդիսացող', 'լեզու։']
    elif lang == 'asm':
        assert tokens == ['অসমীয়া', 'ভাষা', 'হৈছে', 'সকলোতকৈ', 'পূৰ্বীয়', 'ভাৰতীয়-আৰ্য', 'ভাষা', '।']
    elif lang == 'aze':
        assert tokens == ['Azərbaycan', 'dili[1][2][3', ']', '—', 'Azərbaycan', 'Respublikasının', 'və', 'Rusiya', 'Federasiyasının', 'Dağıstan', 'Respublikasının[4', ']', 'rəsmi', 'dövlət', 'dilidir', '.']
    elif lang == 'eus':
        assert tokens == ['Euskara', 'Euskal', 'Herriko', 'hizkuntza', 'da.[5', ']']
    elif lang == 'ben':
        assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।']
    elif lang == 'bul':
        assert tokens == ['Бъ̀лгарският', 'езѝк', 'е', 'индоевропейски', 'език', 'от', 'групата', 'на', 'южнославянските', 'езици', '.']
    elif lang == 'cat':
        assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'a', 'l', 'País', 'Valencià', 'i', 'tradicional', 'a', 'l', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'", 'algunes', 'comarques', 'i', 'localitats', 'de', "l'", 'interior', ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', "l'", 'Alguer', '(', 'a', "l'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'd', 'el', 'Nord,[8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians),[9][10', ']', 'i', 'en', 'comunitats', 'arreu', 'd', 'el', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'", 'Argentina', ',', 'amb', '198.000', 'parlants).[11', ']']
    elif lang == 'zho_cn':
        if word_tokenizer == 'jieba_zho':
            assert tokens == ['汉语', ',', '又称', '汉文', '、', '中文', '、', '中国', '话', '、', '中国', '语', '、', '华语', '、', '华文', '、', '唐话', '[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏语系', '汉语', '族', '之', '一种', '语言', '。']
        elif word_tokenizer == 'pkuseg_zho':
            assert tokens == ['汉语', ',', '又', '称', '汉文', '、', '中文', '、', '中国话', '、', '中国语', '、', '华语', '、', '华文', '、', '唐', '话[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏', '语系', '汉语族', '之一', '种', '语言', '。']
        elif word_tokenizer == 'spacy_zho':
            assert tokens == ['汉语', ',', '又', '称', '汉文', '、', '中文', '、', '中国话', '、', '中国语', '、', '华语', '、', '华文', '、', '唐话', '[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏', '语系', '汉语族', '之一', '种', '语言', '。']
        elif word_tokenizer == 'wordless_zho_char':
            assert tokens == ['汉', '语', ',', '又', '称', '汉', '文', '、', '中', '文', '、', '中', '国', '话', '、', '中', '国', '语', '、', '华', '语', '、', '华', '文', '、', '唐', '话', '[', '2', ']', ',', '或', '被', '视', '为', '一', '个', '语', '族', ',', '或', '被', '视', '为', '隶', '属', '于', '汉', '藏', '语', '系', '汉', '语', '族', '之', '一', '种', '语', '言', '。']
        else:
            raise Exception(f'Error: Tests for word tokenizer "{word_tokenizer}" are skipped!')
    elif lang == 'zho_tw':
        if word_tokenizer == 'jieba_zho':
            assert tokens == ['漢語', ',', '又', '稱漢文', '、', '中文', '、', '中國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐話', '[', '2', ']', ',', '或', '被', '視為', '一個', '語族', ',', '或', '被', '視為', '隸屬', '於', '漢藏語', '系漢', '語族', '之一', '種語', '言', '。']
        elif word_tokenizer == 'pkuseg_zho':
            assert tokens == ['漢語', ',', '又', '稱', '漢文', '、', '中文', '、', '中', '國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐', '話[', '2', ']', ',', '或', '被', '視為', '一', '個', '語族', ',', '或', '被', '視', '為隸', '屬於', '漢藏', '語系', '漢語族', '之一', '種', '語言', '。']
        elif word_tokenizer == 'spacy_zho':
            assert tokens == ['漢語', ',', '又', '稱', '漢文', '、', '中文', '、', '中國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐話[', '2', ']', ',', '或', '被', '視為', '一', '個', '語族', ',', '或', '被', '視為', '隸屬', '於漢', '藏語', '系漢', '語族', '之一', '種', '語言', '。']
        elif word_tokenizer == 'wordless_zho_char':
            assert tokens == ['漢', '語', ',', '又', '稱', '漢', '文', '、', '中', '文', '、', '中', '國', '話', '、', '中', '國', '語', '、', '華', '語', '、', '華', '文', '、', '唐', '話', '[', '2', ']', ',', '或', '被', '視', '為', '一', '個', '語', '族', ',', '或', '被', '視', '為', '隸', '屬', '於', '漢', '藏', '語', '系', '漢', '語', '族', '之', '一', '種', '語', '言', '。']
        else:
            raise Exception(f'Error: Tests for word tokenizer "{word_tokenizer}" are skipped!')
    elif lang == 'hrv':
        assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639', '-', '3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.']
    elif lang == 'ces':
        assert tokens == ['Čeština', 'neboli', 'český', 'jazyk', 'je', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'dan':
        assert tokens == ['Dansk', 'er', 'et', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinentale', ')', 'gruppe', ',', 'der', 'tales', 'af', 'ca.', 'seks', 'millioner', 'mennesker', '.']
    elif lang == 'nld':
        assert tokens == ['Het', 'Nederlands', 'is', 'een', 'West-Germaanse', 'taal', 'en', 'de', 'officiële', 'taal', 'van', 'Nederland', ',', 'Suriname', 'en', 'een', 'van', 'de', 'drie', 'officiële', 'talen', 'van', 'België', '.']
    elif lang in ['eng_gb', 'eng_us']:
        if word_tokenizer in ['nltk_nist', 'nltk_twitter', 'sacremoses_moses']:
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'of', 'the', 'Indo-European', 'language', 'family', ',', 'originally', 'spoken', 'by', 'the', 'inhabitants', 'of', 'early', 'medieval', 'England', '.', '[', '3', ']', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer in ['nltk_nltk', 'nltk_penn_treebank']:
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'of', 'the', 'Indo-European', 'language', 'family', ',', 'originally', 'spoken', 'by', 'the', 'inhabitants', 'of', 'early', 'medieval', 'England.', '[', '3', ']', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer == 'nltk_tok_tok':
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'of', 'the', 'Indo-European', 'language', 'family', ',', 'originally', 'spoken', 'by', 'the', 'inhabitants', 'of', 'early', 'medieval', 'England.[', '3', ']', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer == 'spacy_eng':
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'of', 'the', 'Indo', '-', 'European', 'language', 'family', ',', 'originally', 'spoken', 'by', 'the', 'inhabitants', 'of', 'early', 'medieval', 'England.[3][4][5', ']']
        else:
            raise Exception(f'Error: Tests for word tokenizer "{word_tokenizer}" are skipped!')
    elif lang == 'est':
        assert tokens == ['Eesti', 'keel', '(', 'varasem', 'nimetus', 'maakeel', ')', 'on', 'läänemeresoome', 'lõunarühma', 'kuuluv', 'keel', '.']
    elif lang == 'fin':
        assert tokens == ['Suomen', 'kieli', '(', 'suomi', ')', 'on', 'uralilaisten', 'kielten', 'itämerensuomalaiseen', 'ryhmään', 'kuuluva', 'kieli', '.']
    elif lang == 'fra':
        assert tokens == ['Le', 'français', 'est', 'une', 'langue', 'indo-européenne', 'de', 'la', 'famille', 'des', 'langues', 'romanes', 'dont', 'les', 'locuteurs', 'sont', 'appelés', 'francophones', '.']
    elif lang in ['deu_at', 'deu_de', 'deu_ch']:
        assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'das', 'Deutsche', '(', '[', 'dɔɪ̯tʃ];[26', ']', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', ',', 'die', 'weltweit', 'etwa', '90', 'bis', '105', 'Millionen', 'Menschen', 'als', 'Muttersprache', 'und', 'weiteren', 'rund', '80', 'Millionen', 'als', 'Zweit-', 'oder', 'Fremdsprache', 'dient', '.']
    elif lang == 'grc':
        assert tokens == ['Ὅτι', 'μὲν', 'ὑμεῖς', ',', 'ὦ', 'ἄνδρες', 'Ἀθηναῖοι', ',', 'πεπόνθατε', 'ὑπὸ', 'τῶν', 'ἐμῶν', 'κατηγόρων', ',', 'οὐκ', 'οἶδα', '·', 'ἐγὼ', "δ'", 'οὖν', 'καὶ', 'αὐτὸς', "ὑπ'", 'αὐτῶν', 'ὀλίγου', 'ἐμαυτοῦ', 'ἐπελαθόμην', ',', 'οὕτω', 'πιθανῶς', 'ἔλεγον', '.']
    elif lang == 'ell':
        assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια[10', ']', 'και', 'αποτελεί', 'το', 'μοναδικό', 'μέλος', 'του', 'ελληνικού', 'κλάδου', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    elif lang == 'guj':
        assert tokens == ['ગુજરાતી', '\u200d(/ɡʊdʒəˈrɑːti/[૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', ']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે.']
    elif lang == 'heb':
        assert tokens == ['עִבְרִית', 'היא', 'שפה', 'שמית', ',', 'ממשפחת', 'השפות', 'האפרו', '-', 'אסיאתיות', ',', 'הידועה', 'כשפתם', 'של', 'היהודים', 'ושל', 'השומרונים', ',', 'אשר', 'ניב', 'מודרני', 'שלה', '(', 'עברית', 'ישראלית', ')', 'הוא', 'שפתה', 'הרשמית', 'של', 'מדינת', 'ישראל', ',', 'מעמד', 'שעוגן', 'בשנת', '2018', 'בחוק', 'יסוד', ':', 'ישראל', '–', 'מדינת', 'הלאום', 'של', 'העם', 'היהודי', '.']
    elif lang == 'hin':
        assert tokens == ['हिन्दी', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'एवं', 'भारत', 'की', 'राजभाषा', 'है', '।']
    elif lang == 'hun':
        assert tokens == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tagja', ',', 'a', 'finnugor', 'nyelvek', 'közé', 'tartozó', 'ugor', 'nyelvek', 'egyike', '.']
    elif lang == 'isl':
        if word_tokenizer == 'tokenizer_isl':
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga', '.', '[', '4', ']']
        elif word_tokenizer == 'spacy_isl':
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.[4', ']']
        else:
            raise Exception(f'Error: Tests for word tokenizer "{word_tokenizer}" are skipped!')
    elif lang == 'ind':
        assert tokens == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'Melayu', 'baku', 'yang', 'dijadikan', 'sebagai', 'bahasa', 'resmi', 'Republik', 'Indonesia[1', ']', 'dan', 'bahasa', 'persatuan', 'bangsa', 'Indonesia.[2', ']']
    elif lang == 'gle':
        assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'dtrí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'háirithe', '.']
    elif lang == 'ita':
        assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
    elif lang == 'jpn':
        if word_tokenizer == 'nagisa_jpn':
            assert tokens == ['日本', '語', '(', 'にほんご', '、', 'にっぽん', 'ご', '[', '注', '2', ']', '、', '英', ':', 'Japanese', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '日本', '人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。']
        elif word_tokenizer in [
            'spacy_jpn',
            'sudachipy_jpn_split_mode_a'
        ]:
            assert tokens == ['日本', '語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注', '2', ']', '、', '英', ':', 'Japanese', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '日本', '人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。']
        elif word_tokenizer in [
            'sudachipy_jpn_split_mode_b',
            'sudachipy_jpn_split_mode_c'
        ]:
            assert tokens == ['日本語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注', '2', ']', '、', '英', ':', 'Japanese', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '日本人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。']
        elif word_tokenizer == 'wordless_jpn_kanji':
            assert tokens == ['日', '本', '語', '(', 'にほんご', '、', 'にっぽん', 'ご', '[', '注', '2', ']', '、', '英', ':', 'Japanese', ')', 'は', '、', '日', '本', '国', '内', 'や', '、', 'かつて', 'の', '日', '本', '領', 'だっ', 'た', '国', '、', 'そして', '日', '本', '人', '同', '士', 'の', '間', 'で', '使', '用', 'さ', 'れ', 'て', 'いる', '言', '語', '。']
        else:
            raise Exception(f'Error: Tests for word tokenizer "{word_tokenizer}" are skipped!')
    elif lang == 'kan':
        assert tokens == ['ದ್ರಾವಿಡ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಪ್ರಾಮುಖ್ಯವುಳ್ಳ', 'ಭಾಷೆಯೂ', 'ಭಾರತದ', 'ಪುರಾತನವಾದ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಒಂದೂ', 'ಆಗಿರುವ', 'ಕನ್ನಡ', 'ಭಾಷೆಯನ್ನು', 'ಅದರ', 'ವಿವಿಧ', 'ರೂಪಗಳಲ್ಲಿ', 'ಸುಮಾರು', '೪೫', 'ದಶಲಕ್ಷ', 'ಜನರು', 'ಆಡು', 'ನುಡಿಯಾಗಿ', 'ಬಳಸುತ್ತಲಿದ್ದಾರೆ', '.']
    elif lang == 'kir':
        assert tokens == ['Кыргыз', 'тили', '—', 'Кыргыз', 'Республикасынын', 'мамлекеттик', 'тили', ',', 'түрк', 'тилдеринин', 'курамына', ',', 'анын', 'ичинде', 'кыргыз-кыпчак', 'тобуна', 'кирет', '.']
    elif lang == 'lav':
        assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.[3', ']']
    elif lang == 'lij':
        assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', "l'", 'é', "'", 'na', 'lengoa[1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia', '-', 'Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
    elif lang == 'lit':
        assert tokens == ['Lietuvių', 'kalba', '–', 'iš', 'baltų', 'prokalbės', 'kilusi', 'lietuvių', 'tautos', 'kalba', ',', 'kuri', 'Lietuvoje', 'yra', 'valstybinė', ',', 'o', 'Europos', 'Sąjungoje', '–', 'viena', 'iš', 'oficialiųjų', 'kalbų', '.']
    elif lang == 'ltz':
        assert tokens == ["D'", 'Lëtzebuergesch', 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.']
    elif lang == 'mkd':
        assert tokens == ['Македонски', 'јазик', '—', 'јужнословенски', 'јазик', ',', 'дел', 'од', 'групата', 'на', 'словенски', 'јазици', 'од', 'јазичното', 'семејство', 'на', 'индоевропски', 'јазици', '.']
    elif lang == 'mal':
        assert tokens == ['ഇന്ത്യയിൽ', 'പ്രധാനമായും', 'കേരള', 'സംസ്ഥാനത്തിലും', 'ലക്ഷദ്വീപിലും', 'പുതുച്ചേരിയുടെ', 'ഭാഗമായ', 'മയ്യഴിയിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം.']
    elif lang == 'mar':
        assert tokens == ['मराठीभाषा', 'ही', 'इंडो', '-', 'युरोपीय', 'भाषाकुलातील', 'एक', 'भाषा', 'आहे', '.']
    elif lang == 'mni':
        assert tokens == ['ꯃꯤꯇꯩꯂꯣꯟ', 'ꯍꯥꯏꯕꯁꯤ', 'ꯏꯟꯗꯤꯌꯥ', 'ꯑꯋꯥꯡ-ꯅꯣꯡꯄꯣꯛꯇ', 'ꯂꯩꯕ', 'ꯃꯅꯤꯄꯨꯔꯗ', 'ꯃꯔꯨꯑꯣꯏꯅ', 'ꯉꯥꯡꯅꯕ', 'ꯇꯤꯕꯦꯇꯣ-ꯕꯔꯃꯟ', 'ꯀꯥꯡꯂꯨꯞꯇ', 'ꯆꯤꯡꯕ', 'ꯂꯣꯟ', 'ꯑꯃꯅꯤ', '꯫', 'ꯚꯥꯔꯠ', 'ꯂꯩꯉꯥꯛꯅꯥ', 'ꯁꯛꯈꯪꯂꯕ', 'ꯂꯣꯟ', '꯲꯲', 'ꯁꯤꯡꯒꯤ', 'ꯃꯅꯨꯡꯗ', 'ꯃꯤꯇꯩꯂꯣꯟꯁꯤꯁꯨ', 'ꯑꯃꯅꯤ', '꯫', 'ꯃꯤꯇꯩꯂꯣꯟ', 'ꯑꯁꯤ', 'ꯏꯟꯗꯤꯌꯥꯒꯤ', 'ꯁ', '꯭', 'ꯇꯦꯠ', 'ꯑꯣꯏꯔꯤꯕ', 'ꯑꯁꯥꯝ', 'ꯑꯃꯁꯨꯡ', 'ꯇ', '꯭', 'ꯔꯤꯄꯨꯔꯥ', 'ꯑꯃꯗꯤ', 'ꯑꯇꯩ', 'ꯂꯩꯕꯥꯛꯁꯤꯡꯗ', 'ꯍꯥꯏꯕꯗꯤ', 'ꯕꯥꯡꯂꯥꯗꯦꯁ', 'ꯑꯃꯁꯨꯡ', 'ꯑꯋꯥꯗꯁꯨ', 'ꯉꯥꯡꯅꯩ', '꯫', 'ꯏꯪ', 'ꯀꯨꯝꯖ', '꯲꯰꯱꯱', 'ꯒꯤ', 'ꯃꯤꯀꯣꯛ', 'ꯊꯤꯕꯗ', 'ꯃꯤꯇꯩꯂꯣꯟꯕꯨ', 'ꯏꯃꯥꯂꯣꯟ', 'ꯑꯣꯢꯅ', 'ꯉꯥꯡꯕꯒꯤ', 'ꯃꯤꯁꯤꯡ', 'ꯂꯤꯆꯥ', '꯱꯸', 'ꯃꯨꯛ', 'ꯁꯨꯢ', '꯫']
    elif lang == 'nep':
        assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो', '।']
    elif lang == 'nob':
        assert tokens == ['Bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'ori':
        assert tokens == ['ଓଡ଼ିଆ', '(', 'ଇଂରାଜୀ', 'ଭାଷାରେ', 'Odia', '/', 'əˈdiːə', '/', 'or', 'Oriya', '/', 'ɒˈriːə', '/', ',', ')', 'ଏକ', 'ଭାରତୀୟ', 'ଭାଷା', 'ଯାହା', 'ଏକ', 'ଇଣ୍ଡୋ-ଇଉରୋପୀୟ', 'ଭାଷାଗୋଷ୍ଠୀ', 'ଅନ୍ତର୍ଗତ', 'ଇଣ୍ଡୋ-ଆର୍ଯ୍ୟ', 'ଭାଷା', '।']
    elif lang == 'fas':
        assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'و', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    elif lang == 'pol':
        assert tokens == ['Język', 'polski', ',', 'polszczyzna', '–', 'język', 'lechicki', 'z', 'grupy', 'zachodniosłowiańskiej', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'kaszubski', ',', 'słowacki', 'i', 'języki', 'łużyckie', ')', ',', 'stanowiącej', 'część', 'rodziny', 'indoeuropejskiej', '.']
    elif lang in ['por_br', 'por_pt']:
        assert tokens == ['A', 'língua', 'portuguesa', ',', 'também', 'designada', 'português', ',', 'é', 'uma', 'língua', 'românica', 'flexiva', 'ocidental', 'originada', 'no', 'galego-português', 'falado', 'no', 'Reino', 'da', 'Galiza', 'e', 'no', 'norte', 'de', 'Portugal', '.']
    elif lang == 'pan':
        assert tokens == ['ਪੰਜਾਬੀ', 'ਭਾਸ਼ਾ', '/', 'pʌnˈdʒɑːbi', '/', '(', 'ਸ਼ਾਹਮੁਖੀ', ':', '\u200e', 'پنجابی', '\u200e', ')', '(', 'ਗੁਰਮੁਖੀ', ':', 'ਪੰਜਾਬੀ', ')', 'ਪੰਜਾਬ', 'ਦੀ', 'ਭਾਸ਼ਾ', ',', 'ਜਿਸ', 'ਨੂੰ', 'ਪੰਜਾਬ', 'ਖੇਤਰ', 'ਦੇ', 'ਵਸਨੀਕ', 'ਜਾਂ', 'ਸੰਬੰਧਿਤ', 'ਲੋਕ', 'ਬੋਲਦੇ', 'ਹਨ', '।', '[', '1', ']']
    elif lang == 'ron':
        assert tokens == ['Limba', 'română', 'este', 'o', 'limbă', 'indo-europeană', ',', 'din', 'grupul', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbilor', 'romanice', '.']
    elif lang == 'rus':
        assert tokens == ['Ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать)[~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
    elif lang == 'san':
        assert tokens == ['संस्कृतम्', '(', 'IPA', ':', '[', 'ˈsɐ̃skr̩tɐm', ']', '(', 'शृणु', ')', ')', 'जगतः', 'एकतमा', 'अतिप्राचीना', 'समृद्धा', 'शास्त्रीया', 'च', 'भाषा', 'वर्तते', '।']
    elif lang == 'srp_cyrl':
        assert tokens == ['Српски', 'језик', 'припада', 'словенској', 'групи', 'језика', 'породице', 'индоевропских', 'језика.[12', ']']
    elif lang == 'srp_latn':
        assert tokens == ['Srpski', 'jezik', 'pripada', 'slovenskoj', 'grupi', 'jezika', 'porodice', 'indoevropskih', 'jezika.[12', ']']
    elif lang == 'sin':
        assert tokens == ['ශ්\u200dරී', 'ලංකාවේ', 'ප්\u200dරධාන', 'ජාතිය', 'වන', 'සිංහල', 'ජනයාගේ', 'මව්', 'බස', 'සිංහල', 'වෙයි', '.']
    elif lang == 'slk':
        assert tokens == ['Slovenčina', 'patrí', 'do', 'skupiny', 'západoslovanských', 'jazykov', '(', 'spolu', 's', 'češtinou', ',', 'poľštinou', ',', 'hornou', 'a', 'dolnou', 'lužickou', 'srbčinou', 'a', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert tokens == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'je', 'združeni', 'naziv', 'za', 'uradni', 'knjižni', 'jezik', 'Slovencev', 'in', 'skupno', 'ime', 'za', 'narečja', 'in', 'govore', ',', 'ki', 'jih', 'govorijo', 'ali', 'so', 'jih', 'nekoč', 'govorili', 'Slovenci', '.']
    elif lang == 'spa':
        assert tokens == ['El', 'español', 'o', 'castellano', 'es', 'una', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablado', ',', 'perteneciente', 'a', 'la', 'familia', 'de', 'lenguas', 'indoeuropeas', '.']
    elif lang == 'swe':
        assert tokens == ['Svenska', '(', 'svenska', '(', 'info', ')', ')', 'är', 'ett', 'östnordiskt', 'språk', 'som', 'talas', 'av', 'ungefär', 'tio', 'miljoner', 'personer', 'främst', 'i', 'Sverige', 'där', 'språket', 'har', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'det', 'ena', 'nationalspråket', 'i', 'Finland', 'och', 'som', 'enda', 'officiella', 'språk', 'på', 'Åland', '.']
    elif lang == 'tgl':
        assert tokens == ['Ang', 'Wikang', 'Tagalog[2', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜅ᜔', 'ᜆᜄᜎᜓᜄ᜔', ')', ',', 'na', 'kilala', 'rin', 'sa', 'payak', 'na', 'pangalang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pangunahing', 'wika', 'ng', 'Pilipinas', 'at', 'sinasabing', 'ito', 'ang', 'de', 'facto', '(', '"', 'sa', 'katunayan', '"', ')', 'ngunit', 'hindî', 'de', 'jure', '(', '"', 'sa', 'batas', '"', ')', 'na', 'batayan', 'na', 'siyang', 'pambansang', 'Wikang', 'Filipino', '(', 'mula', '1961', 'hanggang', '1987', ':', 'Pilipino).[2', ']']
    elif lang == 'tgk':
        assert tokens == ['Забони', 'тоҷикӣ', '—', 'забоне', ',', 'ки', 'дар', 'Эрон', ':', 'форсӣ', ',', 'ва', 'дар', 'Афғонистон', 'дарӣ', 'номида', 'мешавад', ',', 'забони', 'давлатии', 'кишварҳои', 'Тоҷикистон', ',', 'Эрон', 'ва', 'Афғонистон', 'мебошад', '.']
    elif lang == 'tam':
        assert tokens == ['தமிழ்', 'மொழி', '(', 'Tamil', 'language', ')', 'தமிழர்களினதும்', ',', 'தமிழ்', 'பேசும்', 'பலரதும்', 'தாய்மொழி', 'ஆகும்', '.']
    elif lang == 'tat':
        assert tokens == ['Татар', 'теле', '—', 'татарларның', 'милли', 'теле', ',', 'Татарстанның', 'дәүләт', 'теле', ',', 'таралышы', 'буенча', 'Русиядә', 'икенче', 'тел', '.']
    elif lang == 'tel':
        assert tokens == ['ఆంధ్ర', 'ప్రదేశ్', ',', 'తెలంగాణ', 'రాష్ట్రాల', 'అధికార', 'భాష', 'తెలుగు', '.']
    elif lang == 'tdt':
        assert tokens == ['Tetun', '(', 'iha', 'portugés', ':', 'tétum', ';', 'iha', 'inglés', ':', 'Tetum', ')', 'ne', "'", 'e', 'lian', 'nasionál', 'no', 'ko-ofisiál', 'Timór', 'Lorosa', "'", 'e', 'nian', '.']
    elif lang == 'tha':
        if word_tokenizer in [
            'pythainlp_longest_matching',
            'pythainlp_max_matching_tcc',
            'pythainlp_max_matching_tcc_safe_mode',
            'pythainlp_nercut'
        ]:
            assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย']
        elif word_tokenizer == 'pythainlp_max_matching':
            assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทยกลาง', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย']
        else:
            raise Exception(f'Error: Tests for word tokenizer "{word_tokenizer}" are skipped!')
    elif lang == 'bod':
        assert tokens == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ', 'འི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ', '།']
    elif lang == 'tir':
        assert tokens == ['ትግርኛ', 'ኣብ', 'ኤርትራን', 'ኣብ', 'ሰሜናዊ', 'ኢትዮጵያን', 'ኣብ', 'ክልል', 'ትግራይ', 'ዝዝረብ', 'ሴማዊ', 'ቋንቋ', 'እዩ', '።']
    elif lang == 'tsn':
        assert tokens == ['Setswana', 'ke', 'teme', 'e', 'e', 'buiwang', 'mo', 'mafatsheng', 'a', 'Aforika', 'Borwa', ',', 'Botswana', ',', 'Namibia', 'le', 'Zimbabwe', '.']
    elif lang == 'tur':
        assert tokens == ['Türkçe', 'ya', 'da', 'Türk', 'dili', ',', 'batıda', 'Balkanlar’dan', 'başlayıp', 'doğuda', 'Hazar', 'Denizi', 'sahasına', 'kadar', 'konuşulan', 'Türkî', 'diller', 'dil', 'ailesine', 'ait', 'sondan', 'eklemeli', 'bir', 'dil.[12', ']']
    elif lang == 'ukr':
        assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національна', 'мова', 'українців', '.']
    elif lang == 'urd':
        assert tokens == ['اُردُو', 'لشکری', 'زبان[8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔']
    elif lang == 'vie':
        assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi là', 'tiếng', 'Việt Nam', '[', '5 ]', 'hay', 'Việt ngữ', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.']
    elif lang == 'yor':
        assert tokens == ['Èdè', 'Yorùbá', 'Ni', 'èdè', 'tí', 'ó', 'ṣàkójọ', 'pọ̀', 'gbogbo', 'kú', 'oótu', 'o', '-', 'ò', '-', 'jíire', 'bí', ',', 'níapá', 'ìwọ̀', 'Oòrùn', 'ilẹ̀', 'Nàìjíríà', ',', 'tí', 'a', 'bá', 'wo', 'èdè', 'Yorùbá', ',', 'àwọn', 'onímọ̀', 'pín', 'èdè', 'náà', 'sábẹ́', 'ẹ̀yà', 'Kwa', 'nínú', 'ẹbí', 'èdè', 'Niger', '-', 'Congo', '.']
    else:
        raise Exception(f'Error: Tests for language "{lang}" are skipped!')
Example #7
def wl_pos_tag_text(main, text, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(text)
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = nagisa.tagging(text)
        tokens_tagged = zip(tokens_tagged.words, tokens_tagged.postags)
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [(token.surface(), '-'.join([
            pos for pos in token.part_of_speech()[:4] if pos != '*'
        ])) for token in main.sudachipy_word_tokenizer.tokenize(text)]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger.startswith('pythainlp_'):
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)

        if pos_tagger == 'pythainlp_perceptron_lst20':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='lst20')
        elif pos_tagger == 'pythainlp_perceptron_orchid':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='orchid')
        elif pos_tagger == 'pythainlp_perceptron_pud':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(text)

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [(str(token).strip(), tag) for token, tag in tokens_tagged
                     if str(token).strip()]

    return tokens_tagged
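# A minimal sketch of the nltk_perceptron branch above, assuming the
# averaged_perceptron_tagger model has already been downloaded:
import nltk

# nltk.download('averaged_perceptron_tagger')

tokens = ['English', 'is', 'a', 'West', 'Germanic', 'language', '.']
# Returns a list of (token, tag) pairs, e.g. ('is', 'VBZ')
print(nltk.pos_tag(tokens, lang = 'eng'))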
Example #8
def test_syl_tokenize(lang, syl_tokenizer):
    lang_text = wl_conversion.to_lang_text(main, lang)

    print(f'{lang_text} ({lang}) / {syl_tokenizer}:')

    # Untokenized
    syls = wl_syl_tokenization.wl_syl_tokenize(main,
                                               inputs=getattr(
                                                   wl_test_lang_examples,
                                                   f'SENTENCE_{lang.upper()}'),
                                               lang=lang,
                                               syl_tokenizer=syl_tokenizer)

    # Tokenized
    tokens = wl_word_tokenization.wl_word_tokenize_flat(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)
    syls_tokenized = wl_syl_tokenization.wl_syl_tokenize(
        main, inputs=tokens, lang=lang, syl_tokenizer=syl_tokenizer)

    syls_long_text = wl_syl_tokenization.wl_syl_tokenize(
        main,
        inputs=''.join([f'{i}\n' for i in range(101)]),
        lang=lang,
        syl_tokenizer=syl_tokenizer)
    syls_long_text_tokenized = wl_syl_tokenization.wl_syl_tokenize(
        main,
        inputs=[str(i) for i in range(101) for j in range(50)],
        lang=lang,
        syl_tokenizer=syl_tokenizer)

    print(syls)

    # Check for missing syllables
    assert all(True for syls_token in syls if all(syls_token))

    # The count of syllables should be more than 1
    assert sum([len(syls_token) for syls_token in syls]) > 1

    # Tokenization should not be modified
    assert len(tokens) == len(syls_tokenized)

    # Test long texts
    assert syls_long_text == [[str(i)] for i in range(101)]
    assert syls_long_text_tokenized == [[str(i)] for i in range(101)
                                        for j in range(50)]

    if lang == 'afr':
        assert syls == [['Afri', 'kaans'], ['is'], ['ti', 'po', 'lo', 'gies'],
                        ['be', 'skou'], ["'"], ['n'], ['In', 'do'], ['', ''],
                        ['Eu', 'ro', 'pe', 'se'], [','], ['Wes'], ['', ''],
                        ['Ger', 'maan', 'se'], [','],
                        ['Ne', 'derfran', 'kie', 'se'], ['taal', ',[2'], [']'],
                        ['wat'], ['aan'], ['die'], ['suid', 'punt'], ['van'],
                        ['Afri', 'ka'], ['on', 'der'], ['in', 'vloed'],
                        ['van'], ['ver', 'skeie'], ['an', 'der'], ['ta', 'le'],
                        ['en'], ['taal', 'groe', 'pe'], ['ont', 'staan'],
                        ['het'], ['.']]
    elif lang == 'sqi':
        assert syls == [['Gju', 'ha'], ['shqi', 'pe'], ['('], ['ose'],
                        ['thje', 'sh', 'të'], ['shqi', 'p', 'ja'], [')'],
                        ['ësh', 'të'], ['gju', 'hë'], ['dhe'], ['de', 'gë'],
                        ['e'], ['ve', 'ça', 'n', 'të'], ['e'],
                        ['fa', 'mi', 'l', 'jes'], ['in', 'do'], ['', ''],
                        ['ev', 'ro', 'pi', 'ane'], ['të'], ['fo', 'lur'],
                        ['nga'], ['më'], ['shu', 'më'], ['se'], ['6'],
                        ['mi', 'li', 'onë'], ['nje', 'rëz[4'], [']'], [','],
                        ['kry', 'esisht'], ['në'], ['Shqi', 'pë', 'ri'], [','],
                        ['Ko', 'so', 'vë'], ['dhe'],
                        ['Re', 'pu', 'b', 'li', 'kën'], ['e'],
                        ['Ma', 'qe', 'do', 'ni',
                         'së'], [','], ['por'], ['edhe'], ['në'], ['zo', 'na'],
                        ['të'], ['tje', 'ra'], ['të'], ['Ev', 'ro', 'pës'],
                        ['Ju', 'go', 're'], ['ku'], ['ka'], ['një'],
                        ['po', 'pu', 'll', 'si'], ['shqi', 'p', 'ta', 're'],
                        [','], ['du', 'ke'], ['pë', 'r', 'f', 'shi', 'rë'],
                        ['Ma', 'lin'], ['e'], ['Zi'], ['dhe'],
                        ['Lu', 'gi', 'nën'], ['e'], ['Pre', 'she', 'vës'],
                        ['.']]
    elif lang == 'bel':
        assert syls == [['Бе', 'ла', 'ру́с', 'кая'], ['мо́', 'ва'], ['—'],
                        ['на', 'цы', 'я', 'на', 'ль', 'ная'], ['мо', 'ва'],
                        ['бе', 'ла', 'ру', 'саў'], [','],
                        ['ува', 'хо', 'дзіць'], ['у'],
                        ['ін', 'да', 'еў', 'ра', 'пей', 'с', 'кую'],
                        ['моў', 'ную'], ['ся', "м'ю"], [','],
                        ['сла', 'вя', 'н', 'с', 'кую'], ['гру', 'пу'], [','],
                        ['ус', 'хо', 'д', 'не', 'с', 'ла', 'вя', 'н', 'с', 'кую'],
                        ['па', 'д', 'г', 'ру', 'пу'], ['.']]
    elif lang == 'bul':
        assert syls == [['Бъ', '̀л', 'гар', 'с', 'ки', 'ят'], ['ез', 'ѝк'],
                        ['е'], ['ин', 'до', 'ев', 'ро', 'пейс', 'ки'],
                        ['език'], ['от'], ['гру', 'па', 'та'], ['на'],
                        ['юж', 'нос', 'ла', 'вян', 'с', 'ки', 'те'],
                        ['ези', 'ци'], ['.']]
    elif lang == 'hrv':
        assert syls == [['Hr', 'vat', 'ski'], ['je', 'zik'], ['('], ['ISO'],
                        ['639'], ['', ''], ['3'], [':'], ['hrv'], [')'],
                        ['skup', 'ni'], ['je'], ['na', 'ziv'], ['za'],
                        ['na', 'ci', 'onal', 'ni'], ['stan', 'dard', 'ni'],
                        ['je', 'zik'], ['Hr',
                                        'va', 'ta'], [','], ['te'], ['za'],
                        ['skup'], ['na', 'rje', 'čja'], ['i'],
                        ['go', 'vo', 'ra'], ['ko', 'ji', 'ma'],
                        ['go', 'vo', 're'], ['ili'], ['su'],
                        ['ne', 'ka', 'da'], ['go', 'vo', 'ri', 'li'],
                        ['Hr', 'va', 'ti'], ['.']]
    elif lang == 'ces':
        assert syls == [['Češ', 'ti', 'na'], ['ne', 'bo', 'li'], ['čes', 'ký'],
                        ['ja', 'zyk'], ['je'],
                        ['zá', 'pa', 'doslo', 'van', 'ský'], ['ja', 'zyk'],
                        [','], ['nej', 'bliž', 'ší'],
                        ['slo', 'ven', 'šti', 'ně'], [','], ['po', 'té'],
                        ['lužic', 'ké'], ['srbšti', 'ně'], ['a'],
                        ['pol', 'šti', 'ně'], ['.']]
    elif lang == 'dan':
        assert syls == [['Dansk'], ['er'], ['et'], ['nord', 'ger', 'mansk'],
                        ['sprog'], ['af'], ['den'],
                        ['øst', 'n', 'or', 'di', 'ske'], ['('],
                        ['kon', 'ti', 'nen', 'tale'], [')'], ['grup', 'pe'],
                        [','], ['der'], ['ta', 'les'], ['af'], ['ca.'],
                        ['seks'], ['mil', 'li', 'o', 'ner'],
                        ['men', 'ne', 'sker'], ['.']]
    elif lang == 'nld':
        assert syls == [['Het'], ['Ne', 'der', 'lands'], ['is'], ['een'],
                        ['Wes', 't', 'Ger', 'maan', 'se'], ['taal'], ['en'],
                        ['de'], ['of', 'fi', 'ci', 'ë', 'le'], ['taal'],
                        ['van'], ['Ne', 'der', 'land'], [','],
                        ['Su', 'ri', 'na', 'me'], ['en'], ['een'], ['van'],
                        ['de'], ['drie'], ['of', 'fi', 'ci', 'ë', 'le'],
                        ['ta', 'len'], ['van'], ['Bel', 'gië'], ['.']]
    elif lang == 'eng_gb':
        assert syls == [['Eng', 'lish'], ['is'], ['a'], ['West'],
                        ['Ger', 'man', 'ic'], ['lan', 'guage'], ['of'],
                        ['the'], ['Indo'], ['', ''], ['European'],
                        ['lan', 'guage'], ['fam', 'ily'], [','],
                        ['ori', 'gin', 'ally'], ['spoken'], ['by'], ['the'],
                        ['in', 'hab', 'it', 'ants'], ['of'], ['early'],
                        ['me', 'di', 'ev', 'al'], ['Eng', 'land.[3][4][5'],
                        [']']]
    elif lang == 'eng_us':
        assert syls == [['Eng', 'lish'], ['is'], ['a'], ['West'],
                        ['Ger', 'man', 'ic'], ['lan', 'guage'], ['of'],
                        ['the'], ['In', 'do'], ['', ''], ['Eu', 'ro', 'pean'],
                        ['lan', 'guage'], ['fam', 'i', 'ly'], [','],
                        ['orig', 'i', 'nal', 'ly'], ['spo', 'ken'], ['by'],
                        ['the'], ['in', 'hab', 'i', 'tants'], ['of'],
                        ['ear', 'ly'], ['me', 'dieval'],
                        ['Eng', 'land.[3][4][5'], [']']]
    elif lang == 'epo':
        assert syls == [['Es', 'pe', 'r', 'anto'], [','], ['ori', 'gi', 'ne'],
                        ['la'], ['Lin', 'g', 'vo'],
                        ['In', 'ter', 'na', 'ci', 'a', ',[4'], [']'],
                        ['es', 'tas'], ['la'], ['plej'],
                        ['dis', 'vas', 't', 'iĝ', 'inta'],
                        ['in', 'ter', 'na', 'cia'],
                        ['plan', 'lin', 'g', 'vo', '.[5'], [']']]
    elif lang == 'est':
        assert syls == [['Ees', 'ti'], ['keel'], ['('], ['vara', 'sem'],
                        ['ni', 'me', 'tus'], ['maa', 'keel'], [')'], ['on'],
                        ['lää', 'ne', 'me', 're', 'soo', 'me'],
                        ['lõu', 'na', 'rüh', 'ma'], ['kuu', 'luv'], ['keel'],
                        ['.']]
    elif lang == 'fra':
        assert syls == [['Le'], ['fran', 'çais'], ['est'], ['une'], ['langue'],
                        ['in', 'do', 'eu', 'ro', 'péenne'], ['de'], ['la'],
                        ['fa', 'mille'], ['des'], ['langues'], ['ro', 'manes'],
                        ['dont'], ['les'], ['lo', 'cu', 'teurs'], ['sont'],
                        ['ap', 'pe', 'lés'], ['fran', 'co', 'phones'], ['.']]
    elif lang == 'glg':
        assert syls == [['O'], ['ga', 'le', 'go'], ['('], ['['], ['ɡaˈleɣo̝'],
                        [']'], [')'], ['é'], ['unha'], ['lin', 'gua'],
                        ['in', 'do', 'eu', 'ro', 'pea'], ['que'],
                        ['per', 'ten', 'ce'], ['á'], ['póla'], ['de'],
                        ['lin', 'guas'], ['ro', 'má', 'ni', 'cas'], ['.']]
    elif lang in ['deu_at', 'deu_de', 'deu_ch']:
        assert syls == [['Die'], ['deut', 'sche'], ['Spra', 'che'], ['bzw.'],
                        ['das'], ['Deut', 'sche'], ['('], ['['],
                        ['dɔɪ̯tʃ];[26'], [']'], ['ab', 'ge', 'kürzt'], ['dt'],
                        ['.'], ['oder'], ['dtsch'], ['.'], [')'], ['ist'],
                        ['ei', 'ne'], ['west', 'ger', 'ma', 'ni', 'sche'],
                        ['Spra', 'che'], [','], ['die'], ['welt', 'weit'],
                        ['et', 'wa'], ['90'], ['bis'], ['105'],
                        ['Mil', 'li', 'o', 'nen'], ['Men', 'schen'], ['als'],
                        ['Mut', 'ter', 'spra', 'che'], ['und'],
                        ['wei', 'te', 'ren'], ['rund'], ['80'],
                        ['Mil', 'li', 'o', 'nen'], ['als'], ['Zweit', ''],
                        ['oder'], ['Fremd', 'spra', 'che'], ['dient'], ['.']]
    elif lang == 'ell':
        assert syls == [['Η'], ['ελ', 'λη', 'νι', 'κή'], ['γλώσ', 'σα'],
                        ['ανή', 'κει'], ['στην'],
                        ['ιν', 'δο', 'ευ', 'ρω', 'παϊκή'],
                        ['οι', 'κο', 'γένεια', '[10'], [']'], ['και'],
                        ['απο', 'τε', 'λεί'], ['το'], ['μο', 'να', 'δι', 'κό'],
                        ['μέλος'], ['του'], ['ελ', 'λη', 'νι', 'κού'],
                        ['κλάδου'], [','], ['ενώ'], ['εί', 'ναι'], ['η'],
                        ['επί', 'ση', 'μη'], ['γλώσ', 'σα'], ['της'],
                        ['Ελ', 'λάδος'], ['και'], ['της'], ['Κύ', 'πρου'],
                        ['.']]
    elif lang == 'hun':
        assert syls == [['A'], ['ma', 'gyar'], ['nyelv'], ['az'],
                        ['urá', 'li'], ['nyelv', 'csa', 'lád'], ['tag', 'ja'],
                        [','], ['a'], ['finn', 'ugor'], ['nyel', 'vek'],
                        ['kö', 'zé'], ['tar', 'to', 'zó'], ['ugor'],
                        ['nyel', 'vek'], ['egyi', 'ke'], ['.']]
    elif lang == 'isl':
        assert syls == [['Ís', 'lenska'],
                        ['er'], ['vest', 'ur', 'nor', 'rænt'], [','],
                        ['germ', 'anskt'], ['og'], ['indó', 'evr', 'ópskt'],
                        ['tungu', 'mál'], ['sem'], ['er'], ['eink', 'um'],
                        ['tal', 'að'], ['og'], ['rit', 'að'], ['á'],
                        ['Ís', 'landi'], ['og'], ['er'], ['móð', 'ur', 'mál'],
                        ['lang', 'flestra'], ['Ís', 'lend', 'inga'], ['.'],
                        ['['], ['4'], [']']]
    elif lang == 'ind':
        assert syls == [['Ba', 'ha', 'sa'], ['In', 'do', 'ne', 'sia'],
                        ['ada', 'lah'], ['ba', 'ha', 'sa'], ['Me', 'la', 'yu'],
                        ['ba', 'ku'], ['yang'], ['di', 'ja', 'di', 'kan'],
                        ['se', 'ba', 'gai'], ['ba', 'ha', 'sa'], ['res', 'mi'],
                        ['Re', 'pub', 'lik'], ['In', 'do', 'ne', 'si', 'a[1'],
                        [']'], ['dan'], ['ba', 'ha', 'sa'],
                        ['per', 'sa', 'tu', 'an'], ['bang', 'sa'],
                        ['In', 'do', 'ne', 'si', 'a.[2'], [']']]
    elif lang == 'ita':
        assert syls == [["L'"], ['ita', 'lia', 'no'], ['('], ['['],
                        ['itaˈ', 'l', 'jaː', 'no][', 'No', 'ta'], ['1'], [']'],
                        ['ascol', 'ta[?·in', 'fo'], [']'],
                        [')'], ['è'], ['una'], ['lin', 'gua'],
                        ['ro', 'man', 'za'], ['par', 'la', 'ta'],
                        ['prin', 'ci', 'pal', 'men', 'te'], ['in'],
                        ['Ita', 'lia'], ['.']]
    elif lang == 'lit':
        assert syls == [['Lie', 'tu', 'vių'], ['kal', 'ba'], ['–'], ['iš'],
                        ['bal', 'tų'], ['pro', 'kal',
                                        'bės'], ['ki', 'lu', 'si'],
                        ['lie', 'tu', 'vių'], ['tau', 'tos'], ['kal', 'ba'],
                        [','], ['ku', 'ri'], ['Lie', 'tu', 'vo', 'je'],
                        ['yra'], ['vals', 'ty', 'bi', 'nė'], [','], ['o'],
                        ['Eu', 'ro', 'pos'], ['Są', 'jun', 'go', 'je'], ['–'],
                        ['vie', 'na'], ['iš'], ['ofi', 'cia', 'lių', 'jų'],
                        ['kal', 'bų'], ['.']]
    elif lang == 'lav':
        assert syls == [['Lat', 'vie', 'šu'], ['va', 'lo', 'da'], ['ir'],
                        ['dzim', 'tā'], ['va', 'lo',
                                         'da'], ['ap', 'mē', 'ram'], ['1,7'],
                        ['mil', 'jo', 'niem'], ['cil', 'vē', 'ku'], [','],
                        ['gal', 've', 'no', 'kārt'], ['Lat', 'vi', 'jā'],
                        [','], ['kur'], ['tā'], ['ir'], ['vien', 'ī', 'gā'],
                        ['valsts'], ['va', 'lo', 'da.[3'], [']']]
    elif lang == 'mon':
        assert syls == [['Мон', 'гол'], ['хэл'], ['нь'], ['Мон', 'гол'],
                        ['ул', 'сын'], ['ал', 'бан'], ['ёс', 'ны'], ['хэл'],
                        ['юм'], ['.']]
    elif lang == 'nob':
        assert syls == [['Bok', 'mål'], ['er'], ['en'], ['va', 'rie', 'tet'],
                        ['av'], ['norsk'], ['språk'], ['.']]
    elif lang == 'nno':
        assert syls == [['Ny', 'norsk'], [','], ['før'], ['1929'],
                        ['of', 'fi', 'si', 'elt'], ['kal', 'la'],
                        ['lands', 'mål'], [','], ['er'], ['si', 'dan'],
                        ['jam', 'stil', 'lings', 'ved', 'ta', 'ket'], ['av'],
                        ['12'], ['.'], ['mai'], ['1885'], ['ei'], ['av'],
                        ['dei'], ['to'], ['of', 'fi', 'si', 'el', 'le'],
                        ['mål', 'for', 'me', 'ne'], ['av'], ['norsk'], [';'],
                        ['den'], ['and', 're'], ['for', 'ma'], ['er'],
                        ['bok', 'mål'], ['.']]
    elif lang == 'pol':
        assert syls == [['Ję', 'zyk'], ['pol', 'ski'], [','],
                        ['pol', 'sz', 'czy', 'zna'], ['–'], ['ję', 'zyk'],
                        ['le', 'chic', 'ki'], ['z'], ['gru', 'py'],
                        ['za', 'chod', 'nio', 'sło', 'wiań', 'skiej'], ['('],
                        ['do'], ['któ', 'rej'], ['na', 'le', 'żą'],
                        ['rów', 'nież'], ['cze', 'ski'], [','],
                        ['ka', 'szub', 'ski'], [','], ['sło', 'wac', 'ki'],
                        ['i'], ['ję', 'zy', 'ki'], ['łu', 'życ', 'kie'], [')'],
                        [','], ['sta', 'no', 'wią', 'cej'], ['część'],
                        ['ro', 'dzi', 'ny'],
                        ['in', 'do', 'eu', 'ro', 'pej', 'skiej'], ['.']]
    elif lang in ['por_br', 'por_pt']:
        assert syls == [['A'], ['lín', 'gua'], ['por', 'tu',
                                                'gue', 'sa'], [','],
                        ['tam', 'bém'], ['de', 'sig', 'na', 'da'],
                        ['por', 'tu', 'guês'], [','], ['é'], ['uma'],
                        ['lín', 'gua'], ['ro', 'mâ', 'ni', 'ca'],
                        ['fle', 'xi', 'va'], ['oci', 'den', 'tal'],
                        ['ori', 'gi', 'na', 'da'], ['no'],
                        ['ga', 'le', 'go', 'por', 'tu', 'guês'],
                        ['fa', 'la', 'do'], ['no'], ['Rei', 'no'], ['da'],
                        ['Ga', 'li', 'za'], ['e'], ['no'], ['nor', 'te'],
                        ['de'], ['Por', 'tu', 'gal'], ['.']]
    elif lang == 'ron':
        assert syls == [['Lim', 'ba'], ['ro', 'mâ', 'nă'], ['es', 'te'], ['o'],
                        ['lim', 'bă'],
                        ['in', 'do', 'e', 'u', 'ro', 'pe', 'a', 'nă'], [','],
                        ['din'], ['gru', 'pul'], ['ita',
                                                  'lic'], ['și'], ['din'],
                        ['sub', 'gru', 'pul'], ['orien', 'tal'], ['al'],
                        ['lim', 'bi', 'lor'], ['ro', 'ma', 'ni', 'ce'], ['.']]
    elif lang == 'rus':
        assert syls == [['Ру́с',
                         'ский'], ['язы́к'], ['('], ['['], ['ˈruskʲɪi̯'],
                        ['jɪˈzɨk'], [']'], ['Ин', 'фор', 'ма', 'ция'], ['о'],
                        ['фай', 'ле'], ['слу', 'шать)[~'], ['3'], [']'], ['['],
                        ['⇨'], [']'], ['—'], ['один'], ['из'],
                        ['вос', 'точ', 'но', 'сла', 'вян', 'ских'],
                        ['язы', 'ков'], [','],
                        ['на', 'ци', 'о', 'наль', 'ный'], ['язык'],
                        ['рус', 'ско', 'го'], ['на', 'ро', 'да'], ['.']]
    elif lang == 'srp_cyrl':
        assert syls == [['Срп', 'ски'], ['је', 'зик'], ['при', 'па', 'да'],
                        ['сло', 'вен', 'ској'], ['гру', 'пи'],
                        ['је', 'зи', 'ка'], ['по', 'ро', 'ди', 'це'],
                        ['ин', 'до', 'е', 'вроп', 'ских'],
                        ['је', 'зи', 'ка', '.[12'], [']']]
    elif lang == 'srp_latn':
        assert syls == [['Srpski'], ['jezik'], ['pripada'], ['slovenskoj'],
                        ['grupi'], ['jezika'], ['porodice'], ['indoevropskih'],
                        ['jezika.[12'], [']']]
    elif lang == 'slk':
        assert syls == [['Slo', 'ven', 'či', 'na'], ['pat', 'rí'], ['do'],
                        ['sku', 'pi', 'ny'],
                        ['zá', 'pa', 'do', 'slo', 'van', 'ských'],
                        ['ja', 'zy', 'kov'], ['('], ['spo', 'lu'], ['s'],
                        ['češ', 'ti', 'nou'], [','], ['poľš', 'ti', 'nou'],
                        [','], ['hor', 'nou'], ['a'], ['dol', 'nou'],
                        ['lu', 'žic', 'kou'], ['srb', 'či', 'nou'], ['a'],
                        ['ka', 'šub', 'či', 'nou'], [')'], ['.']]
    elif lang == 'slv':
        assert syls == [['Slo', 'ven', 'šči', 'na'], ['['],
                        ['slo', 'vén', 'šči', 'na'], [']'], ['/'], ['['],
                        ['slo', 'ˈʋe', 'nʃtʃi', 'na'], [']'], ['je'],
                        ['zdru', 'že', 'ni'], ['na', 'ziv'], ['za'],
                        ['ura', 'dni'], ['knji', 'žni'], ['je', 'zik'],
                        ['Slo', 'ven', 'cev'], ['in'], ['sku', 'pno'], ['ime'],
                        ['za'], ['na', 're', 'čja'], ['in'],
                        ['go', 'vo', 're'], [','], ['ki'], ['jih'],
                        ['go', 'vo', 'ri', 'jo'], ['ali'], ['so'], ['jih'],
                        ['ne', 'koč'], ['go', 'vo', 'ri', 'li'],
                        ['Slo', 'ven', 'ci'], ['.']]
    elif lang == 'spa':
        assert syls == [['El'], ['es', 'pa', 'ñol'], ['o'],
                        ['cas', 'te', 'llano'], ['es'], ['una'],
                        ['len', 'gua'], ['ro', 'man', 'ce'],
                        ['pro', 'ce', 'den', 'te'], ['del'], ['la', 'tín'],
                        ['ha', 'bla', 'do'], [','],
                        ['per', 'te', 'ne', 'cien', 'te'], ['a'], ['la'],
                        ['fa', 'mi', 'lia'], ['de'], ['len', 'guas'],
                        ['in', 'doeu', 'ro', 'peas'], ['.']]
    elif lang == 'swe':
        assert syls == [['Svens', 'ka'], ['('], ['svens', 'ka'], ['('],
                        ['in', 'fo'], [')'], [')'], ['är'], ['ett'],
                        ['öst', 'nor', 'diskt'], ['språk'], ['som'],
                        ['ta', 'las'], ['av'], ['un', 'ge', 'fär'], ['tio'],
                        ['mil', 'jo', 'ner'], ['per', 'so', 'ner'], ['främst'],
                        ['i'], ['Sve', 'ri', 'ge'], ['där'], ['språ', 'ket'],
                        ['har'], ['en'], ['do', 'mi', 'nant'],
                        ['ställ', 'ning'], ['som'], ['hu', 'vud', 'språk'],
                        [','], ['men'], ['även'], ['som'], ['det'], ['ena'],
                        ['na', 'tio', 'nal', 'språ', 'ket'], ['i'],
                        ['Fin', 'land'], ['och'], ['som'], ['en', 'da'],
                        ['of', 'fi', 'ci', 'el', 'la'], ['språk'], ['på'],
                        ['Åland'], ['.']]
    elif lang == 'tel':
        assert syls == [['ఆం', 'ధ్ర'], ['ప్ర', 'దే', 'శ్'], [','],
                        ['తె', 'లం', 'గాణ'], ['రా', 'ష్ట్రాల'], ['అధి', 'కార'],
                        ['భాష'], ['తె', 'లు', 'గు'], ['.']]
    elif lang == 'tha':
        assert syls == [['ภา', 'ษา', 'ไทย'], ['หรือ'], ['ภา', 'ษา', 'ไทย'],
                        ['กลาง'], ['เป็น'], ['ภา', 'ษา', 'ราช', 'การ'],
                        ['และ'], ['ภา', 'ษา', 'ประ', 'จำ', 'ชาติ'], ['ของ'],
                        ['ประ', 'เทศ'], ['ไทย']]
    elif lang == 'ukr':
        assert syls == [['Укра', 'ї', '́', 'н', 'сь', 'ка'],
                        ['мо', '́', 'ва'], ['('], ['МФА'], [':'], ['['],
                        ['ukrɑ̽ˈjɪnʲsʲkɑ̽'], ['ˈmɔwɑ̽'],
                        [']'], [','], ['іс', 'то', 'ри', 'ч', 'ні'],
                        ['на', 'зви'], ['—'], ['ру', '́', 'сь', 'ка'], [','],
                        ['ру', 'си', '́', 'н', 'сь', 'ка', '[9][10][11'],
                        [']'], ['['], ['*'], ['2'], [']'], [')'], ['—'],
                        ['на', 'ціо', 'на', 'ль', 'на'], ['мо', 'ва'],
                        ['укра', 'ї', 'н', 'ців'], ['.']]
    elif lang == 'zul':
        assert syls == [['Zu', 'lu'], ['/ˈzu', 'ːlu', 'ː/'], [','],
                        ['no', 'ma'],
                        ['isi', 'Zu', 'lu'], ['wu', 'li', 'mi'],
                        ['lwa', 'ba', 'ntu'], ['ba', 'se'],
                        ['Ni', 'ngi', 'zi', 'mu'], ['neA', 'fri', 'ka'],
                        ['aba', 'yi', 'ngxe', 'nye'],
                        ['ya', 'ma', 'Ngu', 'ni'], ['.']]
    else:
        raise Exception(f'Error: Tests for language "{lang}" are skipped!')
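
For reference, many of the syllable splits asserted above come from hyphenation dictionaries. Below is a minimal, self-contained sketch of hyphenation-based syllabification with Pyphen; the Polish splits in the comments are the ones asserted above, though the tokenizer actually used per language in these tests is configurable, so this is illustrative rather than a guarantee for every Pyphen version.

import pyphen

# Polish hyphenation dictionary
dic = pyphen.Pyphen(lang = 'pl_PL')

for token in ['polszczyzna', 'indoeuropejskiej']:
    # Pyphen marks legal hyphenation points with '-' by default;
    # splitting on them yields the syllable segments
    print(dic.inserted(token).split('-'))

# Expected per the assertions above:
# ['pol', 'sz', 'czy', 'zna']
# ['in', 'do', 'eu', 'ro', 'pej', 'skiej']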
Example #9
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
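        # Model names keep the script suffix for Serbian (srp_cyrl / srp_latn);
        # other regional suffixes (e.g. eng_gb, eng_us) are stripped below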
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [
            token.dictionary_form()
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
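        # Only Russian and Ukrainian analyzers are supported here; any other
        # language code would leave morphological_analyzer unbound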
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        # Tokenize before the language code is remapped to ISO 639-1 for the
        # list's file name, so the language-specific word tokenizer is still selected
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    # Each line maps a word form to its lemma, separated by a tab
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    # Skip malformed lines
                    pass

        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace around each lemma
    lemmas = [
        str(lemma).strip()
        for lemma in lemmas
        if str(lemma).strip()
    ]

    return lemmas
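
The English branch above maps universal POS tags onto WordNet's POS constants so that the WordNet lemmatizer picks the right paradigm. Here is a minimal, self-contained sketch of that mapping using NLTK directly (assuming the 'punkt', 'averaged_perceptron_tagger', 'universal_tagset', and 'wordnet' NLTK data are installed; the helper name lemmatize_eng is illustrative, not part of Wordless):

import nltk
from nltk.corpus import wordnet

# Universal POS tags -> WordNet POS constants, mirroring the branch above
POS_MAP = {
    'ADJ': wordnet.ADJ,
    'NOUN': wordnet.NOUN,
    'PROPN': wordnet.NOUN,
    'ADV': wordnet.ADV,
    'VERB': wordnet.VERB,
    'AUX': wordnet.VERB
}

def lemmatize_eng(text):
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)

    return [
        # Fall back to WordNet's default POS (noun) for unmapped tags
        lemmatizer.lemmatize(token, pos = POS_MAP[pos]) if pos in POS_MAP else lemmatizer.lemmatize(token)
        for token, pos in nltk.pos_tag(tokens, tagset = 'universal')
    ]

print(lemmatize_eng('English was originally spoken by the inhabitants of early medieval England.'))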
Example #10
def test_lemmatize(lang, lemmatizer):
    lang_text = wl_conversion.to_lang_text(main, lang)

    print(f'{lang_text} ({lang}) / {lemmatizer}:')

    # Untokenized
    lemmas = wl_lemmatization.wl_lemmatize(
        main,
        inputs = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang,
        lemmatizer = lemmatizer
    )

    # Tokenized
    tokens = wl_word_tokenization.wl_word_tokenize_flat(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang
    )
    lemmas_tokenized = wl_lemmatization.wl_lemmatize(
        main,
        inputs = tokens,
        lang = lang,
        lemmatizer = lemmatizer
    )

    lemmas_long_text = wl_lemmatization.wl_lemmatize(
        main,
        inputs = ''.join([f'{i}\n' for i in range(101)]),
        lang = lang,
        lemmatizer = lemmatizer
    )
    lemmas_long_text_tokenized = wl_lemmatization.wl_lemmatize(
        main,
        inputs = [str(i) for i in range(101) for _ in range(50)],
        lang = lang,
        lemmatizer = lemmatizer
    )

    print(lemmas)

    # Check for missing lemmas
    assert all(lemmas)
    assert all(lemmas_tokenized)

    # Tokenization should not be modified
    assert len(tokens) == len(lemmas_tokenized)

    # Test long texts
    if lemmatizer == 'botok_bod':
        assert lemmas_long_text == ['\n'.join([str(i) for i in range(100)]), '100']
    else:
        assert lemmas_long_text == [str(i) for i in range(101)]
    assert lemmas_long_text_tokenized == [str(i) for i in range(101) for _ in range(50)]

    if lang == 'ast':
        assert lemmas == ["L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu', "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.']
    elif lang == 'ben':
        assert lemmas == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।']
    elif lang == 'bul':
        assert lemmas == ['Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянските', 'език', '.']
    elif lang == 'cat':
        if lemmatizer == 'lemmatization_lists_cat':
            assert lemmas == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'ell', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', "d'algunes", 'comarca', 'i', 'localitat', 'de', "l'interior", ')', ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell', 'Franja', 'de', 'Ponent', '(', 'a', "l'Aragó", ')', ',', 'ell', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']', 'i', 'en', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', "l'Argentina", ',', 'amb', '198.000', 'parlants).[11', ']']
        elif lemmatizer == 'spacy_cat':
            assert lemmas == ['el', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'el', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'el', 'ciutat', 'de', 'el', 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'a', 'el', 'País', 'Valencià', 'i', 'tradicional', 'a', 'el', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlat', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', 'de', 'algun', 'comarca', 'i', 'localitat', 'de', 'el', 'interior', ')', ',', 'el', 'Illes', 'Balears', ',', 'Andorra', ',', 'el', 'Franja', 'de', 'Ponent', '(', 'a', 'el', 'Aragó', ')', ',', 'el', 'ciutat', 'de', 'el', 'Alguer', '(', 'a', 'el', 'illa', 'de', 'Sardenya', ')', ',', 'el', 'Catalunya', 'de', 'el', 'Nord,[8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrat', 'valencians),[9][10', ']', 'i', 'en', 'comunitat', 'arreu', 'de', 'el', 'món', '(', 'entre', 'el', 'qual', 'destacar', 'el', 'de', 'el', 'Argentina', ',', 'amb', '198.000', 'parlants).[11', ']']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'hrv':
        assert lemmas == ['Hrvatski', 'jezik', '(', 'ISO', '639', '-', '3', ':', 'hrv', ')', 'skupni', 'biti', 'naziv', 'za', 'nacionalan', 'standardan', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječje', 'i', 'govor', 'koji', 'govoriti', 'ili', 'biti', 'nekada', 'govoriti', 'Hrvati', '.']
    elif lang == 'ces':
        assert lemmas == ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'dan':
        assert lemmas == ['Dansk', 'være', 'en', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinental', ')', 'gruppe', ',', 'der', 'tale', 'af', 'ca.', 'seks', 'million', 'menneske', '.']
    elif lang == 'nld':
        assert lemmas == ['het', 'nederlands', 'is', 'een', 'west-germaans', 'taal', 'en', 'de', 'officieel', 'taal', 'van', 'nederland', ',', 'suriname', 'en', 'e', 'van', 'de', 'drie', 'officieel', 'taal', 'van', 'belgië', '.']
    elif lang in ['eng_gb', 'eng_us']:
        if lemmatizer in ['lemmatization_lists_eng', 'nltk_wordnet']:
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'of', 'the', 'Indo', '-', 'European', 'language', 'family', ',', 'originally', 'speak', 'by', 'the', 'inhabitant', 'of', 'early', 'medieval', 'England.[3][4][5', ']']
        elif lemmatizer == 'spacy_eng':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'of', 'the', 'Indo', '-', 'european', 'language', 'family', ',', 'originally', 'speak', 'by', 'the', 'inhabitant', 'of', 'early', 'medieval', 'England.[3][4][5', ']']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'est':
        assert lemmas == ['Eesti', 'kee', '(', 'varasem', 'nimetu', 'maakeel', ')', 'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.']
    elif lang == 'fra':
        if lemmatizer == 'lemmatization_lists_fra':
            assert lemmas == ['Le', 'français', 'être', 'un', 'langue', 'indo', '-', 'européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', 'do', 'nt', 'le', 'locuteurs', 'être', 'appeler', 'francophone', '.']
        elif lemmatizer == 'spacy_fra':
            assert lemmas == ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'de', 'langue', 'roman', 'dont', 'le', 'locuteur', 'être', 'appeler', 'francophone', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'glg':
        assert lemmas == ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.']
    elif lang in ['deu_at', 'deu_de', 'deu_ch']:
        if lemmatizer == 'lemmatization_lists_deu':
            assert lemmas == ['Die', 'deutsch', 'Sprache', 'bzw', '.', 'der', 'deutschen', '(', '[', 'dɔɪ̯tʃ];[26', ']', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', ',', 'der', 'weltweit', 'etwa', '90', 'bis', '105', 'Million', 'Mensch', 'als', 'Muttersprache', 'und', 'weit', 'rund', '80', 'Million', 'als', 'Zweit-', 'oder', 'Fremdsprache', 'dienen', '.']
        elif lemmatizer == 'spacy_deu':
            assert lemmas == ['der', 'deutsch', 'Sprache', 'bzw.', 'der', 'deutschen', '(', '[', 'dɔɪ̯tʃ];[26', ']', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', ',', 'der', 'weltweit', 'etwa', '90', 'bis', '105', 'Million', 'Mensch', 'als', 'Muttersprache', 'und', 'weit', 'rund', '80', 'Million', 'als', 'Zweit-', 'oder', 'Fremdsprache', 'dienen', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'grc':
        assert lemmas == ['Ὅτι', 'μέν', 'σύ', ',', 'ὦ', 'ἀνήρ', 'Ἀθηναῖοι', ',', 'πάσχω', 'ὑπό', 'ὁ', 'ἐμός', 'κατηγόρων', ',', 'οὐ', 'οἶδα', '·', 'ἐγώ', 'δέ', 'οὖν', 'καί', 'αὐτός', 'ὑπό', 'αὐτός', 'ὀλίγος', 'ἐμαυτοῦ', 'ἐπελαθόμην', ',', 'οὕτως', 'πιθανῶς', 'λέγω', '.']
    elif lang == 'ell':
        assert lemmas == ['η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός', 'οικογένεια[10', ']', 'και', 'αποτελώ', 'το', 'μοναδικό', 'μέλος', 'το', 'ελληνικός', 'κλάδος', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'ελλάδος', 'και', 'της', 'κύπρος', '.']
    elif lang == 'hun':
        assert lemmas == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.']
    elif lang == 'ind':
        assert lemmas == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'Melayu', 'baku', 'yang', 'dijadikan', 'bagai', 'bahasa', 'resmi', 'Republik', 'Indonesia[1', ']', 'dan', 'bahasa', 'satu', 'bangsa', 'Indonesia.[2', ']']
    elif lang == 'gle':
        if lemmatizer == 'lemmatization_lists_gle':
            assert lemmas == ['Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a', 'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i', '.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'áirithe', '.']
        elif lemmatizer == 'spacy_gle':
            assert lemmas == ['is', 'ceann', 'de', 'na', 'teangacha', 'ceilteacha', 'í', 'an', 'ghaeilge', '(', 'nó', 'gaeilge', 'na', 'héireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'dtrí', 'cinn', 'de', 'theangacha', 'ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'gaelacha', '(', '.i.', 'an', 'ghaeilge', ',', 'gaeilge', 'na', 'halban', 'agus', 'gaeilge', 'mhanann', ')', 'go', 'háirithe', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'ita':
        if lemmatizer == 'lemmatization_lists_ita':
            assert lemmas == ["L'italiano", '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo', 'parlato', 'principalmente', 'in', 'Italia', '.']
        elif lemmatizer == 'spacy_ita':
            assert lemmas == ['il', 'italiano', '(', '[', 'itaˈljaːno][nota', '1', ']', 'ascolta[?·info', ']', ')', 'essere', 'uno', 'lingua', 'romanza', 'parlare', 'principalmente', 'in', 'Italia', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'jpn':
        if lemmatizer == 'spacy_jpn':
            assert lemmas == ['日本', '語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注', '2', ']', '、', '英', ':', 'Japanese', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だ', 'た', '国', '、', 'そして', '日本', '人', '同士', 'の', '間', 'で', '使用', 'する', 'れる', 'て', 'いる', '言語', '。']
        elif lemmatizer == 'sudachipy_jpn':
            assert lemmas == ['日本語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注', '2', ']', '、', '英', ':', 'Japanese', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だ', 'た', '国', '、', 'そして', '日本人', '同士', 'の', '間', 'で', '使用', 'する', 'れる', 'て', 'いる', '言語', '。']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'lit':
        assert lemmas == ['lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs', 'lietuvė', 'tauta', 'kalbėti', ',', 'kuri', 'Lietuvoje', 'irti', 'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena', 'ižti', 'oficialus', 'kalbus', '.']
    elif lang == 'ltz':
        assert lemmas == ["D'", 'Lëtzebuergesch', 'ginn', 'an', 'der', 'däitsch', 'Dialektologie', 'als', 'een', 'westgermanesch', ',', 'mëtteldäitsch', 'Dialekt', 'aklasséieren', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéieren', '.']
    elif lang == 'mkd':
        assert lemmas == ['Македонски', 'јаз', '—', 'јужнословенски', 'јаз', ',', 'дел', 'од', 'група', 'на', 'словенски', 'јазик', 'од', 'јазичен', 'семејство', 'на', 'индоевропски', 'јазик', '.']
    elif lang == 'glv':
        assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.']
    elif lang == 'nob':
        assert lemmas == ['bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'fas':
        if lemmatizer == 'lemmatization_lists_fas':
            assert lemmas == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif lemmatizer == 'spacy_fas':
            assert lemmas == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'و', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'pol':
        assert lemmas == ['język', 'polski', ',', 'polszczyzna', '–', 'język', 'lechicki', 'z', 'grupa', 'zachodniosłowiańskiej', '(', 'do', 'której', 'należeć', 'również', 'czeski', ',', 'kaszubski', ',', 'słowacki', 'i', 'język', 'łużycki', ')', ',', 'stanowiącej', 'część', 'rodzina', 'indoeuropejski', '.']
    elif lang in ['por_br', 'por_pt']:
        if lemmatizer == 'lemmatization_lists_por':
            assert lemmas == ['A', 'língua', 'portuguesar', ',', 'também', 'designar', 'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo', 'ocidental', 'originar', 'o', 'galego', '-', 'português', 'falar', 'o', 'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.']
        elif lemmatizer == 'spacy_por':
            assert lemmas == ['A', 'língua', 'portuguesar', ',', 'também', 'designar', 'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo', 'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o', 'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'ron':
        if lemmatizer == 'lemmatization_lists_ron':
            assert lemmas == ['Limba', 'român', 'fi', 'vrea', 'limbă', 'indo', '-', 'european', ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.']
        elif lemmatizer == 'spacy_ron':
            assert lemmas == ['Limba', 'român', 'fi', 'vrea', 'limbă', 'indo-european', ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'rus':
        if lemmatizer == 'lemmatization_lists_rus':
            assert lemmas == ['Ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файл', 'слушать)[~', '3', ']', '[', '⇨', ']', 'стальной', 'один', 'из', 'восточнославянский', 'языковый', ',', 'национальный', 'язык', 'русский', 'народ', '.']
        elif lemmatizer == 'pymorphy2_morphological_analyzer':
            assert lemmas == ['ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'информация', 'о', 'файл', 'слушать)[~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',', 'национальный', 'язык', 'русский', 'народ', '.']
        elif lemmatizer == 'spacy_rus':
            assert lemmas == ['ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'информация', 'о', 'файл', 'слушать)[~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',', 'национальный', 'язык', 'русский', 'народ', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'gla':
        assert lemmas == ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a', 'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.']
    elif lang == 'srp_cyrl':
        assert lemmas == ['Српски', 'језик', 'припадати', 'словенски', 'група', 'језик', 'породица', 'индоевропских', 'језика.[12', ']']
    elif lang == 'slk':
        assert lemmas == ['Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert lemmas == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.']
    elif lang == 'spa':
        if lemmatizer == 'lemmatization_lists_spa':
            assert lemmas == ['El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', ',', 'perteneciente', 'a', 'lo', 'familia', 'de', 'lengua', 'indoeuropeo', '.']
        elif lemmatizer == 'spacy_spa':
            assert lemmas == ['el', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablado', ',', 'perteneciente', 'a', 'el', 'familia', 'de', 'lengua', 'indoeuropea', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'swe':
        if lemmatizer == 'lemmatization_lists_swe':
            assert lemmas == ['Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.']
        elif lemmatizer == 'spacy_swe':
            assert lemmas == ['svenska', '(', 'svenska', '(', 'info', ')', ')', 'är', 'ett', 'östnordiskt', 'språk', 'som', 'talas', 'av', 'ungefär', 'tio', 'miljoner', 'personer', 'främst', 'i', 'sverige', 'där', 'språket', 'har', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'det', 'ena', 'nationalspråket', 'i', 'finland', 'och', 'som', 'enda', 'officiella', 'språk', 'på', 'åland', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'tgl':
        assert lemmas == ['Ang', 'Wikang', 'Tagalog[2', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜅ᜔', 'ᜆᜄᜎᜓᜄ᜔', ')', ',', 'na', 'kilala', 'rin', 'sa', 'payak', 'na', 'pangalang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pangunahing', 'wika', 'ng', 'Pilipinas', 'at', 'sinasabing', 'ito', 'ang', 'de', 'facto', '(', '"', 'sa', 'katunayan', '"', ')', 'ngunit', 'hindî', 'de', 'jure', '(', '"', 'sa', 'batas', '"', ')', 'na', 'batayan', 'na', 'siyang', 'pambansang', 'Wikang', 'Filipino', '(', 'mula', '1961', 'hanggang', '1987', ':', 'Pilipino).[2', ']']
    elif lang == 'bod':
        assert lemmas == ['བོད་', 'གི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ་', 'གི་', 'ཉེ་འཁོར་', 'གི་', 'ས་ཁུལ་', 'སྟེ་', '།']
    elif lang == 'tur':
        assert lemmas == ['Türkçe', 'ya', 'da', 'Türk', 'dil', ',', 'batı', 'Balkanlar’dan', 'başla', 'doğu', 'Hazar', 'Denizi', 'saha', 'kadar', 'konuş', 'Türkî', 'dil', 'dil', 'aile', 'ait', 'son', 'ekle', 'bir', 'dil.[12', ']']
    elif lang == 'ukr':
        if lemmatizer == 'lemmatization_lists_ukr':
            assert lemmas == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
        elif lemmatizer == 'pymorphy2_morphological_analyzer':
            assert lemmas == ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
        else:
            raise Exception(f'Error: Tests for lemmatizer "{lemmatizer}" are skipped!')
    elif lang == 'urd':
        assert lemmas == ['اُردُو', 'لشکری', 'زبان[8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کم', 'معیاری', 'زبان', 'میں', 'سے', 'ایک', 'ہونا', '۔']
    elif lang == 'cym':
        assert lemmas == ['Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith', 'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',', 'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod', 'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y', 'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(', 'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.']
    else:
        raise Exception(f'Error: Tests for language "{lang}" are skipped!')
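
A hedged sketch of how such a test function is typically driven, assuming pytest-style parametrization over the supported language/lemmatizer pairs (the pair list here is an illustrative subset; the actual test collection in Wordless may enumerate the pairs differently):

import pytest

# Illustrative subset of (lang, lemmatizer) pairs exercised above
@pytest.mark.parametrize('lang, lemmatizer', [
    ('eng_us', 'nltk_wordnet'),
    ('rus', 'pymorphy2_morphological_analyzer'),
    ('jpn', 'sudachipy_jpn'),
    ('bod', 'botok_bod')
])
def test_lemmatize_parametrized(lang, lemmatizer):
    test_lemmatize(lang, lemmatizer)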