Example No. 1
    def run(self):
        preview_results = []

        preview_lang = self.main.settings_custom['lemmatization']['preview_lang']
        preview_samples = self.main.settings_custom['lemmatization']['preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main, line, lang=preview_lang)
                tokens = wl_misc.flatten_list(tokens)

                lemmas = wl_lemmatization.wl_lemmatize(
                    self.main,
                    tokens,
                    lang=preview_lang,
                    lemmatizer=self.lemmatizer)

                text = wl_word_detokenization.wl_word_detokenize(
                    self.main, lemmas, lang=preview_lang)

                preview_results.append(text)
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
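The round trip at the heart of this loop (tokenize, lemmatize, detokenize) can also be exercised directly. A minimal sketch, assuming a configured Wordless `main` object; the sample sentence and the expected output are illustrative:

tokens = wl_word_tokenization.wl_word_tokenize(main, 'The cats were running.', lang='eng')
tokens = wl_misc.flatten_list(tokens)

lemmas = wl_lemmatization.wl_lemmatize(main, tokens, lang='eng', lemmatizer='default')

print(wl_word_detokenization.wl_word_detokenize(main, lemmas, lang='eng'))
# Expected output, roughly: 'The cat be run.'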
Example No. 2
    def run(self):
        preview_results = []

        preview_lang = self.main.settings_custom['word_detokenization']['preview_lang']
        preview_samples = self.main.settings_custom['word_detokenization']['preview_samples']

        for line in preview_samples.splitlines():
            line = line.strip()

            if line:
                text = wl_word_detokenization.wl_word_detokenize(
                    self.main,
                    tokens=line.split(),
                    lang=preview_lang,
                    word_detokenizer=self.word_detokenizer
                )

                preview_results.append(text)
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
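The worker above just whitespace-splits each sample line and hands the tokens to wl_word_detokenize. A minimal sketch of that underlying call, assuming a configured Wordless `main` object; the token list and expected output are illustrative:

text = wl_word_detokenization.wl_word_detokenize(
    main,
    tokens=['Hello', ',', 'world', '!'],
    lang='eng',
    word_detokenizer='NLTK - Penn Treebank Detokenizer'
)
print(text)  # Expected: 'Hello, world!'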
Example No. 3
def test_word_detokenize(lang, word_detokenizer, show_results=False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)
    text = wl_word_detokenization.wl_word_detokenize(
        main, tokens=tokens, lang=lang, word_detokenizer=word_detokenizer)

    if show_results:
        print(f'{lang_text} / {word_detokenizer}:')
        print(text)

    if lang == 'cat':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11 ]"
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11]"
    elif lang == 'zho_cn':
        assert text == '汉语,又称汉文、中文、中国话、中国语、华语、华文、唐话[2],或被视为一个语族,或被视为隶属于汉藏语系汉语族之一种语言。'
    elif lang == 'zho_tw':
        assert text == '漢語,又稱漢文、中文、中國話、中國語、華語、華文、唐話[2],或被視為一個語族,或被視為隸屬於漢藏語系漢語族之一種語言。'
    elif lang == 'ces':
        assert text == 'Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.'
    elif lang == 'nld':
        assert text == 'Het Nederlands is een West-Germaanse taal en de moedertaal van de meeste inwoners van Nederland, België en Suriname.'
    elif lang == 'eng':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5]'
    elif lang == 'fin':
        assert text == 'Suomen kieli (suomi) on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli.'
    elif lang == 'fra':
        assert text == 'Le français est une langue indo-européenne de la famille des langues romanes.'
    elif lang == 'deu':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([ dɔʏ̯t͡ʃ]; abgekürzt dt . oder dtsch .) ist eine westgermanische Sprache.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache.'
    elif lang == 'ell':
        assert text == 'Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και συγκεκριμένα στον ελληνικό κλάδο, μαζί με την τσακωνική, ενώ είναι η επίσημη γλώσσα της Ελλάδος και της Κύπρου.'
    elif lang == 'hun':
        assert text == 'A magyar nyelv az uráli nyelvcsalád tagja, a finnugor nyelvek közé tartozó ugor nyelvek egyike.'
    elif lang == 'isl':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4]'
    elif lang == 'gle':
        assert text == 'Is ceann de na teangacha Ceilteacha í an Ghaeilge (nó Gaeilge na hÉireann mar a thugtar uirthi corruair), agus ceann den dtrí cinn de theangacha Ceilteacha ar a dtugtar na teangacha Gaelacha (.i. an Ghaeilge, Gaeilge na hAlban agus Gaeilge Mhanann) go háirithe.'
    elif lang == 'ita':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "L' italiano ([ itaˈljaːno][Nota 1] ascolta[?·info] ) è una lingua romanza parlata principalmente in Italia."
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "L'italiano ([itaˈljaːno][Nota 1] ascolta[?·info]) è una lingua romanza parlata principalmente in Italia."
    elif lang == 'jpn':
        assert text == '日本語(にほんご、にっぽんご[注1])は、主に日本国内や日本人同士の間で使用されている言語である。'
    elif lang == 'lav':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda . [3 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda. [3]'
    elif lang == 'lit':
        assert text == 'Lietuvių kalba – iš baltų prokalbės kilusi lietuvių tautos kalba, kuri Lietuvoje yra valstybinė, o Europos Sąjungoje – viena iš oficialiųjų kalbų.'
    elif lang == 'pol':
        assert text == 'Język polski, polszczyzna, skrót: pol. – język naturalny należący do grupy języków zachodniosłowiańskich (do której należą również czeski, słowacki, kaszubski, dolnołużycki, górnołużycki i wymarły połabski), stanowiącej część rodziny języków indoeuropejskich.'
    elif lang == 'por':
        assert text == 'A língua portuguesa, também designada português, é uma língua românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.'
    elif lang == 'ron':
        assert text == 'Limba română este o limbă indo - europeană, din grupul italic și din subgrupul oriental al limbilor romanice.'
    elif lang == 'rus':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Ру́сский язы́к ([ ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
    elif lang == 'slk':
        assert text == 'Slovenčina patrí do skupiny západoslovanských jazykov (spolu s češtinou, poľštinou, hornou a dolnou lužickou srbčinou a kašubčinou).'
    elif lang == 'slv':
        assert text == 'Slovenščina [slovénščina] / [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.'
    elif lang == 'spa':
        assert text == 'El español o castellano es una lengua romance procedente del latín hablado.'
    elif lang == 'swe':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Svenska (svenska (info) ) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Svenska (svenska (info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
    elif lang == 'tam':
        assert text == 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும்.'
    elif lang == 'tha':
        assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
    elif lang == 'bod':
        assert text == 'བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་དེའི་ཉེ་འཁོར་གྱི་ས་ཁུལ་ཏེ།'
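A function like this is typically driven over every supported language/detokenizer pair. A sketch using pytest parametrization; the pairs below are only a sample of the combinations asserted above:

import pytest

@pytest.mark.parametrize('lang, word_detokenizer', [
    ('eng', 'NLTK - Penn Treebank Detokenizer'),
    ('eng', 'Sacremoses - Moses Detokenizer'),
    ('deu', 'NLTK - Penn Treebank Detokenizer'),
    ('deu', 'Sacremoses - Moses Detokenizer'),
])
def test_word_detokenize_parametrized(lang, word_detokenizer):
    test_word_detokenize(lang, word_detokenizer)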
Example No. 4
def wl_process_tokens_concordancer(text, token_settings):
    main = text.main
    tokens = text.tokens_flat.copy()

    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuation marks
    if not settings['puncs']:
        tokens = [
            token for token in tokens
            if not wl_checking_token.is_token_punc(token)
        ]

        text.offsets_paras = []
        text.offsets_sentences = []
        text.offsets_clauses = []
        text.tokens_flat = []

        for para in text.tokens_multilevel:
            text.offsets_paras.append(len(text.tokens_flat))

            for sentence in para:
                text.offsets_sentences.append(len(text.tokens_flat))

                for clause in sentence:
                    text.offsets_clauses.append(len(text.tokens_flat))

                    for token in clause:
                        # Merge each punctuation mark into the preceding token
                        if text.tokens_flat and wl_checking_token.is_token_punc(token):
                            text.tokens_flat[-1] = wl_word_detokenization.wl_word_detokenize(
                                main,
                                [text.tokens_flat[-1], token],
                                lang=text.lang
                            )
                        else:
                            text.tokens_flat.append(token)

        # Remove duplicate offsets
        text.offsets_paras = sorted(set(text.offsets_paras))
        text.offsets_sentences = sorted(set(text.offsets_sentences))
        text.offsets_clauses = sorted(set(text.offsets_clauses))

        # Check if the first token is a punctuation mark
        if wl_checking_token.is_token_punc(text.tokens_flat[0]):
            tokens.insert(0, [])

    # Ignore tags
    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            tokens = [(token, []) for token in tokens]
            text.tokens_flat = [(token, []) for token in text.tokens_flat]
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            tokens = [(token, tags)
                      for token, tags in zip(tokens, text.tags_non_pos)]
            text.tokens_flat = [
                (token, tags)
                for token, tags in zip(text.tokens_flat, text.tags_non_pos)
            ]
        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            tokens = [(token, tags)
                      for token, tags in zip(tokens, text.tags_pos)]
            text.tokens_flat = [
                (token, tags)
                for token, tags in zip(text.tokens_flat, text.tags_pos)
            ]
    else:
        tokens = [(token, tags) for token, tags in zip(tokens, text.tags_all)]
        text.tokens_flat = [
            (token, tags)
            for token, tags in zip(text.tokens_flat, text.tags_all)
        ]

    # Use tags only
    if settings['use_tags']:
        tokens = [''.join(tags) for _, tags in tokens]
        text.tokens_flat = [''.join(tags) for _, tags in text.tokens_flat]
    else:
        tokens = [f"{token}{''.join(tags)}" for token, tags in tokens]
        text.tokens_flat = [
            f"{token}{''.join(tags)}" for token, tags in text.tokens_flat
        ]

    return tokens
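The inner loop above folds each punctuation mark into the token that precedes it, so offsets into tokens_flat keep pointing at word-like units. A self-contained sketch of that merging rule, with plain string concatenation standing in for wl_word_detokenize:

def merge_puncs(tokens, is_punc):
    # Merge each punctuation mark into the preceding token
    merged = []

    for token in tokens:
        if merged and is_punc(token):
            merged[-1] += token
        else:
            merged.append(token)

    return merged

print(merge_puncs(['Hello', ',', 'world', '!'], lambda token: token in ',.!?'))
# ['Hello,', 'world!']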
Example No. 5
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    wl_text_utils.check_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # NLTK
    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    # spaCy
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [sentence.text for sentence in doc.sents]
    # syntok
    elif sentence_tokenizer == main.tr('syntok - Sentence Segmenter'):
        for para in syntok.segmenter.analyze(text):
            for sentence in para:
                sentences.append(''.join(
                    [token.spacing + token.value for token in sentence]))
    # Chinese & Japanese
    elif sentence_tokenizer in [
            main.tr('Wordless - Chinese Sentence Tokenizer'),
            main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0

            for i, char in enumerate(line):
                if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                    for j, char in enumerate(line):
                        if j > i and char not in [
                                '。', '!', '?', '!', '?', '’', '”', ')', ')'
                        ]:
                            sentences.append(line[sentence_start:j])

                            sentence_start = j

                            break

            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Icelandic
    elif sentence_tokenizer == main.tr(
            'Tokenizer - Icelandic Sentence Tokenizer'):
        for sentence in tokenizer.split_into_sentences(text):
            sentences.append(
                wl_word_detokenization.wl_word_detokenize(
                    main, tokens=sentence.split(), lang='isl'))
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - CRFCut'):
        sentences = pythainlp.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('botok - Tibetan Sentence Tokenizer'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')
        tokens = main.botok_word_tokenizer.tokenize(text)

        for sentence_tokens in botok.sentence_tokenizer(tokens):
            sentences.append(''.join([
                sentence_token.text for sentence_token in sentence_tokens[1]
            ]))
    # Vietnamese
    elif sentence_tokenizer == main.tr(
            'Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]

    sentences = wl_text_utils.record_boundary_sentences(sentences, text)

    return sentences
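The Chinese/Japanese branch above splits after a sentence-final mark but keeps trailing quotation marks and closing brackets attached to the sentence they end. A simplified, self-contained sketch of that rule (unlike the original, it never appends an empty trailing sentence):

def split_cjk_sentences(line):
    terminals = '。!?!?'
    # Characters allowed to trail a terminal without starting a new sentence
    closers = '。!?!?’”))'
    sentences = []
    sentence_start = 0

    for i, char in enumerate(line):
        if i >= sentence_start and char in terminals:
            end = i + 1

            while end < len(line) and line[end] in closers:
                end += 1

            sentences.append(line[sentence_start:end])
            sentence_start = end

    if sentence_start < len(line):
        sentences.append(line[sentence_start:])

    return sentences

print(split_cjk_sentences('你好!今天天气不错。'))
# ['你好!', '今天天气不错。']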