Example #1
import pytest
from tmtoolkit.corpus import Corpus

def test_corpus_from_folder_not_existent():
    with pytest.raises(IOError):
        Corpus.from_folder('not_existent')
Example #2
def test_corpus_filter_by_min_length():
    c = Corpus.from_folder('tests/data/gutenberg', force_unix_linebreaks=False)
    # filter_by_min_length() filters the corpus in place and returns it,
    # so each call below operates on the already reduced document set
    assert len(c.filter_by_min_length(1).docs) == 3
    assert len(c.filter_by_min_length(142694).docs) == 1
    assert len(c.filter_by_min_length(142695).docs) == 0
    assert len(c.filter_by_min_length(1).docs) == 0  # corpus is empty by now
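The chained assertions only hold because the filter mutates the corpus. A minimal sketch of a non-destructive variant, assuming (as Example #7 below shows) that Corpus() accepts a dict mapping document labels to texts; this helper is hypothetical, not part of tmtoolkit:

def filter_by_min_length_copy(c, n):
    # build a new Corpus from the documents with at least n characters,
    # leaving the original corpus c untouched
    return Corpus({dl: txt for dl, txt in c.docs.items() if len(txt) >= n})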
Example #3
def test_corpus_from_folder():
    c = Corpus.from_folder('examples/data/gutenberg')
    assert len(c.docs) == 3
Example #4
def test_corpus_get_doc_labels():
    c = Corpus.from_folder('tests/data/gutenberg')
    assert set(c.docs.keys()) == set(c.get_doc_labels())
Example #5
# -*- coding: utf-8 -*-
"""
An example for constructing a corpus of texts from files and passing them to the preprocessing step.
"""
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

if __name__ == '__main__':  # this is necessary for multiprocessing on Windows!
    corpus = Corpus.from_folder('data/gutenberg')
    print("all loaded documents:")
    print(corpus.docs.keys())
    print("-----")

    corpus.split_by_paragraphs()
    print("documents split into paragraphs")
    print(corpus.docs.keys())
    print("-----")

    print("first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
        print(u"par%d (document label '%s'):" % (par_num, doclabel))
        print(corpus.docs[doclabel])
    print("-----")

    preproc = TMPreproc(corpus.docs, language=u'german')
    preproc.tokenize().tokens_to_lowercase()

    print("tokenized first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
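The excerpt breaks off inside this loop. A plausible continuation, assuming TMPreproc exposes the tokenized documents as a label-to-tokens mapping called tokens (an assumption here, by analogy with tokens_with_pos_tags in Example #6):

        print(u"par%d tokens (document label '%s'):" % (par_num, doclabel))
        print(preproc.tokens[doclabel])  # assumed: .tokens maps label -> token list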
Example #6
"""
Script that generates "eval_table/eval_table.csv" from text samples in folder "eval_texts". This table is later
used to manually add correct lemmata.

Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung
January 2019
"""

import pandas as pd
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

corpus = Corpus.from_folder('eval_texts')

preproc = TMPreproc(corpus.docs, language='german')

postagged = preproc.tokenize().pos_tag()
postagged = postagged.filter_for_pos({'N', 'V', 'ADJ', 'ADV'})

# build one data frame per document and concatenate once at the end
# (appending to a DataFrame in a loop is slow, and DataFrame.append
# was removed in pandas 2.0)
doc_dfs = []
for doc_id, tok_pos in postagged.tokens_with_pos_tags.items():
    tok, pos = zip(*tok_pos)
    doc_dfs.append(pd.DataFrame({
        'doc_id': doc_id,
        'token': tok,
        'pos': pos
    }))
tok_pos_df = pd.concat(doc_dfs, ignore_index=True)

tok_pos_df.drop_duplicates(['token', 'pos'], inplace=True)

tok_pos_df.to_csv('eval_table/eval_table.csv')
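The docstring says the table is later filled with correct lemmata by hand. A minimal sketch of preparing the file for that manual step; the empty 'lemma' column is a hypothetical addition, not from the original script:

eval_table = pd.read_csv('eval_table/eval_table.csv', index_col=0)
eval_table['lemma'] = ''   # to be filled in manually per token/POS pair
eval_table.to_csv('eval_table/eval_table.csv')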
Example #7
# excerpt: imports and the constants MAX_DOC_LEN, N_DOCS_EN and N_DOCS_DE
# are defined earlier in the test module
all_docs_en = {
    f_id: nltk.corpus.gutenberg.raw(f_id)
    for f_id in nltk.corpus.gutenberg.fileids()
}
smaller_docs_en = [(dl, txt[:min(nchar, MAX_DOC_LEN)])
                   for dl, txt, nchar in map(lambda x: (x[0], x[1], len(x[1])),
                                             all_docs_en.items())]

corpus_en = Corpus(
    dict(
        sample([(dl, txt) for dl, txt in smaller_docs_en
                if dl != u'melville-moby_dick.txt'], N_DOCS_EN - 2)))
corpus_en.docs['empty_doc'] = ''  # additionally test empty document
corpus_en.docs[u'melville-moby_dick.txt'] = dict(smaller_docs_en)[
    u'melville-moby_dick.txt']  # make sure we always have moby dick
#corpus_en = Corpus(dict(smaller_docs_en))
corpus_de = Corpus.from_folder('examples/data/gutenberg',
                               read_size=MAX_DOC_LEN)


@pytest.fixture
def tmpreproc_en():
    return TMPreproc(corpus_en.docs, language='english')


@pytest.fixture
def tmpreproc_de():
    return TMPreproc(corpus_de.docs, language='german')


def test_fixtures(tmpreproc_en, tmpreproc_de):
    assert len(tmpreproc_en.docs) == N_DOCS_EN
    assert len(tmpreproc_de.docs) == N_DOCS_DE
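Each fixture builds a fresh TMPreproc, so in-place operations in one test cannot leak into the next. A sketch of a test on top of these fixtures; the tokens attribute is an assumption, by analogy with tokens_with_pos_tags in Example #6:

def test_tokenize_labels(tmpreproc_en):
    # assumed API: tokenize() returns self (chaining as in Example #5)
    # and .tokens maps each document label to its token list
    tokens = tmpreproc_en.tokenize().tokens
    assert set(tokens.keys()) == set(tmpreproc_en.docs.keys())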
Example #8
# %%
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
import pandas as pd
from src.d00_utils.del_chars import del_chars
import re
from pprint import pprint
import string
import pickle
from tqdm import tqdm

# %%
PATH = 'data/corpus'
corpus = Corpus.from_folder(PATH + '/plenar', encoding='utf8')
# corpus = Corpus.from_folder(PATH + '/plenar')
# corpus.add_folder(PATH + '/presse')
# corpus.add_folder(PATH + '/twitter')

doc_labels = corpus.get_doc_labels(sort=True)

# %%
# maps cp1252-mojibake sequences (UTF-8 bytes mis-decoded as Windows-1252)
# to the intended characters; some keys below were themselves mangled in
# this excerpt and are restored from the surviving entries' pattern
table_umlauts = {
    "ÃŸ": "ß",
    "ãÿ": "ß",
    "ã¤": "ä",
    "ã¼": "ü",
    "ã¶": "ö",
    'Ã„': 'Ä',
    "Ãœ": "Ü",
    "Ã–": "Ö",
    'â‚¬': '€'
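The replacement table is cut off in this excerpt. A minimal sketch of how such a mojibake table is typically applied to the loaded documents; the helper is hypothetical, not part of the original script:

def fix_mojibake(text, table):
    # replace each mis-decoded character sequence with the intended one
    for bad, good in table.items():
        text = text.replace(bad, good)
    return text

for dl in corpus.docs:   # item assignment on .docs, as in Example #7
    corpus.docs[dl] = fix_mojibake(corpus.docs[dl], table_umlauts)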