def test_corpus_from_folder_not_existent():
    with pytest.raises(IOError):
        Corpus.from_folder('not_existent')
def test_corpus_filter_by_min_length():
    c = Corpus.from_folder('tests/data/gutenberg', force_unix_linebreaks=False)
    # filtering works in place, so each call further shrinks the document set
    assert len(c.filter_by_min_length(1).docs) == 3
    assert len(c.filter_by_min_length(142694).docs) == 1
    assert len(c.filter_by_min_length(142695).docs) == 0
    # all documents were removed above, hence even the weakest filter yields nothing
    assert len(c.filter_by_min_length(1).docs) == 0
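# A minimal sketch (not part of the original test suite) of keeping the
# unfiltered documents around despite the in-place filtering: snapshot the
# docs dict and rebuild a Corpus from it. Constructing a Corpus from a
# label -> text dict is used elsewhere in these tests; the helper name is
# hypothetical.
def example_filter_nondestructive():
    c = Corpus.from_folder('tests/data/gutenberg')
    snapshot = dict(c.docs)          # plain copy of label -> text
    c.filter_by_min_length(142695)   # removes every document in place
    assert len(c.docs) == 0
    restored = Corpus(snapshot)      # rebuild the unfiltered corpus
    assert len(restored.docs) == 3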
def test_corpus_from_folder():
    c = Corpus.from_folder('examples/data/gutenberg')
    assert len(c.docs) == 3
def test_corpus_get_doc_labels():
    c = Corpus.from_folder('tests/data/gutenberg')
    assert set(c.docs.keys()) == set(c.get_doc_labels())
# -*- coding: utf-8 -*-
"""
An example for constructing a corpus of texts from files and passing them to
the preprocessing step.
"""

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

if __name__ == '__main__':   # this is necessary for multiprocessing on Windows!
    corpus = Corpus.from_folder('data/gutenberg')

    print("all loaded documents:")
    print(corpus.docs.keys())
    print("-----")

    corpus.split_by_paragraphs()

    print("documents split into paragraphs")
    print(corpus.docs.keys())
    print("-----")

    print("first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
        print(u"par%d (document label '%s'):" % (par_num, doclabel))
        print(corpus.docs[doclabel])
    print("-----")

    preproc = TMPreproc(corpus.docs, language=u'german')
    preproc.tokenize().tokens_to_lowercase()

    print("tokenized first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
        print(u"par%d (document label '%s'):" % (par_num, doclabel))
        print(preproc.tokens[doclabel])
""" Script that generates "eval_table/eval_table.csv" from text samples in folder "eval_texts". This table is later used to manually add correct lemmata. Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung January 2019 """ import pandas as pd from tmtoolkit.corpus import Corpus from tmtoolkit.preprocess import TMPreproc corpus = Corpus.from_folder('eval_texts') preproc = TMPreproc(corpus.docs, language='german') postagged = preproc.tokenize().pos_tag() postagged = postagged.filter_for_pos({'N', 'V', 'ADJ', 'ADV'}) tok_pos_df = pd.DataFrame() for doc_id, tok_pos in postagged.tokens_with_pos_tags.items(): tok, pos = zip(*tok_pos) tok_pos_df = tok_pos_df.append(pd.DataFrame({ 'doc_id': doc_id, 'token': tok, 'pos': pos }), ignore_index=True) tok_pos_df.drop_duplicates(['token', 'pos'], inplace=True) tok_pos_df.to_csv('eval_table/eval_table.csv')
all_docs_en = {
    f_id: nltk.corpus.gutenberg.raw(f_id)
    for f_id in nltk.corpus.gutenberg.fileids()
}

smaller_docs_en = [(dl, txt[:min(nchar, MAX_DOC_LEN)])
                   for dl, txt, nchar in map(lambda x: (x[0], x[1], len(x[1])),
                                             all_docs_en.items())]

corpus_en = Corpus(dict(sample([(dl, txt) for dl, txt in smaller_docs_en
                                if dl != u'melville-moby_dick.txt'],
                               N_DOCS_EN - 2)))
corpus_en.docs['empty_doc'] = ''   # additionally test an empty document
# make sure we always have Moby Dick in the corpus
corpus_en.docs[u'melville-moby_dick.txt'] = dict(smaller_docs_en)[u'melville-moby_dick.txt']
# corpus_en = Corpus(dict(smaller_docs_en))
corpus_de = Corpus.from_folder('examples/data/gutenberg', read_size=MAX_DOC_LEN)


@pytest.fixture
def tmpreproc_en():
    return TMPreproc(corpus_en.docs, language='english')


@pytest.fixture
def tmpreproc_de():
    return TMPreproc(corpus_de.docs, language='german')


def test_fixtures(tmpreproc_en, tmpreproc_de):
    assert len(tmpreproc_en.docs) == N_DOCS_EN
    assert len(tmpreproc_de.docs) == N_DOCS_DE
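# Note (an addition, not in the original test module): `sample` draws the
# English documents at random, so corpus_en differs between runs except for
# 'empty_doc' and Moby Dick. Assuming `sample` is random.sample, seeding the
# RNG before building corpus_en would make the fixtures reproducible:
#
# import random
# random.seed(123)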
# %%
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
import pandas as pd
from src.d00_utils.del_chars import del_chars
import re
from pprint import pprint
import string
import pickle
from tqdm import tqdm

# %%
PATH = 'data/corpus'

corpus = Corpus.from_folder(PATH + '/plenar', encoding='utf8')
# corpus = Corpus.from_folder(PATH + '/plenar')
# corpus.add_folder(PATH + '/presse')
# corpus.add_folder(PATH + '/twitter')

doc_labels = corpus.get_doc_labels(sort=True)

# %%
# map mojibake (UTF-8 text that was misread as Latin-1/cp1252) back to the
# intended characters; keys are the broken sequences as they appear in the data
table_umlauts = {
    "ÃŸ": "ß",
    "ãÿ": "ß",
    "ã¤": "ä",
    "ã¼": "ü",
    "ã¶": "ö",
    "Ã„": "Ä",
    "Ãœ": "Ü",
    "Ã–": "Ö",
    "â‚¬": "€"
}
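# %%
# A minimal usage sketch (an addition, not part of the original cell): apply
# the repair table to every document with plain str.replace. The helper name
# `fix_mojibake` is hypothetical.
def fix_mojibake(text, table=table_umlauts):
    for broken, fixed in table.items():
        text = text.replace(broken, fixed)
    return text

# e.g. repair all documents in place:
# for dl in doc_labels:
#     corpus.docs[dl] = fix_mojibake(corpus.docs[dl])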