def test_corpus_from_files2():
    """Load two Werther text files and check the resulting documents.

    Verifies that both files are registered in ``docs`` and ``doc_paths``,
    that every document label ends with ``goethe_werther`` followed by
    exactly one more character, and that no document is empty.
    """
    werther_files = [
        'examples/data/gutenberg/werther/goethe_werther1.txt',
        'examples/data/gutenberg/werther/goethe_werther2.txt',
    ]
    corpus = Corpus.from_files(werther_files)

    n_docs = len(corpus.docs)
    n_paths = len(corpus.doc_paths)
    assert n_docs == n_paths == 2

    for label, text in corpus.docs.items():
        # Drop the last character of the label before matching the common stem.
        label_stem = label[:-1]
        assert label_stem.endswith('goethe_werther')
        assert len(text) > 0
def test_corpus_from_files():
    """Check that ``Corpus.from_files`` and ``Corpus().add_files`` agree.

    Both construction routes are fed the same single file; each must end up
    with exactly one document and one document path, under identical keys.
    The single document must be non-empty, its label must end with
    ``kafka_verwandlung`` and its recorded path must equal the input path.
    """
    doc_path = 'examples/data/gutenberg/kafka_verwandlung.txt'

    via_classmethod = Corpus.from_files([doc_path])
    via_add_files = Corpus().add_files([doc_path])

    # Each corpus must hold exactly one document and one matching path entry.
    for corpus in (via_classmethod, via_add_files):
        assert len(corpus.docs) == len(corpus.doc_paths) == 1

    # All four key views must be equal.
    assert (
        via_classmethod.docs.keys()
        == via_add_files.docs.keys()
        == via_classmethod.doc_paths.keys()
        == via_add_files.doc_paths.keys()
    )

    label = next(iter(via_classmethod.docs.keys()))
    assert label.endswith('kafka_verwandlung')

    text = via_classmethod.docs[label]
    assert len(text) > 0
    assert via_classmethod.doc_paths[label] == doc_path
def test_corpus_from_files_not_existent():
    """Expect ``IOError`` when the file list includes the path ``'not_existent'``."""
    paths = [
        'examples/data/gutenberg/werther/goethe_werther1.txt',
        'not_existent',
    ]
    with pytest.raises(IOError):
        Corpus.from_files(paths)
# NOTE(review): this file contains a second function with this exact name
# that expects FileNotFoundError for the same call. Python binds the name to
# whichever definition is executed last, so only one of the two can be
# collected by pytest. Confirm which exception `Corpus.from_files` is meant
# to raise for a plain string and rename or remove the other definition.
def test_corpus_from_files_nonlist_arg():
    """Expect ``ValueError`` when ``from_files`` is given a plain string."""
    not_a_list = 'wrong'
    with pytest.raises(ValueError):
        Corpus.from_files(not_a_list)
def test_empty_corpora():
    """Check that three ways of building an empty corpus all give empty ``docs``.

    Covers the bare constructor, ``from_files`` with an empty list, and
    ``from_files`` with an empty list followed by ``add_files`` with an
    empty list.
    """
    from_constructor = Corpus()
    from_empty_list = Corpus.from_files([])
    from_empty_list_then_add = Corpus.from_files([]).add_files([])

    assert (
        from_constructor.docs
        == from_empty_list.docs
        == from_empty_list_then_add.docs
        == {}
    )
# NOTE(review): this file contains another function with this exact name
# that expects ValueError for the same call. Python binds the name to
# whichever definition is executed last, so only one of the two can be
# collected by pytest. Confirm which exception `Corpus.from_files` is meant
# to raise for a plain string and rename or remove the other definition.
def test_corpus_from_files_nonlist_arg():
    """Expect ``FileNotFoundError`` when ``from_files`` is given a plain string."""
    not_a_list = 'wrong'
    with pytest.raises(FileNotFoundError):
        Corpus.from_files(not_a_list)