def __test__(ut):
    """Run the concordance tests against a small, freshly indexed corpus.

    ``_test_window`` is run first. Four short documents are then stored and
    indexed (``max_gram_length=5``), after which ``Concordance`` is checked
    for its length, its iteration order, the grams surrounding each
    occurrence, the raw text of each occurrence, the documents it spans, and
    finally a series of per-word and per-gram occurrence counts.

    ``ut`` is the test helper; only its ``assert_equal`` method is used here.
    """
    from nlplib.core.process.index import Indexed
    from nlplib.core.model import Database, Document, Word, Gram

    _test_window(ut)

    corpus = [
        'Python is a widely used general-purpose, high-level programming language.',
        ('Its design philosophy emphasizes code readability, and its syntax allows programmers to '
         'express concepts in fewer lines of code than would be possible in languages such as C.'),
        ('The language provides constructs intended to enable clear programs on both a small and '
         'large scale.'),
        ('Stackless Python is a significant fork of CPython that implements microthreads; it does not '
         'use the C memory stack, thus allowing massively concurrent programs. PyPy also has a '
         'stackless version'),
    ]

    database = Database()

    # First session: store every document, then index whatever was stored.
    with database as session:
        for text in corpus:
            session.add(Document(text))

        indexer = Indexed(session)
        for stored in session.access.all_documents():
            indexer.add(stored, max_gram_length=5)

    # Second session: the actual checks.
    with database as session:
        is_a_gram = session.access.gram('is a')
        concordance = Concordance(is_a_gram)

        ut.assert_equal(len(concordance), 2)
        # Building a Concordance directly must agree with the gram's own accessor.
        ut.assert_equal(list(concordance), list(is_a_gram.concordance()))

        # One token of context before the gram and two after it.
        contexts = concordance.grams(before=1, after=2)
        expected_contexts = ['Python is a widely used', 'Python is a significant fork']
        for context, expected in zip(contexts, expected_contexts):
            ut.assert_equal(context, Gram(expected))

        # Each raw() entry is compared through its item at index 2.
        # NOTE(review): the original comment says the raw strings differ in
        # white space; the literals here use single spaces - confirm upstream.
        for entry, expected_raw in zip(concordance.raw(), ['is a', 'is a']):
            ut.assert_equal(entry[2], expected_raw)

        ut.assert_equal(len(concordance.documents().keys()), 2)

        # Stored but never passed to the indexer; the counts below expect zero
        # occurrences for both of them.
        session.add(Word('foobar'))
        session.add(Gram('foo bar'))

        def check_occurrences(lookup, text, expected):
            # Glorified "word" counting: the concordance length is the number
            # of occurrences of whatever <lookup> returns for <text>.
            ut.assert_equal(len(Concordance(lookup(text))), expected)

        word_cases = [('a', 4), ('of', 2), ('to', 2), ('and', 2), ('significant', 1), ('foobar', 0)]
        for text, expected in word_cases:
            check_occurrences(session.access.word, text, expected)

        gram_cases = [('is a', 2), ('fork of', 1), ('foo bar', 0)]
        for text, expected in gram_cases:
            check_occurrences(session.access.gram, text, expected)
def make_db(db, amount=100):
    """Store and index gathered documents in ``db``, one chunk per session.

    Documents come from ``gather_documents(amount)`` and are numbered starting
    at 1. The numbered stream is split by ``chunked(..., 10, trail=True)``
    (presumably groups of 10 plus a shorter trailing group - confirm against
    ``chunked``), and every chunk is handled inside its own ``with db`` block.
    Documents whose ``len()`` is zero are neither stored nor indexed.

    Returns the number of the last document seen, or 0 when nothing was
    gathered. Skipped documents still advance that number.
    """
    last_number = 0
    numbered = enumerate(gather_documents(amount), last_number + 1)

    for batch in chunked(numbered, 10, trail=True):
        with db as session:
            indexer = Indexed(session)
            for last_number, doc in batch:
                if not len(doc):
                    continue  # Nothing to store or index for an empty document.

                session.add(doc)
                indexer.add(doc)
                print(last_number, ":", repr(doc))

    return last_number
def _test_document(ut):
    """Test adding and removing documents and their associated objects.

    Three documents are stored; the first two are indexed and the third is
    not. The test then checks word membership per document and watches the
    number of seqs and indexes in the database as the documents are removed
    one at a time, longest first.

    ``ut`` is the test helper providing ``assert_equal`` and ``assert_true``.
    """
    database = Database()

    with database as session:
        indexer = Indexed(session)
        indexer.add(session.add(Document('a b b a c d')), max_gram_length=1)
        indexer.add(session.add(Document('a c d e')), max_gram_length=2)
        # This document isn't indexed, so it has no associated objects.
        session.add(Document('a c'))

    def check_counts(session, seq, index):
        # Compare how many seqs and indexes the session can currently see.
        ut.assert_equal(len(list(session.access.all_seqs())), seq)
        ut.assert_equal(len(list(session.access.all_indexes())), index)

    def longest_document(session):
        # The stored document with the longest string form.
        def text_length(document):
            return len(str(document))

        return max(session.access.all_documents(), key=text_length)

    with database as session:
        documents = list(session.access.all_documents())
        ut.assert_equal(len(documents), 3)

        # (word, position in <documents>, whether the word should be found there)
        membership = [
            ('a', 0, True), ('e', 0, False),
            ('a', 1, True), ('e', 1, True), ('b', 1, False),
            ('a', 2, False), ('c', 2, False), ('e', 2, False),
        ]
        for token, position, present in membership:
            found = session.access.word(token) in documents[position]
            ut.assert_true(found is present)

        check_counts(session, 8, 13)

        # Remove the documents longest-first, checking what is left each time.
        for seqs_left, indexes_left in [(7, 7), (0, 0), (0, 0)]:
            session.remove(longest_document(session))
            check_counts(session, seqs_left, indexes_left)

        ut.assert_equal(len(list(session.access.all_documents())), 0)

    # A fresh session must still see nothing.
    with database as session:
        check_counts(session, 0, 0)