Example #1
def __test__ (ut) :
    from nlplib.core.process.index import Indexed
    from nlplib.core.model import Database, Document, Word, Gram

    # <Concordance> and <_test_window> come from the enclosing module and aren't imported here.
    _test_window(ut)

    document_strings = ['Python is a widely used general-purpose, high-level programming language.',
                        ('Its design philosophy emphasizes code readability, and its syntax allows programmers to '
                         'express concepts in fewer lines of code than would be possible in languages such as C.'),
                        ('The language provides constructs intended to enable clear programs on both a small and '
                         'large scale.'),
                        ('Stackless Python is  a significant fork of CPython that implements microthreads; it does not '
                         'use the C memory stack, thus allowing massively concurrent programs. PyPy also has a '
                         'stackless version')] # The double space in 'is  a' above is intentional; the <raw> test below depends on it.

    db = Database()

    # This builds our index for testing.
    with db as session :
        for document_string in document_strings :
            session.add(Document(document_string))

        indexed = Indexed(session)
        for document in session.access.all_documents() :
            indexed.add(document, max_gram_length=5)

    # Test the <Concordance> class against the index built above.
    with db as session :
        is_a = session.access.gram('is a')

        concordance_of_is_a = Concordance(is_a)
        ut.assert_equal(len(concordance_of_is_a), 2)

        ut.assert_equal(list(concordance_of_is_a), list(is_a.concordance()))

        grams_for_concordance = concordance_of_is_a.grams(before=1, after=2)
        correct_strings = ['Python is a widely used', 'Python is a significant fork']
        for gram, correct_string in zip(grams_for_concordance, correct_strings) :
            ut.assert_equal(gram, Gram(correct_string))

        # Tests <raw>; note the difference in the white space ('is a' vs. 'is  a').
        for raw_entry, correct_string in zip(concordance_of_is_a.raw(), ['is a', 'is  a']) :
            raw_string = raw_entry[2] # The raw matched text is the third item of each <raw> entry.

            ut.assert_equal(raw_string, correct_string)

        ut.assert_equal(len(concordance_of_is_a.documents().keys()), 2)

        # Neither of these occurs in any document, so their concordances below are empty.
        session.add(Word('foobar'))
        session.add(Gram('foo bar'))

        # Testing by glorified "word" counting.
        def test_count (access, string, count) :
            ut.assert_equal(len(Concordance(access(string))), count)

        for string, count in [('a', 4), ('of', 2), ('to', 2), ('and', 2), ('significant', 1), ('foobar', 0)] :
            test_count(session.access.word, string, count)

        for string, count in [('is a', 2), ('fork of', 1), ('foo bar', 0)] :
            test_count(session.access.gram, string, count)
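
A concordance is simply every place a word or gram occurs, paired with its surrounding context, which is what the grams(before=1, after=2) call above returns. As a rough, nlplib-independent illustration of the idea, here is a minimal sketch on whitespace-split tokens, not the library's actual implementation:

def concordance (tokens, target, before=1, after=2) :
    # Yield each occurrence of <target> along with <before> tokens of left
    # context and <after> tokens of right context.
    size = len(target)
    for i in range(len(tokens) - size + 1) :
        if tokens[i:i + size] == target :
            yield tokens[max(i - before, 0):i + size + after]

tokens = 'Python is a widely used general-purpose language'.split()
print(list(concordance(tokens, ['is', 'a'])))
# [['Python', 'is', 'a', 'widely', 'used']]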
Example #2
def make_db(db, amount=100):
    # Gather up to <amount> documents and index them in batches of ten, one
    # database session per batch. <chunked> and <gather_documents> are helpers
    # from the enclosing module (not shown here).
    total = 0
    for chunk in chunked(enumerate(gather_documents(amount), total + 1), 10, trail=True):
        with db as session:
            indexed = Indexed(session)
            for total, document in chunk:
                # Empty documents are skipped; everything else is stored and indexed.
                if len(document):
                    session.add(document)
                    indexed.add(document)
                    print(total, ":", repr(document))
    return total # The 1-based count of documents gathered.
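
make_db leans on a chunked(iterable, size, trail=True) helper that isn't shown above. A minimal sketch of what such a helper presumably does (yield fixed-size batches, plus a final partial batch when trail is true) follows; this is an assumption about its behavior, not nlplib's actual implementation:

from itertools import islice

def chunked(iterable, size, trail=False):
    # Hypothetical stand-in for the helper used above: yield lists of <size>
    # items, plus the trailing partial list when <trail> is true.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if len(chunk) == size:
            yield chunk
        else:
            if trail and chunk:
                yield chunk
            return

print(list(chunked(range(7), 3, trail=True)))  # [[0, 1, 2], [3, 4, 5], [6]]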
Example #3
def _test_document (ut) :
    # Tests the addition and removal of documents and associated objects from the database.

    db = Database()

    with db as session :
        indexed = Indexed(session)
        indexed.add(session.add(Document('a b b a c d')), max_gram_length=1)
        indexed.add(session.add(Document('a c d e')), max_gram_length=2)

        session.add(Document('a c')) # This document isn't indexed, so no associated objects.

    def test_counts (session, seq, index) :
        ut.assert_equal(len(list(session.access.all_seqs())), seq)
        ut.assert_equal(len(list(session.access.all_indexes())), index)

    def longest (session) :
        return max(session.access.all_documents(), key=lambda document : len(str(document)))

    with db as session :
        documents = list(session.access.all_documents())
        ut.assert_equal(len(documents), 3)

        ut.assert_true(session.access.word('a') in documents[0])
        ut.assert_true(session.access.word('e') not in documents[0])

        ut.assert_true(session.access.word('a') in documents[1])
        ut.assert_true(session.access.word('e') in documents[1])
        ut.assert_true(session.access.word('b') not in documents[1])

        # documents[2] ('a c') was never indexed, so membership checks find nothing in it.
        ut.assert_true(session.access.word('a') not in documents[2])
        ut.assert_true(session.access.word('c') not in documents[2])
        ut.assert_true(session.access.word('e') not in documents[2])

        # 'a b b a c d', indexed with unigrams only: seqs a, b, c, d and 6 index entries.
        # 'a c d e', indexed up to bigrams: new seqs e, 'a c', 'c d', 'd e' and 7 index entries.
        test_counts(session, 8, 13)
        # Removing the longest document ('a b b a c d') also removes 'b', its only exclusive seq.
        session.remove(longest(session))
        test_counts(session, 7, 7)
        # Removing 'a c d e' empties the index; the remaining document ('a c') was never indexed.
        session.remove(longest(session))
        test_counts(session, 0, 0)
        session.remove(longest(session)) # Removing the unindexed document changes nothing.
        test_counts(session, 0, 0)

        ut.assert_equal(len(list(session.access.all_documents())), 0)

    with db as session :
        test_counts(session, 0, 0)