Пример #1
0
 def test_access_documents(self):
     """Check that documents added to an in-memory corpus can be read back.

     Two documents are added with ids 0 and 1. They are then retrieved by
     iterating the corpus, by subscripting it, and through ``get_document``;
     every route must yield the ids 0 and 1, in that order.
     """
     from corpus import InMemoryDocument, InMemoryCorpus

     store = InMemoryCorpus()
     field_sets = (
         {"body": "this is a Test"},
         {"title": "prØve", "body": "en to tre"},
     )
     for doc_id, fields in enumerate(field_sets):
         store.add_document(InMemoryDocument(doc_id, fields))

     expected_ids = [0, 1]
     self.assertEqual(store.size(), 2)

     # Route 1: plain iteration over the corpus.
     iterated = [doc.document_id for doc in store]
     self.assertListEqual(iterated, expected_ids)

     # Route 2: subscript access by position.
     indexed = [store[pos].document_id for pos in range(store.size())]
     self.assertListEqual(indexed, expected_ids)

     # Route 3: the explicit accessor method.
     fetched = [store.get_document(pos).document_id for pos in range(store.size())]
     self.assertListEqual(fetched, expected_ids)
Пример #2
0
 def test_synthetic_corpus(self):
     """Run a fixed table of queries against a generated corpus.

     The corpus is built from the 27 three-letter words formed by taking one
     letter from each of "bcd", "aei" and "jkl". Every document holds three
     of those words (combinations with replacement) in field "a".

     Each table row is handed to ``self._process_query_verify_matches``
     together with the engine and an options dict. The last element of a
     row is forwarded unchanged; it presumably encodes (number of matches,
     top score, document ids) -- confirm against that helper.
     """
     from itertools import product, combinations_with_replacement
     from corpus import InMemoryDocument, InMemoryCorpus
     from invertedindex import InMemoryInvertedIndex
     from searchengine import SimpleSearchEngine

     corpus = InMemoryCorpus()
     vocabulary = ["".join(letters) for letters in product("bcd", "aei", "jkl")]
     for triple in combinations_with_replacement(vocabulary, 3):
         # The next free position doubles as the document id.
         corpus.add_document(InMemoryDocument(corpus.size(), {"a": " ".join(triple)}))

     index = InMemoryInvertedIndex(corpus, ["a"], self._normalizer, self._tokenizer)
     engine = SimpleSearchEngine(corpus, index)

     # Thresholds placed just above and just below two thirds.
     epsilon = 0.0001
     above_two_thirds = 2/3 + epsilon
     below_two_thirds = 2/3 - epsilon

     # (query, match threshold, hit count, expectation passed to the helper)
     cases = (
         ("baj BAJ    baj", 1.0, 27, (27, 9.0, [0])),
         ("baj caj", 1.0, 100, (27, None, None)),
         ("baj caj daj", above_two_thirds, 100, (79, None, None)),
         ("baj caj", above_two_thirds, 100, (100, 3.0, [0, 9, 207, 2514])),
         ("baj cek dil", 1.0, 10, (1, 3.0, [286])),
         ("baj cek dil", 1.0, 10, (1, None, None)),
         ("baj cek dil", above_two_thirds, 80, (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198])),
         ("baj xxx yyy", above_two_thirds, 100, (0, None, None)),
         ("baj xxx yyy", below_two_thirds, 100, (100, None, None)),
     )
     for query, threshold, hit_count, expected in cases:
         options = {"match_threshold": threshold, "hit_count": hit_count}
         self._process_query_verify_matches(query, engine, options, expected)
Пример #3
0
def assignment_c_simplesearchengine_2():
    """Exercise SimpleSearchEngine on a generated corpus and assert on the hits.

    The corpus is built from the 27 three-letter words formed by taking one
    letter from each of "bcd", "aei" and "jkl"; every document holds three of
    those words (combinations with replacement) in field 'a'. Each query is
    evaluated with a match threshold and a hit count, the reported
    (score, document id) pairs are collected, and plain ``assert`` statements
    check how many pairs came back and what some of them are.

    Raises:
        AssertionError: if any expectation is not met.
    """
    # Shared collaborators for the index and for ranking.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Small offset used to place a threshold just above/below two thirds.
    epsilon = 0.0001
    above_two_thirds = 2 / 3 + epsilon
    below_two_thirds = 2 / 3 - epsilon

    # Build the synthetic corpus; the next free position is the document id.
    corpus = InMemoryCorpus()
    vocabulary = [''.join(letters) for letters in product("bcd", "aei", "jkl")]
    for triple in combinations_with_replacement(vocabulary, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {'a': ' '.join(triple)}))

    # The engine under test.
    index = InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer)
    engine = SimpleSearchEngine(corpus, index)

    # (score, document id) pairs reported for the most recent query.
    hits = []

    def remember(match):
        """Callback for the engine: record one reported match."""
        hits.append((match['score'], match['document'].document_id))

    def run_query(query, threshold, limit):
        """Discard earlier hits, then evaluate the query."""
        del hits[:]
        options = {'match_threshold': threshold, 'hit_count': limit}
        engine.evaluate(query, options, ranker, remember)

    def rank_hits():
        """Order hits by descending score, ties by ascending document id."""
        hits.sort(key=lambda pair: (-pair[0], pair[1]))

    def expect_count(n):
        """Assert that exactly n hits were collected."""
        assert len(hits) == n

    def expect_at(position, pair):
        """Assert that the hit at the given position equals the pair."""
        assert hits[position] == pair

    def expect_run(positions, score, document_ids):
        """Assert a run of hits sharing one score, pairing positions with ids."""
        for position, document_id in zip(positions, document_ids):
            expect_at(position, (score, document_id))

    # One repeated term, full match required.
    run_query('baj BAJ    baj', 1.0, 27)
    expect_count(27)
    expect_at(0, (9.0, 0))
    rank_hits()
    expect_run(range(1, 27), 6.0, range(1, 27))

    # Two terms, full match required.
    run_query('baj caj', 1.0, 100)
    expect_count(27)

    # Three terms, threshold just above two thirds.
    run_query('baj caj daj', above_two_thirds, 100)
    expect_count(79)

    # Two terms, threshold just above two thirds.
    run_query('baj caj', above_two_thirds, 100)
    expect_count(100)
    rank_hits()
    expect_at(0, (3.0, 0))
    expect_run(range(4, 12), 2.0, range(1, 9))
    expect_run(range(12, 29), 2.0, range(10, 27))
    expect_at(29, (2.0, 35))
    expect_at(78, (2.0, 2531))

    # Three distinct terms, full match required.
    run_query('baj cek dil', 1.0, 10)
    expect_count(1)
    expect_at(0, (3.0, 286))

    # Same three terms, threshold just above two thirds.
    run_query('baj cek dil', above_two_thirds, 80)
    expect_count(79)
    rank_hits()
    expect_at(0, (3.0, 13))
    expect_at(1, (3.0, 26))
    expect_at(2, (3.0, 273))

    # Only one of the three terms occurs in the corpus vocabulary.
    run_query('baj xxx yyy', above_two_thirds, 100)
    expect_count(0)
    run_query('baj xxx yyy', below_two_thirds, 100)
    expect_count(100)
Пример #4
0
def test_simple_search_engine():
    """Query SimpleSearchEngine on a generated corpus and print any mismatch.

    The corpus is built from the 27 three-letter words formed by taking one
    letter from each of 'bcd', 'aei' and 'jkl'; every document holds three of
    those words (combinations with replacement) in field 'a'. Each query is
    evaluated with a 'recall_threshold' and a 'hit_count', and the reported
    (score, document id) pairs are collected.

    Expectations that do not hold are reported with ``print`` and are not
    raised as assertion errors.
    """
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    # Small offset used to place a threshold just above/below two thirds.
    epsilon = 0.0001
    corpus = InMemoryCorpus()

    # Build the synthetic corpus; the next free position is the document id.
    vocabulary = [''.join(letters) for letters in product('bcd', 'aei', 'jkl')]
    for triple in combinations_with_replacement(vocabulary, 3):
        corpus.add_document(
            InMemoryDocument(corpus.size(), {'a': ' '.join(triple)}))

    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    index = InMemoryInvertedIndex(corpus, ('a', ), normalizer, tokenizer)
    engine = SimpleSearchEngine(corpus, index)

    # (score, document id) pairs reported for the most recent query.
    hits = []

    def remember(match):
        """Callback for the engine: record one reported match."""
        hits.append((match['score'], match['document'].document_id))

    def search(query, threshold, limit):
        """Discard earlier hits, announce the query, then evaluate it."""
        del hits[:]
        print('searching "' + query + '" at threshold', threshold, '…')
        options = {'recall_threshold': threshold, 'hit_count': limit}
        # A fresh ranker is created for every query.
        engine.evaluate(query, options, BrainDeadRanker(), remember)

    def sort_results():
        """Order hits by descending score, ties by ascending document id."""
        hits.sort(key=lambda pair: (-pair[0], pair[1]))

    def check_at(position, expected):
        """Print a failure line when the hit at position differs from expected."""
        actual = hits[position]
        if actual != expected:
            print('FAILED, EXPECTED ', expected, ' RESULT', position, ' was',
                  actual)

    def check_range(positions, score, document_ids):
        """Check a run of hits sharing one score, pairing positions with ids."""
        for position, document_id in zip(positions, document_ids):
            check_at(position, (score, document_id))

    def check_hits(n):
        """Print a failure line when the number of hits is not n."""
        count = len(hits)
        if count != n:
            print('FAILED, expected', n, 'results, got', count)

    # One repeated term, threshold 1.0.
    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))

    # Two terms in mixed case, threshold 1.0.
    search('baj CAj', 1.0, 100)
    check_hits(27)

    # Three terms, threshold just above two thirds.
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)

    # Two terms, threshold just above two thirds.
    search('baj caj', 2 / 3 + epsilon, 100)  # here
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))

    # Three distinct terms, threshold 1.0.
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))

    # Same three terms, threshold just above two thirds.
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))

    # Only one of the three terms occurs in the corpus vocabulary.
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)