Example #1
import re

# The module paths below are assumptions about the assignment's codebase;
# only the `corpus` module is confirmed by the test in Example #2.
from corpus import InMemoryDocument, InMemoryCorpus
from invertedindex import InMemoryInvertedIndex
from normalization import BrainDeadNormalizer
from tokenization import BrainDeadTokenizer
from postingsmerger import PostingsMerger


def assignment_a():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
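    # The index maps each normalized term to a postings list: one
    # (document_id, term_frequency) pair per document containing the term,
    # which is exactly what the assertions below spell out.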
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency)
                for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index.get_postings_iterator(term):
            print(posting)
        assert len(list(index.get_postings_iterator(term))) == expected_length

    # Test that we merge posting lists correctly. Note the implicit test for
    # case and whitespace robustness. (A standalone sketch of the intersection
    # merge follows after this example.)
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                [i for i in range(25265, 25282)])
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [
            index.get_postings_iterator(terms[i]) for i in range(len(terms))
        ]
        merged = {
            "AND": merger.intersection,
            "OR": merger.union
        }[operator](postings[0], postings[1])
        documents = [
            corpus.get_document(posting.document_id) for posting in merged
        ]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id()
                for d in documents] == expected_document_ids
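The AND branch above delegates to PostingsMerger.intersection. As a point of reference, here is a minimal sketch of the linear two-pointer merge such a method is expected to perform, assuming both iterators yield postings sorted by document_id; the Posting dataclass is a hypothetical stand-in for whatever posting type the assignment defines.

from dataclasses import dataclass
from typing import Iterator


@dataclass
class Posting:
    # Hypothetical stand-in; the assignment's own posting type exposes the
    # same two attributes, judging by the assertions in Example #1.
    document_id: int
    term_frequency: int


def intersection(p1: Iterator[Posting], p2: Iterator[Posting]) -> Iterator[Posting]:
    # Advance two cursors in lockstep; emit only document ids present in both
    # streams. Runs in time linear in the combined length of the two lists.
    a, b = next(p1, None), next(p2, None)
    while a is not None and b is not None:
        if a.document_id == b.document_id:
            yield a
            a, b = next(p1, None), next(p2, None)
        elif a.document_id < b.document_id:
            a = next(p1, None)
        else:
            b = next(p2, None)

The OR merge (union) is analogous, except that a posting is emitted on every iteration and the cursor with the smaller document_id is always the one consumed.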
Example #2
import unittest

# An enclosing TestCase is assumed here; the original snippet shows only the method.
class TestInMemoryCorpus(unittest.TestCase):

    def test_access_documents(self):
        from corpus import InMemoryDocument, InMemoryCorpus
        corpus = InMemoryCorpus()
        corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
        corpus.add_document(InMemoryDocument(1, {"title": "prØve", "body": "en to tre"}))
        self.assertEqual(corpus.size(), 2)
        self.assertListEqual([d.document_id for d in corpus], [0, 1])
        self.assertListEqual([corpus[i].document_id for i in range(0, corpus.size())], [0, 1])
        self.assertListEqual([corpus.get_document(i).document_id for i in range(0, corpus.size())], [0, 1])
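The test above pins down the whole corpus surface it relies on: size(), iteration, integer indexing, get_document(), and a document_id attribute. A minimal sketch of classes satisfying that contract might look as follows; this is an illustrative stand-in, not the course's actual implementation.

class InMemoryDocumentSketch:
    # A document is just an identifier plus a dictionary of named fields.
    def __init__(self, document_id, fields):
        self.document_id = document_id
        self._fields = fields

    def __getitem__(self, field):
        return self._fields.get(field)


class InMemoryCorpusSketch:
    # An append-only list of documents with the access paths the test uses.
    def __init__(self):
        self._documents = []

    def add_document(self, document):
        self._documents.append(document)

    def size(self):
        return len(self._documents)

    def __iter__(self):
        return iter(self._documents)

    def __getitem__(self, index):
        return self._documents[index]

    def get_document(self, document_id):
        # Assumes document ids coincide with insertion order, as in the test.
        return self._documents[document_id]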
Example #3
# Module names below are assumed, mirroring the imports added in Example #1;
# only `corpus` is confirmed by the test in Example #2.
from corpus import InMemoryCorpus
from tokenization import BrainDeadTokenizer
from trie import Trie
from stringfinder import StringFinder


def assignment_b_stringfinder():

    # Use these throughout below.
    tokenizer = BrainDeadTokenizer()
    results = []

    # Simple test of using a trie-encoded dictionary for efficiently locating
    # substrings in a buffer. (A standalone sketch of this scanning idea
    # follows after this example.)
    trie = Trie()
    for s in ["romerike", "apple computer", "norsk", "norsk ørret",
              "sverige", "ørret", "banan"]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk  ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results
            ] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id, expected_matches) in [
            (0, ["wing", "wing"]),
            (3, ["solutions", "skin", "friction"]),
            (1254, ["electrons", "ions"])]:
        document = cranfield.get_document(document_id)
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches