def assignment_a_postingsmerger_1():
    """Check AND/OR merging of posting lists on the corpus in ./data/mesh.txt.

    Builds an in-memory inverted index over the "body" field, then for one
    two-term AND query and one two-term OR query merges the two posting
    lists and asserts that the resulting document ids equal the expected
    ids, in order.

    Raises:
        AssertionError: if a query does not yield exactly two terms, or the
            merged documents differ from the expected ones.
    """
    # A small but real corpus.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus("./data/mesh.txt")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Test that we merge posting lists correctly. Note implicit test for
    # case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    # Dispatch table from the query operator to the merge routine; built
    # once instead of on every loop iteration.
    merge_operations = {"AND": merger.intersection, "OR": merger.union}
    and_query = ("HIV pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = (
        "water Toxic",
        "OR",
        [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985]
        + list(range(25265, 25282)),
    )
    for query, operator, expected_document_ids in [and_query, or_query]:
        # Raw string: "\W" in a plain literal is an invalid escape sequence
        # and triggers a warning on modern Python versions.
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index[term] for term in terms]
        merged = merge_operations[operator](postings[0], postings[1])
        documents = [corpus[posting.document_id] for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.document_id for d in documents] == expected_document_ids
def assignment_a():
    """Exercise indexing and posting-list merging end to end.

    Three stages, each verified with assertions:

    1. Index a two-document dummy corpus and check the postings
       (document id, term frequency) returned for each query term.
    2. Index the corpus in data/mesh.txt and check the posting-list length
       for two known terms.
    3. Merge posting lists for a two-term AND query and a two-term OR query
       and check the resulting document ids, in order.

    Raises:
        AssertionError: if any observed value differs from the expected one.
    """
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    expected_postings = [[(1, 1)], [], [(0, 1), (1, 2)]]
    for term, expected in zip(index.get_terms("PRøvE wtf tesT"), expected_postings):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for term, expected_length in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        # Materialize once so printing and counting use the same postings
        # instead of requesting a second iterator just to count.
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == expected_length

    # Test that we merge posting lists correctly. Note implicit test for
    # case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    # Dispatch table from the query operator to the merge routine; built
    # once instead of on every loop iteration.
    merge_operations = {"AND": merger.intersection, "OR": merger.union}
    and_query = ("HIV pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = (
        "water Toxic",
        "OR",
        [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985]
        + list(range(25265, 25282)),
    )
    for query, operator, expected_document_ids in [and_query, or_query]:
        # Raw string: "\W" in a plain literal is an invalid escape sequence
        # and triggers a warning on modern Python versions.
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index.get_postings_iterator(term) for term in terms]
        merged = merge_operations[operator](postings[0], postings[1])
        documents = [corpus.get_document(posting.document_id) for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id() for d in documents] == expected_document_ids
def assignment_a_postingsmerger_3():
    """Verify that intersection and union do not depend on argument order.

    Two three-element posting lists that share document ids 2 and 3 are
    merged in both argument orders. The intersection must contain two ids
    and the union four, and each result must be the same in either order.

    Raises:
        AssertionError: if a result has the wrong length or the two
            argument orders disagree.
    """
    # Argument order shouldn't matter.
    merger = PostingsMerger()
    first = [Posting(1, 0), Posting(2, 0), Posting(3, 0)]
    second = [Posting(2, 0), Posting(3, 0), Posting(6, 0)]

    def merged_ids(merge, left, right):
        # Feed fresh iterators to the merge routine and keep only the ids.
        return [p.document_id for p in merge(iter(left), iter(right))]

    forward = merged_ids(merger.intersection, first, second)
    backward = merged_ids(merger.intersection, second, first)
    print(forward)
    print(backward)
    assert len(forward) == 2
    assert forward == backward

    forward = merged_ids(merger.union, first, second)
    backward = merged_ids(merger.union, second, first)
    print(forward)
    print(backward)
    assert len(forward) == 4
    assert forward == backward
def assignment_a_postingsmerger_2():
    """Check intersection and union when one or both inputs are empty.

    Intersection with an empty input must be empty. The union of two empty
    inputs must be empty, and the union of an empty input with a single
    posting must yield exactly that posting's document id.

    NOTE(review): another function with this same name is defined later in
    this file; if both live in one module, the later one shadows this one.

    Raises:
        AssertionError: if any merge result differs from the expected one.
    """
    # Test some corner cases with empty lists.
    merger = PostingsMerger()
    posting = Posting(123, 4)

    for left, right in ([], []), ([], [posting]), ([posting], []):
        assert list(merger.intersection(iter(left), iter(right))) == []

    assert list(merger.union(iter([]), iter([]))) == []
    for left, right in ([], [posting]), ([posting], []):
        merged_ids = [p.document_id for p in merger.union(iter(left), iter(right))]
        assert merged_ids == [posting.document_id]
def assignment_a_postingsmerger_2():
    """Check intersection and union when one or both inputs are empty.

    Intersection with an empty input must be empty. The union of two empty
    inputs must be empty, and the union of an empty input with a single
    posting must compare equal to a list holding just that posting.

    NOTE(review): an earlier function with this same name exists in this
    file; if both live in one module, this definition replaces it.

    Raises:
        AssertionError: if any merge result differs from the expected one.
    """
    # Test some corner cases with empty lists.
    merger = PostingsMerger()
    posting = Posting(0, 0)

    empty_cases = (([], []), ([], [posting]), ([posting], []))
    for left, right in empty_cases:
        intersected = list(merger.intersection(iter(left), iter(right)))
        assert intersected == []

    assert list(merger.union(iter([]), iter([]))) == []
    for left, right in empty_cases[1:]:
        united = list(merger.union(iter(left), iter(right)))
        assert united == [posting]