def assignment_b_suffixarray_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score, winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {"debug": False, "hit_count": hit_count}, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0
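# The test above only assumes that evaluate() scores a document by how many of its suffixes
# start with the normalized query, and that ties may be resolved either way. As a purely
# illustrative sketch (not the reference implementation), the counting step could look like the
# helper below; the sorted list of (document_id, offset) pairs and the range_for_query()
# callable are hypothetical stand-ins for whatever the actual suffix array keeps internally.
def _occurrence_ranking_sketch(suffix_entries, range_for_query, query):
    from collections import Counter

    # range_for_query() is assumed to return the half-open [start, end) slice of the sorted
    # suffix entries whose suffixes have the normalized query as a prefix, e.g., located via
    # two binary searches.
    start, end = range_for_query(query)

    # Count how many matching suffixes each document contributes; that count is the score.
    counts = Counter(document_id for document_id, _ in suffix_entries[start:end])
    return counts.most_common()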
def assignment_b_suffixarray_2():

    # For testing.
    class TestNormalizer(Normalizer):
        _table = str.maketrans({'Ø': 'O'})

        def canonicalize(self, buffer: str) -> str:
            return buffer

        def normalize(self, token: str) -> str:
            return token.upper().translate(self._table)

    # For testing.
    class TestDocument(Document):
        def __init__(self, document_id: int, a: str, b: str):
            self._document_id = document_id
            self._a = a
            self._b = b

        def get_document_id(self) -> int:
            return self._document_id

        def get_field(self, field_name: str, default: str) -> str:
            if field_name == "a":
                return self._a
            if field_name == "b":
                return self._b
            return default

    # For testing.
    class TestCorpus(Corpus):
        def __init__(self):
            self._docs = []
            self._docs.append(TestDocument(len(self._docs), "ø o\n\n\nø\n\no", "ø o\nø \no"))
            self._docs.append(TestDocument(len(self._docs), "ba", "b bab"))
            self._docs.append(TestDocument(len(self._docs), "ø o Ø o", "ø o"))
            self._docs.append(TestDocument(len(self._docs), "øO" * 10000, "o"))
            self._docs.append(TestDocument(len(self._docs), "cbab o øbab Ø ", "ø o " * 10000))

        def __iter__(self):
            return iter(self._docs)

        def size(self) -> int:
            return len(self._docs)

        def get_document(self, document_id: int) -> Document:
            return self._docs[document_id]

    # Run the tests!
    for fields in [("b",), ("a", "b")]:

        # Create the suffix array over the given set of fields. Measure memory usage. If memory usage is
        # excessive, most likely the implementation is copying strings or doing other silly stuff instead
        # of working with buffer indices. The naive reference implementation is not in any way optimized,
        # and uses about 1.5 MB of memory on this corpus.
        tracemalloc.start()
        snapshot1 = tracemalloc.take_snapshot()
        engine = SuffixArray(TestCorpus(), fields, TestNormalizer(), BrainDeadTokenizer())
        snapshot2 = tracemalloc.take_snapshot()
        for statistic in snapshot2.compare_to(snapshot1, "filename"):
            if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
                assert statistic.size_diff < 2000000, f"Memory usage is {statistic.size_diff}"
        tracemalloc.stop()

        results = []

        def process(m):
            results.append((m['document'].document_id, m['score']))

        expected_results = {
            ('b',): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 19999), (0, 3), (2, 1)]),
                ('o O', [(4, 19999), (0, 3), (2, 1)]),
                ('oooooo', []),
                ('o o o o', [(4, 19997), (0, 1)]),
            ),
            ('a', 'b'): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 20000), (0, 6), (2, 4)]),
                ('o O', [(4, 20000), (0, 6), (2, 4)]),
                ('oøØOøO', [(3, 1)]),
                ('o o o o', [(4, 19997), (0, 2), (2, 1)]),
            )
        }

        for query, expected in expected_results[fields]:
            results.clear()
            engine.evaluate(query, {'hit_count': 10}, process)
            assert results == expected
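# The memory assertion above mainly checks that the index stores (document_id, offset) pairs
# pointing into a shared normalized buffer per document instead of materializing one string per
# suffix. The sketch below illustrates that representation under an assumed `buffers` dict
# mapping document_id to its already-normalized text; it is not the reference implementation,
# and the cmp_to_key comparison is just one way to sort without keeping every suffix string
# alive at the same time.
def _build_suffix_entries_sketch(buffers):
    import functools

    # Each index entry is just a (document_id, offset) pair into the shared buffers.
    entries = [(document_id, offset)
               for document_id, buffer in buffers.items()
               for offset in range(len(buffer))]

    # Compare two entries by slicing their suffixes out of the shared buffers on demand,
    # so the slices are short-lived temporaries rather than stored alongside the index.
    def compare(x, y):
        a = buffers[x[0]][x[1]:]
        b = buffers[y[0]][y[1]:]
        return (a > b) - (a < b)

    entries.sort(key=functools.cmp_to_key(compare))
    return entries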
def assignment_b():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score, winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {"debug": False, "hit_count": hit_count}, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in ["romerike", "apple computer", "norsk", "norsk ørret", "sverige", "ørret", "banan"]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id, expected_matches) in [(0, ["wing", "wing"]),
                                            (3, ["solutions", "skin", "friction"]),
                                            (1254, ["electrons", "ions"])]:
        document = cranfield[document_id]
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches
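# The StringFinder tests above rely on the scan reporting every dictionary entry that occurs
# in the buffer, including overlapping matches such as "norsk", "norsk ørret" and "ørret".
# One common way to get that behaviour is to keep a set of live trie states while walking the
# token stream, as sketched below. The child() and is_final() callables are hypothetical
# stand-ins for whatever navigation the actual Trie class exposes, and joining tokens with a
# single space is a simplification of how the real match strings are produced.
def _scan_sketch(root, child, is_final, tokens, callback):
    live = []  # Pairs of (trie state, index of the token where that partial match started).
    for index, token in enumerate(tokens):
        live.append((root, index))  # Any position may start a new match.
        survivors = []
        for state, start in live:
            next_state = child(state, token)
            if next_state is None:
                continue  # This partial match cannot be extended by the current token.
            if is_final(next_state):
                callback(" ".join(tokens[start:index + 1]))  # A complete dictionary entry.
            survivors.append((next_state, start))
        live = survivors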