def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from ahocorasick import Trie, StringFinder

    print("Building trie from MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))

    # Build a trie over the normalized MeSH terms.
    dictionary = Trie()
    for document in corpus:
        dictionary.add(normalizer.normalize(normalizer.canonicalize(document["body"])), tokenizer)
    engine = StringFinder(dictionary, tokenizer)

    print("Enter some text and locate words and phrases that are MeSH terms.")

    # Collect all matches reported by the string finder.
    def evaluator(text):
        matches = []
        engine.scan(normalizer.normalize(normalizer.canonicalize(text)), lambda m: matches.append(m))
        return matches

    simple_repl("text", evaluator)

class TestBrainDeadNormalizer(unittest.TestCase):

    def setUp(self):
        from normalization import BrainDeadNormalizer
        self._normalizer = BrainDeadNormalizer()

    def test_canonicalize(self):
        self.assertEqual(self._normalizer.canonicalize("Dette ER en\nprØve!"), "Dette ER en\nprØve!")

    def test_normalize(self):
        self.assertEqual(self._normalizer.normalize("grÅFustaSJEOpphengsForKOBling"),
                         "gråfustasjeopphengsforkobling")

def assignment_a():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index.get_postings_iterator(term):
            print(posting)
        assert len(list(index.get_postings_iterator(term))) == expected_length

    # Test that we merge posting lists correctly. Note the implicit test for case and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] + [i for i in range(25265, 25282)])
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index.get_postings_iterator(terms[i]) for i in range(len(terms))]
        merged = {"AND": merger.intersection, "OR": merger.union}[operator](postings[0], postings[1])
        documents = [corpus.get_document(posting.document_id) for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id() for d in documents] == expected_document_ids

def setUp(self):
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryDocument, InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BetterRanker
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"title": "the foo", "static_quality_score": 0.9}))
    corpus.add_document(InMemoryDocument(1, {"title": "the foo", "static_quality_score": 0.2}))
    corpus.add_document(InMemoryDocument(2, {"title": "the foo foo", "static_quality_score": 0.2}))
    corpus.add_document(InMemoryDocument(3, {"title": "the bar"}))
    corpus.add_document(InMemoryDocument(4, {"title": "the bar bar"}))
    corpus.add_document(InMemoryDocument(5, {"title": "the baz"}))
    corpus.add_document(InMemoryDocument(6, {"title": "the baz"}))
    corpus.add_document(InMemoryDocument(7, {"title": "the baz baz"}))
    index = InMemoryInvertedIndex(corpus, ["title"], normalizer, tokenizer)
    self._ranker = BetterRanker(corpus, index)

def assignment_a_postingsmerger_1():
    # A small but real corpus.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus("./data/mesh.txt")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Test that we merge posting lists correctly. Note the implicit test for case and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] + [i for i in range(25265, 25282)])
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index[terms[i]] for i in range(len(terms))]
        merged = {"AND": merger.intersection, "OR": merger.union}[operator](postings[0], postings[1])
        documents = [corpus[posting.document_id] for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.document_id for d in documents] == expected_document_ids

def assignment_e_naivebayes_2():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    # Per Example 13.1, with add-one smoothing: P(c) = 3/4, P(Chinese|c) = 3/7, P(Tokyo|c) = P(Japan|c) = 1/14,
    # so P(c|d) ∝ 3/4 * (3/7)^3 * 1/14 * 1/14 ≈ 0.0003; likewise P(not c|d) ∝ 1/4 * (2/9)^3 * 2/9 * 2/9 ≈ 0.0001.
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)

def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine

    print("Indexing MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}

    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)

def assignment_a_inverted_index_1():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index[term])
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Document counts should be correct.
    assert index.get_document_frequency("wtf") == 0
    assert index.get_document_frequency("test") == 2
    assert index.get_document_frequency("prøve") == 1

def assignment_a_inverted_index_2():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("./data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index[term]:
            print(posting)
        assert len(list(index[term])) == expected_length

def assignment_c_simplesearchengine_1():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("../data/mesh.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    query = "polluTION Water"
    for match_threshold in [0.1, 1.0]:
        print(f"SEARCHING for '{query}' with match threshold {match_threshold}...")
        results.clear()
        options = {"match_threshold": match_threshold, "hit_count": 10, "debug": False}
        engine.evaluate(query, options, simple_ranker, match_collector)
        assert len(results) == {0.1: 10, 1.0: 3}[match_threshold]
        for (score, document_id) in [(match["score"], match["document"].document_id) for match in results[:3]]:
            assert score == 2.0  # Both 'pollution' and 'water'.
            assert document_id in [25274, 25275, 25276]
        for score in [match["score"] for match in results[3:]]:
            assert score == 1.0  # Only 'pollution' or 'water', but not both.

def assignment_d_betterranker():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    simple_ranker = BrainDeadRanker()
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " + ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {simple_ranker: [9221, 7263], better_ranker: [24]}[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids

def assignment_a_inverted_index_3():
    # Tests that multiple fields are handled correctly.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    doc = InMemoryDocument(document_id=0, fields={
        'felt 1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt 2': 'test er det',
        'felt 3': 'test TEsT',
    })
    corpus = InMemoryCorpus()
    corpus.add_document(doc)
    index = InMemoryInvertedIndex(corpus, ['felt 1', 'felt 3'], normalizer, tokenizer)
    p = next(index.get_postings_iterator('test'))
    print(f"term-freq: {p.term_frequency} (correct is 5)")
    assert p.document_id == 0
    assert p.term_frequency == 5

def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    training_set = {language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt")) for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results

    simple_repl("text", evaluator)

def assignment_b_suffixarray_1():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])                        # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of A", 10, [946])                        # Test robustness for case and whitespace.
    test3 = ("", 0, [])                                # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])   # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score, winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {"debug": False, "hit_count": hit_count}, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

def assignment_d_shinglegenerator_2():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams (shingles) and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    engine = SimpleSearchEngine(corpus, inverted_index)

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to how ties are resolved.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        engine.evaluate(query, options, ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids

def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray

    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}

    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)

def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex

    print("Building inverted index from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    print("Enter one or more index terms and inspect their posting lists.")

    def evaluator(terms):
        terms = index.get_terms(terms)
        return {term: list(index.get_postings_iterator(term)) for term in terms}

    simple_repl("terms", evaluator)

def assignment_e_naivebayes_1():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {language: InMemoryCorpus("data/" + language + ".txt")
                    for language in ["en", "no", "da", "de"]}

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

def assignment_c_simplesearchengine_3():
    # All accesses to posting lists are logged here.
    accesses = []

    # For testing.
    class AccessLoggedIterator(Iterator[Posting]):

        def __init__(self, term: str, wrapped: Iterator[Posting]):
            self._term = term
            self._wrapped = wrapped

        def __next__(self):
            posting = next(self._wrapped)
            accesses.append((self._term, posting.document_id))
            return posting

    # For testing.
    class AccessLoggedInvertedIndex(InvertedIndex):

        def __init__(self, wrapped: InvertedIndex):
            self._wrapped = wrapped

        def get_terms(self, buffer: str) -> Iterator[str]:
            return self._wrapped.get_terms(buffer)

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            return AccessLoggedIterator(term, self._wrapped.get_postings_iterator(term))

        def get_document_frequency(self, term: str) -> int:
            return self._wrapped.get_document_frequency(term)

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    corpus = InMemoryCorpus("data/mesh.txt")
    inverted_index = AccessLoggedInvertedIndex(InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer))

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    query = "Water polluTION"
    options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
    engine.evaluate(query, options, simple_ranker, lambda m: m)

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "water pollution".
    ordering1 = [('water', 3078), ('pollution', 788), ('pollution', 789), ('pollution', 790),
                 ('pollution', 8079), ('water', 8635), ('pollution', 23837), ('water', 9379),
                 ('water', 23234), ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269), ('water', 25270),
                 ('water', 25271), ('water', 25272), ('water', 25273), ('water', 25274),
                 ('water', 25275), ('pollution', 25275), ('water', 25276), ('pollution', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279), ('water', 25280),
                 ('water', 25281)]

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "pollution water".
    ordering2 = [('pollution', 788), ('water', 3078), ('pollution', 789), ('pollution', 790),
                 ('pollution', 8079), ('water', 8635), ('pollution', 23837), ('water', 9379),
                 ('water', 23234), ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269), ('water', 25270),
                 ('water', 25271), ('water', 25272), ('water', 25273), ('water', 25274),
                 ('pollution', 25275), ('water', 25275), ('pollution', 25276), ('water', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279), ('water', 25280),
                 ('water', 25281)]

    # Check that the posting lists have been accessed in a way that's consistent with document-at-a-time traversal.
    # Be somewhat robust to implementation details. This is a fairly strict test, and advanced (but valid)
    # implementations that for some reason do lookaheads or whatever might fail.
    assert accesses == ordering1 or accesses == ordering2

def assignment_c_simplesearchengine_2():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Used for comparing floating point numbers.
    epsilon = 0.0001

    # Create a dummy test corpus.
    corpus = InMemoryCorpus()
    words = (''.join(term) for term in product("bcd", "aei", "jkl"))
    texts = (' '.join(word) for word in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': text}))

    # What we're testing.
    engine = SimpleSearchEngine(corpus, InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer))

    # Where the callback will collect the matches.
    results = []

    # Callback that collects matches.
    def collect(m):
        results.append((m['score'], m['document'].document_id))

    # Executes a query.
    def search(q, t, n):
        results.clear()
        engine.evaluate(q, {'match_threshold': t, 'hit_count': n}, ranker, collect)

    # Sorts the collected matches.
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicate.
    def check_at(i, expected):
        assert results[i] == expected

    # Test predicate.
    def check_range(indices, score, document_ids):
        for i, d in zip(indices, document_ids):
            check_at(i, (score, d))

    # Test predicate.
    def check_hits(n):
        assert len(results) == n

    # Run tests!
    search('baj BAJ baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj caj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)

def setUp(self):
    from normalization import BrainDeadNormalizer
    self._normalizer = BrainDeadNormalizer()

def test_simple_search_engine():
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    # Used for comparing floating point numbers.
    Ɛ = 0.0001

    # Build a small synthetic corpus of three-word documents over a closed vocabulary.
    corpus = InMemoryCorpus()
    for txt in (' '.join(w) for w in combinations_with_replacement(
            list(''.join(t) for t in product('bcd', 'aei', 'jkl')), 3)):
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': txt}))

    # What we're testing.
    engine = SimpleSearchEngine(
        corpus, InMemoryInvertedIndex(corpus, ('a',), BrainDeadNormalizer(), BrainDeadTokenizer()))

    results = []

    # Executes a query and collects the matches.
    def search(q, r, n):
        results.clear()

        def match(m):
            results.append((m['score'], m['document'].document_id))

        print('searching "' + q + '" at threshold', r, '…')
        engine.evaluate(q, {'recall_threshold': r, 'hit_count': n}, BrainDeadRanker(), match)

    # Sorts the collected matches.
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicates.
    def check_at(i, expected):
        if results[i] != expected:
            print('FAILED, EXPECTED ', expected, ' RESULT', i, ' was', results[i])

    def check_range(indices, score, docrange):
        for i, d in zip(indices, docrange):
            check_at(i, (score, d))

    def check_hits(n):
        if len(results) != n:
            print('FAILED, expected', n, 'results, got', len(results))

    # Run tests!
    search('baj BAJ baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj CAj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + Ɛ, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + Ɛ, 100)  # here
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + Ɛ, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + Ɛ, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - Ɛ, 100)
    check_hits(100)

def assignment_d():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    simple_tokenizer = BrainDeadTokenizer()

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    shingle_generator = ShingleGenerator(3)
    shingle_inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, shingle_generator)
    shingle_engine = SimpleSearchEngine(corpus, shingle_inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to how ties are resolved.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        shingle_engine.evaluate(query, options, simple_ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer, simple_tokenizer)
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {"match_threshold": 0.1, "hit_count": hit_count, "debug": False}
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " + ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {simple_ranker: [9221, 7263], better_ranker: [24]}[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids

def assignment_e():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {language: InMemoryCorpus("data/" + language + ".txt")
                    for language in ["en", "no", "da", "de"]}

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)

def setUp(self):
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    self._normalizer = BrainDeadNormalizer()
    self._tokenizer = BrainDeadTokenizer()

def assignment_b():
    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])                        # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of A", 10, [946])                        # Test robustness for case and whitespace.
    test3 = ("", 0, [])                                # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])   # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust towards how ties are resolved.
    for (query, winner_score, winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {"debug": False, "hit_count": hit_count}, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in ["romerike", "apple computer", "norsk", "norsk ørret", "sverige", "ørret", "banan"]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id, expected_matches) in [(0, ["wing", "wing"]),
                                            (3, ["solutions", "skin", "friction"]),
                                            (1254, ["electrons", "ions"])]:
        document = cranfield[document_id]
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches