def setUp(self):
    """
    Build a tiny in-memory corpus over a "title" field — some documents
    carrying an explicit static quality score, some not — index it, and
    wire up the BetterRanker under test.
    """
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus()
    # Document identifiers are assigned 0..7 in listing order, matching the
    # original explicit add_document calls one-for-one.
    contents = [
        {"title": "the foo", "static_quality_score": 0.9},
        {"title": "the foo", "static_quality_score": 0.2},
        {"title": "the foo foo", "static_quality_score": 0.2},
        {"title": "the bar"},
        {"title": "the bar bar"},
        {"title": "the baz"},
        {"title": "the baz"},
        {"title": "the baz baz"},
    ]
    for document_id, fields in enumerate(contents):
        corpus.add_document(in3120.InMemoryDocument(document_id, fields))
    index = in3120.InMemoryInvertedIndex(corpus, ["title"], normalizer, tokenizer)
    self.__ranker = in3120.BetterRanker(corpus, index)
def repl_a():
    """
    Interactive loop: index the Cranfield corpus over its "body" field and
    let the user type index terms to dump the corresponding posting lists.
    """
    print("Building inverted index from Cranfield corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("cran.xml"))
    index = in3120.InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    print("Enter one or more index terms and inspect their posting lists.")

    def lookup(buffer):
        # Map each recognized index term in the buffer to its materialized
        # posting list.
        return {term: list(index.get_postings_iterator(term))
                for term in index.get_terms(buffer)}

    simple_repl("terms", lookup)
def repl_b_2():
    """
    Interactive loop: load MeSH terms into a trie and scan free text the
    user types for words/phrases that are MeSH terms.
    """
    print("Building trie from MeSH corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("mesh.txt"))

    def clean(text):
        # Both the dictionary entries and the scanned buffer go through the
        # same canonicalize-then-normalize pipeline.
        return normalizer.normalize(normalizer.canonicalize(text))

    dictionary = in3120.Trie()
    dictionary.add((clean(d["body"]) for d in corpus), tokenizer)
    engine = in3120.StringFinder(dictionary, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")
    simple_repl("text", lambda buffer: list(engine.scan(clean(buffer))))
def repl_e():
    """
    Interactive loop: train a naive Bayes language classifier from one news
    corpus per language and classify whatever text the user types.

    Scores emitted by the classifier are log-probabilities.
    """
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    # One training corpus per language, loaded from "<language>.txt".
    training_set = {language: in3120.InMemoryCorpus(data_path(f"{language}.txt"))
                    for language in languages}
    classifier = in3120.NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    # Fixed: this literal had a stray f-prefix with no placeholders (lint F541);
    # the printed output is byte-identical.
    print("Returned scores are log-probabilities.")
    simple_repl("text", lambda t: list(classifier.classify(t)))
def repl_b_1():
    """
    Interactive loop: build a suffix array over the Cranfield corpus and
    evaluate prefix phrase queries against it. Returned scores are
    occurrence counts.
    """
    print("Building suffix array from Cranfield corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("cran.xml"))
    engine = in3120.SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

    def evaluate(query):
        return list(engine.evaluate(query, options))

    simple_repl("query", evaluate)
def test_mesh_corpus(self):
    """
    Index the MeSH corpus and check that AND/OR query evaluation via the
    merger produces the expected document identifiers.
    """
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    # NOTE(review): path is hard-coded here while the REPL helpers use
    # data_path(...) — presumably intentional for the test layout; confirm.
    corpus = in3120.InMemoryCorpus("../data/mesh.txt")
    index = in3120.InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    # Intersection: both terms must match (case-insensitive via normalization).
    expected_and = [11316, 11319, 11320, 11321]
    self.__process_query_with_two_terms(corpus, index, "HIV pROtein",
                                        self.__merger.intersection, expected_and)
    # Union: either term matches; the tail is the contiguous run 25265..25281.
    expected_or = [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] + list(range(25265, 25282))
    self.__process_query_with_two_terms(corpus, index, "water Toxic",
                                        self.__merger.union, expected_or)
def repl_d_2():
    """
    Interactive loop: index the English news corpus and run ranked retrieval
    with SimpleSearchEngine + BetterRanker over user-supplied queries.
    """
    print("Indexing English news corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("en.txt"))
    index = in3120.InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = in3120.BetterRanker(corpus, index)
    engine = in3120.SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluate(query):
        return list(engine.evaluate(query, options, ranker))

    simple_repl("query", evaluate)
def setUp(self):
    """Create the shared normalizer/tokenizer pair used by this fixture."""
    # Assignments are independent; order is immaterial.
    self.__tokenizer = in3120.BrainDeadTokenizer()
    self.__normalizer = in3120.BrainDeadNormalizer()
def setUp(self):
    """Create the helpers for this fixture: a 3-gram shingle tokenizer plus a normalizer."""
    # Assignments are independent; order is immaterial.
    self.__normalizer = in3120.BrainDeadNormalizer()
    self.__tokenizer = in3120.ShingleGenerator(3)