Example #1
def assignment_a_postingsmerger_1():

    # A small but real corpus.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus("./data/mesh.txt")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    # Test that we merge posting lists correctly. Note implicit test for case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                list(range(25265, 25282)))
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [index[terms[i]] for i in range(len(terms))]
        merged = {
            "AND": merger.intersection,
            "OR": merger.union
        }[operator](postings[0], postings[1])
        documents = [corpus[posting.document_id] for posting in merged]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.document_id for d in documents] == expected_document_ids
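
The merger interface exercised above takes two posting iterators that are sorted by document id. A minimal sketch of the classic linear AND/OR merge is given below; the Posting shape and the method names mirror the example, but the bodies are illustrative assumptions rather than the course's reference implementation.

from typing import Iterator, NamedTuple


class Posting(NamedTuple):
    # Assumed minimal posting shape; the real class carries at least these two fields.
    document_id: int
    term_frequency: int


class SketchPostingsMerger:
    """Linear merge of two posting iterators, assuming both are sorted by document_id."""

    @staticmethod
    def intersection(p1: Iterator[Posting], p2: Iterator[Posting]) -> Iterator[Posting]:
        a, b = next(p1, None), next(p2, None)
        while a is not None and b is not None:
            if a.document_id == b.document_id:
                yield a
                a, b = next(p1, None), next(p2, None)
            elif a.document_id < b.document_id:
                a = next(p1, None)
            else:
                b = next(p2, None)

    @staticmethod
    def union(p1: Iterator[Posting], p2: Iterator[Posting]) -> Iterator[Posting]:
        a, b = next(p1, None), next(p2, None)
        while a is not None and b is not None:
            if a.document_id == b.document_id:
                yield a
                a, b = next(p1, None), next(p2, None)
            elif a.document_id < b.document_id:
                yield a
                a = next(p1, None)
            else:
                yield b
                b = next(p2, None)
        # Drain whichever posting list still has entries left.
        while a is not None:
            yield a
            a = next(p1, None)
        while b is not None:
            yield b
            b = next(p2, None)


# Tiny usage example with made-up postings.
assert [p.document_id for p in SketchPostingsMerger.intersection(
    iter([Posting(1, 1), Posting(3, 1), Posting(5, 1)]),
    iter([Posting(3, 2), Posting(4, 1), Posting(5, 1)]))] == [3, 5]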
Example #2
File: repl.py Project: 181221/IN4120-SOEK
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    print("Indexing English news corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'en.txt'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
Example #3
def assignment_a_inverted_index_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"), [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index[term])
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency) for p in postings] == expected
    print(index)

    # Document counts should be correct.
    assert index.get_document_frequency("wtf") == 0
    assert index.get_document_frequency("test") == 2
    assert index.get_document_frequency("prøve") == 1
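
The calls used above (get_terms, get_document_frequency, and iteration over index[term]) can be served by little more than a dictionary from normalized term to a document-ordered posting list. The sketch below is an assumption about how such an index could be built, not the reference InMemoryInvertedIndex.

from collections import Counter, defaultdict
from typing import NamedTuple


class Posting(NamedTuple):
    document_id: int
    term_frequency: int


class SketchInvertedIndex:
    """Maps each normalized term to its posting list, built in one pass over the corpus."""

    def __init__(self, corpus, fields, normalizer, tokenizer):
        self._normalizer = normalizer
        self._tokenizer = tokenizer
        self._postings = defaultdict(list)
        for document in corpus:  # Assumes iteration yields documents in increasing id order.
            counts = Counter(normalizer.normalize(token)
                             for field in fields
                             for token in tokenizer.strings(document[field] or ""))
            for term, frequency in counts.items():
                self._postings[term].append(Posting(document.document_id, frequency))

    def get_terms(self, buffer):
        return (self._normalizer.normalize(token)
                for token in self._tokenizer.strings(buffer))

    def get_document_frequency(self, term):
        return len(self._postings.get(term, []))

    def __getitem__(self, term):
        return iter(self._postings.get(term, []))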
Example #4
def assignment_e_naivebayes_2():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
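
The two asserted values follow directly from a multinomial model with add-one smoothing over the six-term vocabulary, exactly as worked out for Example 13.1 in the textbook. The arithmetic below reproduces them without touching the classifier.

import math

# Priors: three of the four training documents belong to "china".
prior_china, prior_not_china = 3 / 4, 1 / 4

# Add-one smoothed likelihoods: the "china" class has 8 tokens, "not china" has 3,
# and the vocabulary holds 6 distinct terms.
p_chinese_c, p_tokyo_c, p_japan_c = 6 / 14, 1 / 14, 1 / 14
p_chinese_n, p_tokyo_n, p_japan_n = 2 / 9, 2 / 9, 2 / 9

# Score "Chinese Chinese Chinese Tokyo Japan" under each class.
score_china = prior_china * p_chinese_c ** 3 * p_tokyo_c * p_japan_c
score_not_china = prior_not_china * p_chinese_n ** 3 * p_tokyo_n * p_japan_n

assert math.isclose(score_china, 0.0003, abs_tol=0.00001)      # Roughly 0.000301.
assert math.isclose(score_not_china, 0.0001, abs_tol=0.00005)  # Roughly 0.000135.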
Example #5
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from ahocorasick import Trie, StringFinder
    print("Building trie from MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    dictionary = Trie()
    for document in corpus:
        dictionary.add(
            normalizer.normalize(normalizer.canonicalize(document["body"])),
            tokenizer)
    engine = StringFinder(dictionary, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")

    def evaluator(text):
        matches = []
        engine.scan(normalizer.normalize(normalizer.canonicalize(text)),
                    lambda m: matches.append(m))
        return matches

    simple_repl("text", evaluator)
Example #6
 def setUp(self):
     from normalization import BrainDeadNormalizer
     from tokenization import BrainDeadTokenizer
     from corpus import InMemoryDocument, InMemoryCorpus
     from invertedindex import InMemoryInvertedIndex
     from ranking import BetterRanker
     normalizer = BrainDeadNormalizer()
     tokenizer = BrainDeadTokenizer()
     corpus = InMemoryCorpus()
     corpus.add_document(
         InMemoryDocument(0, {
             "title": "the foo",
             "static_quality_score": 0.9
         }))
     corpus.add_document(
         InMemoryDocument(1, {
             "title": "the foo",
             "static_quality_score": 0.2
         }))
     corpus.add_document(
         InMemoryDocument(2, {
             "title": "the foo foo",
             "static_quality_score": 0.2
         }))
     corpus.add_document(InMemoryDocument(3, {"title": "the bar"}))
     corpus.add_document(InMemoryDocument(4, {"title": "the bar bar"}))
     corpus.add_document(InMemoryDocument(5, {"title": "the baz"}))
     corpus.add_document(InMemoryDocument(6, {"title": "the baz"}))
     corpus.add_document(InMemoryDocument(7, {"title": "the baz baz"}))
     index = InMemoryInvertedIndex(corpus, ["title"], normalizer, tokenizer)
     self._ranker = BetterRanker(corpus, index)
Example #7
def assignment_a():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a dummy two-document corpus.
    print("INDEXING...")
    corpus = InMemoryCorpus()
    corpus.add_document(InMemoryDocument(0, {"body": "this is a Test"}))
    corpus.add_document(InMemoryDocument(1, {"body": "test TEST prØve"}))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected) in zip(index.get_terms("PRøvE wtf tesT"),
                                [[(1, 1)], [], [(0, 1), (1, 2)]]):
        print(term)
        assert term in ["prøve", "wtf", "test"]
        postings = list(index.get_postings_iterator(term))
        for posting in postings:
            print(posting)
        assert len(postings) == len(expected)
        assert [(p.document_id, p.term_frequency)
                for p in postings] == expected
    print(index)

    # Again, for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index.get_postings_iterator(term):
            print(posting)
        assert len(list(index.get_postings_iterator(term))) == expected_length

    # Test that we merge posting lists correctly. Note implicit test for case- and whitespace robustness.
    print("MERGING...")
    merger = PostingsMerger()
    and_query = ("HIV  pROtein", "AND", [11316, 11319, 11320, 11321])
    or_query = ("water Toxic", "OR",
                [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985] +
                list(range(25265, 25282)))
    for (query, operator, expected_document_ids) in [and_query, or_query]:
        print(re.sub(r"\W+", " " + operator + " ", query))
        terms = list(index.get_terms(query))
        assert len(terms) == 2
        postings = [
            index.get_postings_iterator(terms[i]) for i in range(len(terms))
        ]
        merged = {
            "AND": merger.intersection,
            "OR": merger.union
        }[operator](postings[0], postings[1])
        documents = [
            corpus.get_document(posting.document_id) for posting in merged
        ]
        print(*documents, sep="\n")
        assert len(documents) == len(expected_document_ids)
        assert [d.get_document_id()
                for d in documents] == expected_document_ids
Example #8
class TestBrainDeadTokenizer(unittest.TestCase):
    def setUp(self):
        from tokenization import BrainDeadTokenizer
        self._tokenizer = BrainDeadTokenizer()

    def test_strings(self):
        result = self._tokenizer.strings("Dette  er en\nprøve!")
        self.assertListEqual(result, ["Dette", "er", "en", "prøve"])

    def test_tokens(self):
        result = self._tokenizer.tokens("Dette  er en\nprøve!")
        self.assertListEqual(result, [("Dette", (0, 5)), ("er", (7, 9)),
                                      ("en", (10, 12)), ("prøve", (13, 18))])

    def test_ranges(self):
        result = self._tokenizer.ranges("Dette  er en\nprøve!")
        self.assertListEqual(result, [(0, 5), (7, 9), (10, 12), (13, 18)])
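
A tokenizer passing these three tests can be as small as a regex scan over word characters that reports each match together with its (start, end) span; the sketch below is one such implementation and is not necessarily how BrainDeadTokenizer does it.

import re


class SketchTokenizer:
    """Splits on runs of non-word characters and reports (start, end) spans."""

    _pattern = re.compile(r"\w+", re.UNICODE)

    def tokens(self, buffer):
        return [(match.group(), (match.start(), match.end()))
                for match in self._pattern.finditer(buffer)]

    def strings(self, buffer):
        return [string for (string, _) in self.tokens(buffer)]

    def ranges(self, buffer):
        return [span for (_, span) in self.tokens(buffer)]


assert SketchTokenizer().tokens("Dette  er en\nprøve!") == [
    ("Dette", (0, 5)), ("er", (7, 9)), ("en", (10, 12)), ("prøve", (13, 18))]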
Example #9
 def test_access_nodes(self):
     from tokenization import BrainDeadTokenizer
     from ahocorasick import Trie
     tokenizer = BrainDeadTokenizer()
     strings = ["abba", "ørret", "abb", "abbab", "abbor"]
     root = Trie()
     for s in strings:
         root.add(s, tokenizer)
     self.assertFalse(root.is_final())
     self.assertIsNone(root.consume("snegle"))
     node = root.consume("ab")
     self.assertFalse(node.is_final())
     node = node.consume("b")
     self.assertTrue(node.is_final())
     self.assertEqual(node, root.consume("abb"))
Example #10
def main():
    """
    Example usage. A tiny unit test, in a sense.
    """
    tokenizer = BrainDeadTokenizer()
    strings = ["abba", "ørret", "abb", "abbab", "abbor"]
    trie = Trie()
    for s in strings:
        trie.add(s, tokenizer)
    assert trie.is_final() is False
    assert trie.consume("snegle") is None
    node = trie.consume("ab")
    assert node.is_final() is False
    node = node.consume("b")
    assert node.is_final() is True
    assert node == trie.consume("abb")
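
Both versions of this trie test rely only on add, consume, and is_final. A character-level sketch consistent with that behaviour is shown below; treating the tokenizer argument as a way to normalize whitespace between tokens is my assumption, not something the tests pin down.

class SketchTrie:
    """Character-level trie exposing the add/consume/is_final calls used above."""

    def __init__(self):
        self._children = {}
        self._final = False

    def add(self, string, tokenizer=None):
        # Assumption: the tokenizer is only used to collapse whitespace between tokens.
        tokens = tokenizer.strings(string) if tokenizer else string.split()
        node = self
        for symbol in " ".join(tokens):
            node = node._children.setdefault(symbol, SketchTrie())
        node._final = True

    def consume(self, prefix):
        node = self
        for symbol in prefix:
            node = node._children.get(symbol)
            if node is None:
                return None
        return node

    def is_final(self):
        return self._final


# The assertions from the example hold for the sketch as well.
trie = SketchTrie()
for s in ["abba", "ørret", "abb", "abbab", "abbor"]:
    trie.add(s)
assert not trie.is_final()
assert trie.consume("snegle") is None
assert not trie.consume("ab").is_final()
assert trie.consume("ab").consume("b") is trie.consume("abb")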
Example #11
def assignment_a_inverted_index_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Dump postings for a slightly bigger corpus.
    print("LOADING...")
    corpus = InMemoryCorpus("./data/mesh.txt")
    print("INDEXING...")
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    for (term, expected_length) in [("hydrogen", 8), ("hydrocephalus", 2)]:
        print(term)
        for posting in index[term]:
            print(posting)
        assert len(list(index[term])) == expected_length
Example #12
def assignment_b_stringfinder():

    # Use these throughout below.
    tokenizer = BrainDeadTokenizer()
    results = []

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in [
            "romerike", "apple computer", "norsk", "norsk ørret", "sverige",
            "ørret", "banan"
    ]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk  ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results
            ] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id,
         expected_matches) in [(0, ["wing", "wing"]),
                               (3, ["solutions", "skin", "friction"]),
                               (1254, ["electrons", "ions"])]:
        document = cranfield.get_document(document_id)
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches
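
The expected matches come out ordered by start position and then by end position, which is what you get by restarting a trie walk at every token boundary and reporting each final state reached. The helper below sketches that naive scan; it assumes only the consume/is_final trie interface and the tokens() tokenizer call used elsewhere in these examples, and is not StringFinder's actual implementation.

def sketch_scan(trie, tokenizer, buffer, callback):
    """At every token position, walk the dictionary trie over the following
    tokens (joined by single spaces) and report each final state reached."""
    tokens = tokenizer.tokens(buffer)  # [(string, (start, end)), ...]
    for i in range(len(tokens)):
        node = trie
        surface = []
        for string, _ in tokens[i:]:
            node = node.consume((" " if surface else "") + string)
            if node is None:
                break
            surface.append(string)
            if node.is_final():
                callback({"match": " ".join(surface)})


# For the buffer in the test above, this reports, in order:
# norsk, norsk ørret, ørret, romerike, sverige.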
Example #13
def assignment_c_simplesearchengine_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("../data/mesh.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    query = "polluTION Water"
    for match_threshold in [0.1, 1.0]:
        print(
            f"SEARCHING for '{query}' with match threshold {str(match_threshold)}..."
        )
        results.clear()
        options = {
            "match_threshold": match_threshold,
            "hit_count": 10,
            "debug": False
        }
        engine.evaluate(query, options, simple_ranker, match_collector)
        assert len(results) == {0.1: 10, 1.0: 3}[match_threshold]
        for (score, document_id) in [(match["score"],
                                      match["document"].document_id)
                                     for match in results[:3]]:
            assert score == 2.0  # Both 'pollution' and 'water'.
            assert document_id in [25274, 25275, 25276]
        for score in [match["score"] for match in results[3:]]:
            assert score == 1.0  # Only 'pollution' or 'water', but not both.
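
The asserted hit counts (10 at threshold 0.1 with hit_count 10, and 3 at threshold 1.0) are consistent with reading match_threshold as the fraction of query terms a document must contain, i.e. a document matching n of the m query terms is returned when n >= max(1, int(threshold * m)). That reading is inferred from the tests rather than documented; the hypothetical helper below just spells it out.

def required_term_count(match_threshold, query_term_count):
    # Hypothetical reading of the threshold, inferred from the asserted hit counts.
    return max(1, min(query_term_count, int(match_threshold * query_term_count)))


assert required_term_count(1.0, 2) == 2  # Both 'pollution' and 'water' are required.
assert required_term_count(0.1, 2) == 1  # Either term alone is enough.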
Example #14
def assignment_a_inverted_index_3():
    # Tests that multiple fields are handled correctly.

    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    doc = InMemoryDocument(document_id=0, fields={
        'felt 1': 'Dette er en test. Test, sa jeg. TEST!',
        'felt 2': 'test er det',
        'felt 3': 'test TEsT',
    })
    corpus = InMemoryCorpus()
    corpus.add_document(doc)

    index = InMemoryInvertedIndex(corpus, ['felt 1', 'felt 3'], normalizer, tokenizer)
    p = next(index.get_postings_iterator('test'))
    print(f"term-freq: {p.term_frequency} (correct is 5)")
    assert p.document_id == 0
    assert p.term_frequency == 5
Example #15
def assignment_d_betterranker():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           tokenizer)
    simple_ranker = BrainDeadRanker()
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " +
                  ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {
                simple_ranker: [9221, 7263],
                better_ranker: [24]
            }[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
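
The comment above appeals to a textbook TF-IDF scheme in which frequent terms such as stopwords contribute little. As a reference point (and not necessarily BetterRanker's exact formula), one such score for a document is the sum over query terms of tf * log(N / df), optionally boosted by a static quality component like the static_quality_score fields seen in Example #6.

import math


def tf_idf_score(term_frequencies, document_frequencies, document_count, static_quality=0.0):
    """Hypothetical textbook-style score: sum of tf * idf over the query terms,
    plus an optional query-independent quality contribution."""
    score = static_quality
    for term, tf in term_frequencies.items():
        df = document_frequencies.get(term, 0)
        if df > 0:
            score += tf * math.log(document_count / df)
    return score


# A term occurring in every document contributes nothing...
assert tf_idf_score({"the": 3}, {"the": 1000}, 1000) == 0.0
# ...while a rare term easily outweighs a frequent one.
assert tf_idf_score({"obama": 1}, {"obama": 2}, 1000) > tf_idf_score({"the": 3}, {"the": 500}, 1000)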
Example #16
File: repl.py Project: 181221/IN4120-SOEK
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    training_set = {language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt")) for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results
    simple_repl("text", evaluator)
Example #17
def assignment_b_suffixarray_1():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of  A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust to how ties are resolved.
    for (query, winner_score,
         winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {
            "debug": False,
            "hit_count": hit_count
        }, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0
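
The queries above are prefix phrase lookups scored by occurrence count. One way to get that behaviour is to concatenate each document's normalized tokens into a single buffer, record every token start offset as a suffix, sort the suffixes lexicographically, and binary-search the normalized query as a prefix. The sketch below follows that idea; it is an assumption about how a SuffixArray might be structured, not the course implementation, and its sort deliberately ignores the memory concerns tested in Example #22.

from bisect import bisect_left, bisect_right
from collections import Counter


class SketchSuffixArray:
    """Prefix phrase search over per-document token buffers, scored by occurrence counts."""

    def __init__(self, corpus, fields, normalizer, tokenizer):
        self._normalizer = normalizer
        self._tokenizer = tokenizer
        self._buffers = []   # One normalized, space-joined token buffer per document.
        self._suffixes = []  # (document index, token start offset) pairs.
        for index, document in enumerate(corpus):
            text = " ".join(document.get_field(field, "") or "" for field in fields)
            buffer = " ".join(normalizer.normalize(token)
                              for token in tokenizer.strings(normalizer.canonicalize(text)))
            self._buffers.append(buffer)
            starts = [0] + [i + 1 for i, c in enumerate(buffer) if c == " "]
            self._suffixes.extend((index, start) for start in starts if start < len(buffer))
        # Naive sort that materializes suffix strings; a memory-conscious version
        # would compare suffixes in place via their (document, offset) indices.
        self._suffixes.sort(key=lambda suffix: self._buffers[suffix[0]][suffix[1]:])

    def evaluate(self, query, options, callback):
        needle = " ".join(self._normalizer.normalize(token) for token in
                          self._tokenizer.strings(self._normalizer.canonicalize(query)))
        if not needle:
            return  # Match nothing rather than everything.
        keys = [self._buffers[d][o:o + len(needle)] for (d, o) in self._suffixes]
        first, last = bisect_left(keys, needle), bisect_right(keys, needle)
        counts = Counter(d for (d, _) in self._suffixes[first:last])
        for document_index, score in counts.most_common(options.get("hit_count", 5)):
            # A real engine would hand back the Document object, not just its index.
            callback({"score": score, "document": document_index})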
Example #18
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
Example #19
File: repl.py Project: 181221/IN4120-SOEK
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex

    print("Building inverted index from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    print("Enter one or more index terms and inspect their posting lists.")

    def evaluator(terms):
        terms = index.get_terms(terms)
        return {
            term: list(index.get_postings_iterator(term))
            for term in terms
        }

    simple_repl("terms", evaluator)
Example #20
def assignment_e_naivebayes_1():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language
Example #21
def assignment_c_simplesearchengine_2():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    ranker = BrainDeadRanker()

    # Used for comparing floating point numbers.
    epsilon = 0.0001

    # Create a dummy test corpus.
    corpus = InMemoryCorpus()
    words = (''.join(term) for term in product("bcd", "aei", "jkl"))
    texts = (' '.join(word)
             for word in combinations_with_replacement(words, 3))
    for text in texts:
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': text}))

    # What we're testing.
    engine = SimpleSearchEngine(
        corpus, InMemoryInvertedIndex(corpus, ["a"], normalizer, tokenizer))

    # Where the callback will collect the matches.
    results = []

    # Callback that collects matches.
    def collect(m):
        results.append((m['score'], m['document'].document_id))

    # Executes a query.
    def search(q, t, n):
        results.clear()
        engine.evaluate(q, {
            'match_threshold': t,
            'hit_count': n
        }, ranker, collect)

    # Sorts the collected matches.
    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    # Test predicate.
    def check_at(i, expected):
        assert results[i] == expected

    # Test predicate.
    def check_range(indices, score, document_ids):
        for i, d in zip(indices, document_ids):
            check_at(i, (score, d))

    # Test predicate.
    def check_hits(n):
        assert len(results) == n

    # Run tests!
    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj caj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + epsilon, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + epsilon, 100)
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + epsilon, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + epsilon, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - epsilon, 100)
    check_hits(100)
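
For orientation, the dummy corpus in this test holds 27 distinct three-letter words (the product of 'bcd', 'aei', 'jkl') and one document per multiset of three such words, i.e. C(27 + 3 - 1, 3) = 3654 documents, which is why document ids as large as 2531 show up in the assertions. The small check below confirms the count.

from itertools import combinations_with_replacement, product
from math import comb

words = [''.join(term) for term in product("bcd", "aei", "jkl")]
assert len(words) == 27
assert sum(1 for _ in combinations_with_replacement(words, 3)) == comb(29, 3) == 3654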
Example #22
def assignment_b_suffixarray_2():

    # For testing.
    class TestNormalizer(Normalizer):

        _table = str.maketrans({'Ø': 'O'})

        def canonicalize(self, buffer: str) -> str:
            return buffer

        def normalize(self, token: str) -> str:
            return token.upper().translate(self._table)

    # For testing.
    class TestDocument(Document):

        def __init__(self, document_id: int, a: str, b: str):
            self._document_id = document_id
            self._a = a
            self._b = b

        def get_document_id(self) -> int:
            return self._document_id

        def get_field(self, field_name: str, default: str) -> str:
            if field_name == "a":
                return self._a
            if field_name == "b":
                return self._b
            return default

    # For testing.
    class TestCorpus(Corpus):
        def __init__(self):
            self._docs = []
            self._docs.append(TestDocument(len(self._docs), "ø  o\n\n\nø\n\no", "ø o\nø   \no"))
            self._docs.append(TestDocument(len(self._docs), "ba", "b bab"))
            self._docs.append(TestDocument(len(self._docs), "ø  o Ø o", "ø o"))
            self._docs.append(TestDocument(len(self._docs), "øO" * 10000, "o"))
            self._docs.append(TestDocument(len(self._docs), "cbab o øbab Ø ", "ø o " * 10000))

        def __iter__(self):
            return iter(self._docs)

        def size(self) -> int:
            return len(self._docs)

        def get_document(self, document_id: int) -> Document:
            return self._docs[document_id]

    # Run the tests!
    for fields in [("b",), ("a", "b")]:

        # Create the suffix array over the given set of fields. Measure memory usage. If memory usage is
        # excessive, most likely the implementation is copying strings or doing other silly stuff instead
        # of working with buffer indices. The naive reference implementation is not in any way optimized,
        # and uses about 1.5 MB of memory on this corpus.
        tracemalloc.start()
        snapshot1 = tracemalloc.take_snapshot()
        engine = SuffixArray(TestCorpus(), fields, TestNormalizer(), BrainDeadTokenizer())
        snapshot2 = tracemalloc.take_snapshot()
        for statistic in snapshot2.compare_to(snapshot1, "filename"):
            if statistic.traceback[0].filename == inspect.getfile(SuffixArray):
                assert statistic.size_diff < 2000000, f"Memory usage is {statistic.size_diff}"
        tracemalloc.stop()
        results = []

        def process(m):
            results.append((m['document'].document_id, m['score']))

        expected_results = {
            ('b',): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 19999), (0, 3), (2, 1)]),
                ('o O', [(4, 19999), (0, 3), (2, 1)]),
                ('oooooo', []),
                ('o o o o', [(4, 19997), (0, 1)]),
            ),
            ('a', 'b'): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 20000), (0, 6), (2, 4)]),
                ('o O', [(4, 20000), (0, 6), (2, 4)]),
                ('oøØOøO', [(3, 1), ]),
                ('o o o o', [(4, 19997), (0, 2), (2, 1)]),
            )
        }

        for query, expected in expected_results[fields]:
            results.clear()
            engine.evaluate(query, {'hit_count': 10}, process)
            assert results == expected
Example #23
 def setUp(self):
     from tokenization import BrainDeadTokenizer
     self._tokenizer = BrainDeadTokenizer()
Example #24
def assignment_d():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    simple_tokenizer = BrainDeadTokenizer()

    # Load MeSH terms.
    print("LOADING...")
    corpus = InMemoryCorpus("data/mesh.txt")

    # Do ranked retrieval, using n-grams and a simple ranker. This allows for fuzzy retrieval.
    print("INDEXING...")
    shingle_generator = ShingleGenerator(3)
    shingle_inverted_index = InMemoryInvertedIndex(corpus, ["body"],
                                                   normalizer,
                                                   shingle_generator)
    shingle_engine = SimpleSearchEngine(corpus, shingle_inverted_index)
    simple_ranker = BrainDeadRanker()
    results = []
    hit_count = 10

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Test with some misspelled queries. Be robust to how ties are resolved.
    for (query, winner_score, winner_document_ids) in [
        ("orGAnik kEMmistry", 8.0, [16981, 16980, 4411, 4410, 4408]),
        ("synndrome", 7.0, [1275])
    ]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        shingle_engine.evaluate(query, options, simple_ranker, match_collector)
        assert 0 < len(results) <= hit_count
        assert results[0]["score"] == winner_score
        assert results[0]["document"].document_id in winner_document_ids

    # Load and index some English news sentences. Look at the output and compare the two rankers!
    # The naive ranker assigns equal weight to all words (including stopwords), whereas the improved
    # ranker does not. The test below for the improved ranker (with document #24 being the winner)
    # assumes a straightforward implementation of a TF-IDF ranking scheme as described in the
    # textbook.
    print("LOADING...")
    corpus = InMemoryCorpus("data/en.txt")
    print("INDEXING...")
    inverted_index = InMemoryInvertedIndex(corpus, ["body"], normalizer,
                                           simple_tokenizer)
    better_ranker = BetterRanker(corpus, inverted_index)
    engine = SimpleSearchEngine(corpus, inverted_index)
    for query in ["the terrorism attack and obama"]:
        options = {
            "match_threshold": 0.1,
            "hit_count": hit_count,
            "debug": False
        }
        for ranker in [simple_ranker, better_ranker]:
            print("SEARCHING for '" + query + "' using " +
                  ranker.__class__.__name__ + "...")
            results.clear()
            engine.evaluate(query, options, ranker, match_collector)
            winner_document_ids = {
                simple_ranker: [9221, 7263],
                better_ranker: [24]
            }[ranker]
            assert 0 < len(results) <= hit_count
            assert results[0]["document"].document_id in winner_document_ids
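
ShingleGenerator(3) stands in for the tokenizer here, so the index is built over character 3-grams rather than whole words, which is what makes the misspelled queries retrievable. A minimal sketch of such a generator, assuming the same strings/tokens/ranges interface as the tokenizers above, could look like this.

class SketchShingleGenerator:
    """Emits overlapping character k-grams together with their (start, end) spans."""

    def __init__(self, width):
        self._width = width

    def tokens(self, buffer):
        if len(buffer) <= self._width:
            return [(buffer, (0, len(buffer)))] if buffer else []
        return [(buffer[i:i + self._width], (i, i + self._width))
                for i in range(len(buffer) - self._width + 1)]

    def strings(self, buffer):
        return [string for (string, _) in self.tokens(buffer)]

    def ranges(self, buffer):
        return [span for (_, span) in self.tokens(buffer)]


assert SketchShingleGenerator(3).strings("banana") == ["ban", "ana", "nan", "ana"]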
Example #25
 def setUp(self):
     from normalization import BrainDeadNormalizer
     from tokenization import BrainDeadTokenizer
     self._normalizer = BrainDeadNormalizer()
     self._tokenizer = BrainDeadTokenizer()
Example #26
def assignment_e():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
Example #27
def assignment_b_suffixarray_2():

    # For testing.
    class TestNormalizer(Normalizer):

        _table = str.maketrans({'Ø': 'O'})

        def canonicalize(self, buffer: str) -> str:
            return buffer

        def normalize(self, token: str) -> str:
            return token.upper().translate(self._table)

    # For testing.
    class TestDocument(Document):
        def __init__(self, document_id: int, a: str, b: str):
            self._document_id = document_id
            self._a = a
            self._b = b

        def get_document_id(self) -> int:
            return self._document_id

        def get_field(self, field_name: str, default: str) -> str:
            if field_name == "a":
                return self._a
            if field_name == "b":
                return self._b
            return default

    # For testing.
    class TestCorpus(Corpus):
        def __init__(self):
            self._docs = []
            self._docs.append(
                TestDocument(len(self._docs), "ø  o\n\n\nø\n\no",
                             "ø o\nø   \no"))
            self._docs.append(TestDocument(len(self._docs), "ba", "b bab"))
            self._docs.append(TestDocument(len(self._docs), "ø  o Ø o", "ø o"))
            self._docs.append(TestDocument(len(self._docs), "øO" * 10000, "o"))
            self._docs.append(
                TestDocument(len(self._docs), "cbab o øbab Ø ",
                             "ø o " * 10000))

        def __iter__(self):
            return iter(self._docs)

        def size(self) -> int:
            return len(self._docs)

        def get_document(self, document_id: int) -> Document:
            return self._docs[document_id]

    # Run the tests!
    for fields in [("b", ), ("a", "b")]:

        engine = SuffixArray(TestCorpus(), fields, TestNormalizer(),
                             BrainDeadTokenizer())
        results = []

        def process(m):
            results.append((m['document'].document_id, m['score']))

        expected_results = {
            ('b', ): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 19999), (0, 3), (2, 1)]),
                ('o O', [(4, 19999), (0, 3), (2, 1)]),
                ('oooooo', []),
                ('o o o o', [(4, 19997), (0, 1)]),
            ),
            ('a', 'b'): (
                ('bab', [(1, 1)]),
                ('ø o', [(4, 20000), (0, 6), (2, 4)]),
                ('o O', [(4, 20000), (0, 6), (2, 4)]),
                ('oøØOøO', [
                    (3, 1),
                ]),
                ('o o o o', [(4, 19997), (0, 2), (2, 1)]),
            )
        }

        for query, expected in expected_results[fields]:
            results.clear()
            engine.evaluate(query, {'hit_count': 10}, process)
            assert results == expected
Example #28
def assignment_b():

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Prepare for some suffix array lookups.
    print("LOADING...")
    corpus = InMemoryCorpus("data/cran.xml")
    print("INDEXING...")
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    results = []
    hit_count = 5

    # Callback for receiving matches.
    def match_collector(match):
        results.append(match)
        print("*** WINNER", match["score"], match["document"])

    # Define the actual test queries.
    test1 = ("visc", 11, [328])  # Look for {'viscous', 'viscosity', ...}.
    test2 = ("Of  A", 10, [946])  # Test robustness for case and whitespace.
    test3 = ("", 0, [])  # Safety feature: Match nothing instead of everything.
    test4 = ("approximate solution", 3, [1374, 159])  # Multiple winners.

    # Test that the simple occurrence ranking works. Be robust to how ties are resolved.
    for (query, winner_score,
         winner_document_ids) in [test1, test2, test3, test4]:
        print("SEARCHING for '" + query + "'...")
        results.clear()
        engine.evaluate(query, {
            "debug": False,
            "hit_count": hit_count
        }, match_collector)
        if winner_document_ids:
            assert results[0]["score"] == winner_score
            assert results[0]["document"].document_id in winner_document_ids
            assert len(results) <= hit_count
        else:
            assert len(results) == 0

    # Simple test of using a trie-encoded dictionary for efficiently locating substrings in a buffer.
    trie = Trie()
    for s in [
            "romerike", "apple computer", "norsk", "norsk ørret", "sverige",
            "ørret", "banan"
    ]:
        trie.add(s, tokenizer)
    finder = StringFinder(trie, tokenizer)
    buffer = "det var en gang en norsk  ørret fra romerike som likte abba fra sverige"
    print("SCANNING...")
    results.clear()
    finder.scan(buffer, lambda m: results.append(m))
    print("Buffer \"" + buffer + "\" contains", results)
    assert [m["match"] for m in results
            ] == ["norsk", "norsk ørret", "ørret", "romerike", "sverige"]

    # Find all MeSH terms that occur verbatim in some selected Cranfield documents! Since MeSH
    # documents are medical terms and the Cranfield documents have technical content, the
    # overlap probably isn't that big.
    print("LOADING...")
    mesh = InMemoryCorpus("data/mesh.txt")
    cranfield = InMemoryCorpus("data/cran.xml")
    print("BUILDING...")
    trie = Trie()
    for d in mesh:
        trie.add(d["body"] or "", tokenizer)
    finder = StringFinder(trie, tokenizer)
    print("SCANNING...")
    for (document_id,
         expected_matches) in [(0, ["wing", "wing"]),
                               (3, ["solutions", "skin", "friction"]),
                               (1254, ["electrons", "ions"])]:
        document = cranfield[document_id]
        buffer = document["body"] or ""
        results.clear()
        finder.scan(buffer, lambda m: results.append(m))
        print("Cranfield document", document, "contains MeSH terms", results)
        assert [m["match"] for m in results] == expected_matches
Example #29
def assignment_c_simplesearchengine_3():

    # All accesses to posting lists are logged here.
    accesses = []

    # For testing.
    class AccessLoggedIterator(Iterator[Posting]):
        def __init__(self, term: str, wrapped: Iterator[Posting]):
            self._term = term
            self._wrapped = wrapped

        def __next__(self):
            posting = next(self._wrapped)
            accesses.append((self._term, posting.document_id))
            return posting

    # For testing.
    class AccessLoggedInvertedIndex(InvertedIndex):
        def __init__(self, wrapped: InvertedIndex):
            self._wrapped = wrapped

        def get_terms(self, buffer: str) -> Iterator[str]:
            return self._wrapped.get_terms(buffer)

        def get_postings_iterator(self, term: str) -> Iterator[Posting]:
            return AccessLoggedIterator(
                term, self._wrapped.get_postings_iterator(term))

        def get_document_frequency(self, term: str) -> int:
            return self._wrapped.get_document_frequency(term)

    # Use these throughout below.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()

    # Load and index MeSH terms.
    corpus = InMemoryCorpus("data/mesh.txt")
    inverted_index = AccessLoggedInvertedIndex(
        InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer))

    # Do ranked retrieval, using a simple ranker.
    engine = SimpleSearchEngine(corpus, inverted_index)
    simple_ranker = BrainDeadRanker()
    query = "Water  polluTION"
    options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
    engine.evaluate(query, options, simple_ranker, lambda m: m)

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "water pollution".
    ordering1 = [('water', 3078), ('pollution', 788), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('water', 25275),
                 ('pollution', 25275), ('water', 25276), ('pollution', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Expected posting list traversal ordering if the implementation chooses to evaluate this as "pollution water".
    ordering2 = [('pollution', 788), ('water', 3078), ('pollution', 789),
                 ('pollution', 790), ('pollution', 8079), ('water', 8635),
                 ('pollution', 23837), ('water', 9379), ('water', 23234),
                 ('water', 25265), ('pollution', 25274), ('water', 25266),
                 ('water', 25267), ('water', 25268), ('water', 25269),
                 ('water', 25270), ('water', 25271), ('water', 25272),
                 ('water', 25273), ('water', 25274), ('pollution', 25275),
                 ('water', 25275), ('pollution', 25276), ('water', 25276),
                 ('water', 25277), ('water', 25278), ('water', 25279),
                 ('water', 25280), ('water', 25281)]

    # Check that the posting lists have been accessed in a way that's consistent with document-at-a-time traversal.
    # Be somewhat robust to implementation details. This is a fairly strict test, and advanced (but valid)
    # implementations that for some reason do lookaheads or whatever might fail.
    assert accesses == ordering1 or accesses == ordering2
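
The two accepted orderings encode a document-at-a-time strategy: the query terms' posting iterators are advanced together in document-id order, and a document is handed to the ranker once every iterator has moved past it. A generic sketch of that traversal (not the engine's actual code) follows.

from typing import NamedTuple


class Posting(NamedTuple):
    document_id: int
    term_frequency: int


def document_at_a_time(posting_iterators, required):
    """Yields (document_id, number of matching terms) for documents matched by at
    least `required` of the document-ordered posting iterators, advancing all
    cursors in lockstep instead of materializing any posting list."""
    cursors = [next(iterator, None) for iterator in posting_iterators]
    while any(cursor is not None for cursor in cursors):
        smallest = min(cursor.document_id for cursor in cursors if cursor is not None)
        matches = sum(1 for cursor in cursors
                      if cursor is not None and cursor.document_id == smallest)
        if matches >= required:
            yield smallest, matches
        for i, cursor in enumerate(cursors):
            if cursor is not None and cursor.document_id == smallest:
                cursors[i] = next(posting_iterators[i], None)


# Tiny usage example with made-up postings for two terms.
water = iter([Posting(3078, 1), Posting(8635, 1), Posting(25274, 1)])
pollution = iter([Posting(788, 1), Posting(25274, 1)])
assert list(document_at_a_time([water, pollution], required=1)) == [
    (788, 1), (3078, 1), (8635, 1), (25274, 2)]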
Example #30
File: testc2.py Project: havarf/s-kemotor
def test_simple_search_engine():
    from itertools import product, combinations_with_replacement
    from tokenization import BrainDeadTokenizer
    from normalization import BrainDeadNormalizer
    from corpus import InMemoryCorpus, InMemoryDocument
    from invertedindex import InMemoryInvertedIndex
    from searchengine import SimpleSearchEngine
    from ranking import BrainDeadRanker

    Ɛ = 0.0001
    corpus = InMemoryCorpus()

    for txt in (' '.join(w) for w in combinations_with_replacement(
            list(''.join(t) for t in product(
                'bcd',
                'aei',
                'jkl',
            )), 3)):
        corpus.add_document(InMemoryDocument(corpus.size(), {'a': txt}))

    engine = SimpleSearchEngine(
        corpus,
        InMemoryInvertedIndex(corpus, ('a', ), BrainDeadNormalizer(),
                              BrainDeadTokenizer()))

    results = []

    def search(q, r, n):
        results.clear()

        def match(m):
            results.append((m['score'], m['document'].document_id))

        print('searching "' + q + '" at threshold', r, '…')
        engine.evaluate(q, {
            'recall_threshold': r,
            'hit_count': n
        }, BrainDeadRanker(), match)

    def sort_results():
        results.sort(key=lambda e: e[1])
        results.sort(key=lambda e: e[0], reverse=True)

    def check_at(i, expected):
        if results[i] != expected:
            print('FAILED, EXPECTED ', expected, ' RESULT', i, ' was',
                  results[i])

    def check_range(indices, score, docrange):
        for i, d in zip(indices, docrange):
            check_at(i, (score, d))

    def check_hits(n):
        if len(results) != n:
            print('FAILED, expected', n, 'results, got', len(results))

    search('baj BAJ    baj', 1.0, 27)
    check_hits(27)
    check_at(0, (9.0, 0))
    sort_results()
    check_range(range(1, 27), 6.0, range(1, 27))
    search('baj CAj', 1.0, 100)
    check_hits(27)
    search('baj caj daj', 2 / 3 + Ɛ, 100)
    check_hits(79)
    search('baj caj', 2 / 3 + Ɛ, 100)  # here
    check_hits(100)
    sort_results()
    check_at(0, (3.0, 0))
    check_range(range(4, 12), 2.0, range(1, 9))
    check_range(range(12, 29), 2.0, range(10, 27))
    check_at(29, (2.0, 35))
    check_at(78, (2.0, 2531))
    search('baj cek dil', 1.0, 10)
    check_hits(1)
    check_at(0, (3.0, 286))
    search('baj cek dil', 2 / 3 + Ɛ, 80)
    check_hits(79)
    sort_results()
    check_at(0, (3.0, 13))
    check_at(1, (3.0, 26))
    check_at(2, (3.0, 273))
    search('baj xxx yyy', 2 / 3 + Ɛ, 100)
    check_hits(0)
    search('baj xxx yyy', 2 / 3 - Ɛ, 100)
    check_hits(100)