Python InMemoryCorpus примеры, context.in3120.InMemoryCorpus Python примеры использования

Пример #1

0

Показать файл

 def test_load_from_file(self):
     """Load each data file into an InMemoryCorpus and check its size."""
     # (path, expected corpus size), checked in this order.
     expected_sizes = (
         ("../data/mesh.txt", 25588),
         ("../data/cran.xml", 1400),
         ("../data/docs.json", 13),
         ("../data/imdb.csv", 1000),
     )
     for filename, document_count in expected_sizes:
         corpus = in3120.InMemoryCorpus(filename)
         self.assertEqual(corpus.size(), document_count)

Пример #2

0

Показать файл

 def test_mesh_terms_in_cran_corpus(self):
     """Scan selected Cranfield bodies with a trie built from the MeSH bodies."""
     mesh = in3120.InMemoryCorpus("../data/mesh.txt")
     cran = in3120.InMemoryCorpus("../data/cran.xml")
     trie = in3120.Trie()
     # A falsy body is replaced by the empty string before it is added.
     mesh_bodies = (document["body"] or "" for document in mesh)
     trie.add(mesh_bodies, self.__tokenizer)
     finder = in3120.StringFinder(trie, self.__tokenizer)
     # (Cranfield document index, list handed to the verification helper).
     expectations = (
         (0, ["wing", "wing"]),
         (3, ["solutions", "skin", "friction"]),
         (1254, ["electrons", "ions"]),
     )
     for document_index, expected_matches in expectations:
         self.__scan_buffer_verify_matches(
             finder, cran[document_index]["body"], expected_matches)

Пример #3

0

Показать файл

 def setUp(self):
     """Build an eight-document title corpus and a BetterRanker over its index."""
     normalizer = in3120.BrainDeadNormalizer()
     tokenizer = in3120.BrainDeadTokenizer()
     corpus = in3120.InMemoryCorpus()
     # Field dictionaries in document-id order; only the first three
     # documents carry a "static_quality_score".
     fields_by_id = [
         {"title": "the foo", "static_quality_score": 0.9},
         {"title": "the foo", "static_quality_score": 0.2},
         {"title": "the foo foo", "static_quality_score": 0.2},
         {"title": "the bar"},
         {"title": "the bar bar"},
         {"title": "the baz"},
         {"title": "the baz"},
         {"title": "the baz baz"},
     ]
     for document_id, fields in enumerate(fields_by_id):
         corpus.add_document(in3120.InMemoryDocument(document_id, fields))
     index = in3120.InMemoryInvertedIndex(
         corpus, ["title"], normalizer, tokenizer)
     self.__ranker = in3120.BetterRanker(corpus, index)

Пример #4

0

Показать файл

 def test_multiple_fields(self):
     """Query suffix arrays built over both fields and over each field alone."""
     corpus = in3120.InMemoryCorpus()
     # (field1, field2) values in document-id order.
     field_values = [("a b c", "b c d"), ("x", "y"), ("y", "z")]
     for document_id, (first, second) in enumerate(field_values):
         corpus.add_document(
             in3120.InMemoryDocument(document_id, {
                 "field1": first,
                 "field2": second
             }))

     def build_engine(fields):
         # All three engines share the corpus and the fixture's text pipeline.
         return in3120.SuffixArray(corpus, fields, self.__normalizer,
                                   self.__tokenizer)

     both_fields = build_engine(["field1", "field2"])
     first_only = build_engine(["field1"])
     second_only = build_engine(["field2"])
     verify = self.__process_query_and_verify_winner
     verify(both_fields, "b c", [0], 2)
     verify(both_fields, "y", [1, 2], 1)
     verify(first_only, "x", [1], 1)
     verify(first_only, "y", [2], 1)
     verify(first_only, "z", [], None)
     verify(second_only, "z", [2], 1)

Пример #5

0

Показать файл

 def test_mesh_corpus(self):
     """Index the MeSH bodies and check posting-list lengths for two terms."""
     corpus = in3120.InMemoryCorpus("../data/mesh.txt")
     index = in3120.InMemoryInvertedIndex(
         corpus, ["body"], self.__normalizer, self.__tokenizer)
     for term, expected_length in (("hydrogen", 8), ("hydrocephalus", 2)):
         postings = list(index[term])
         self.assertEqual(len(postings), expected_length)

Пример #6

0

Показать файл

def repl_a():
    """Index the Cranfield bodies and start a REPL that shows posting lists."""
    print("Building inverted index from Cranfield corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("cran.xml"))
    index = in3120.InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)

    def postings_by_term(buffer):
        # Map every term the index extracts from the input to its
        # materialized posting list.
        return {
            term: list(index.get_postings_iterator(term))
            for term in index.get_terms(buffer)
        }

    print("Enter one or more index terms and inspect their posting lists.")
    simple_repl("terms", postings_by_term)

Пример #7

0

Показать файл

 def test_uses_yield(self):
     """Check that evaluating a suffix-array query returns a generator."""
     import types
     corpus = in3120.InMemoryCorpus()
     document = in3120.InMemoryDocument(0, {"a": "the foo bar"})
     corpus.add_document(document)
     engine = in3120.SuffixArray(
         corpus, ["a"], self.__normalizer, self.__tokenizer)
     result = engine.evaluate("foo", {})
     self.assertIsInstance(
         result, types.GeneratorType, "Are you using yield?")

Пример #8

0

Показать файл

 def test_cran_corpus(self):
     """Run several queries against a suffix array over the Cranfield bodies."""
     corpus = in3120.InMemoryCorpus("../data/cran.xml")
     engine = in3120.SuffixArray(
         corpus, ["body"], self.__normalizer, self.__tokenizer)
     # Each row is forwarded verbatim to the verification helper; the last
     # two values are presumably the expected winners and their score --
     # the helper is not visible here, so confirm against it.
     cases = (
         ("visc", [328], 11),
         ("Of  A", [946], 10),
         ("", [], None),
         ("approximate solution", [159, 1374], 3),
     )
     for query, winners, score in cases:
         self.__process_query_and_verify_winner(engine, query, winners, score)

Пример #9

0

Показать файл

def repl_e():
    """Train a naive Bayes language classifier and start a classification REPL.

    One corpus per language code is loaded from "<code>.txt" via data_path,
    and the classifier is trained on the "body" field of those corpora.
    """
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    training_set = {
        language: in3120.InMemoryCorpus(data_path(f"{language}.txt"))
        for language in languages
    }
    classifier = in3120.NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    # Plain literal: this message has no placeholders, so it needs no f-prefix.
    print("Returned scores are log-probabilities.")
    simple_repl("text", lambda t: list(classifier.classify(t)))

Пример #10

0

Показать файл

def repl_b_2():
    """Build a trie from the MeSH bodies and start a REPL that scans text with it."""
    print("Building trie from MeSH corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("mesh.txt"))

    def preprocess(text):
        # Dictionary entries and user input go through the same pipeline:
        # canonicalize first, then normalize.
        return normalizer.normalize(normalizer.canonicalize(text))

    dictionary = in3120.Trie()
    dictionary.add((preprocess(document["body"]) for document in corpus), tokenizer)
    engine = in3120.StringFinder(dictionary, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")
    simple_repl("text", lambda text: list(engine.scan(preprocess(text))))

Пример #11

0

Показать файл

 def test_uses_yield(self):
     """Check that classifying a buffer returns a generator."""
     import types
     corpus = in3120.InMemoryCorpus()
     document = in3120.InMemoryDocument(0, {"a": "the foo bar"})
     corpus.add_document(document)
     # Both categories deliberately share the same corpus object.
     training_set = dict.fromkeys(["x", "y"], corpus)
     classifier = in3120.NaiveBayesClassifier(
         training_set, ["a"], self.__normalizer, self.__tokenizer)
     result = classifier.classify("urg foo the gog")
     self.assertIsInstance(
         result, types.GeneratorType, "Are you using yield?")

Пример #12

0

Показать файл

def repl_b_1():
    """Build a suffix array over the Cranfield bodies and start a query REPL."""
    print("Building suffix array from Cranfield corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("cran.xml"))
    engine = in3120.SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}

    def run_query(query):
        # Materialize the results so the REPL receives a plain list.
        return list(engine.evaluate(query, options))

    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")
    simple_repl("query", run_query)

Пример #13

0

Показать файл

 def test_uses_yield(self):
     """Check that evaluating a search-engine query returns a generator."""
     import types
     corpus = in3120.InMemoryCorpus()
     document = in3120.InMemoryDocument(0, {"a": "foo bar"})
     corpus.add_document(document)
     index = in3120.InMemoryInvertedIndex(
         corpus, ["a"], self.__normalizer, self.__tokenizer)
     engine = in3120.SimpleSearchEngine(corpus, index)
     ranker = in3120.BrainDeadRanker()
     result = engine.evaluate("foo", {}, ranker)
     self.assertIsInstance(
         result, types.GeneratorType, "Are you using yield?")

Пример #14

0

Показать файл

 def test_mesh_corpus(self):
     """Run an intersection and a union two-term query over the MeSH index."""
     normalizer = in3120.BrainDeadNormalizer()
     tokenizer = in3120.BrainDeadTokenizer()
     corpus = in3120.InMemoryCorpus("../data/mesh.txt")
     index = in3120.InMemoryInvertedIndex(
         corpus, ["body"], normalizer, tokenizer)
     intersection_expected = [11316, 11319, 11320, 11321]
     self.__process_query_with_two_terms(
         corpus, index, "HIV  pROtein", self.__merger.intersection,
         intersection_expected)
     # Scattered ids followed by the contiguous run 25265..25281.
     union_expected = [3078, 8138, 8635, 9379, 14472, 18572, 23234, 23985]
     union_expected.extend(range(25265, 25282))
     self.__process_query_with_two_terms(
         corpus, index, "water Toxic", self.__merger.union, union_expected)

Пример #15

0

Показать файл

def repl_d_2():
    """Index the English news corpus and start a ranked-retrieval REPL."""
    print("Indexing English news corpus...")
    normalizer = in3120.BrainDeadNormalizer()
    tokenizer = in3120.BrainDeadTokenizer()
    corpus = in3120.InMemoryCorpus(data_path("en.txt"))
    index = in3120.InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = in3120.BetterRanker(corpus, index)
    engine = in3120.SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}

    def run_query(query):
        # Materialize the ranked results so the REPL receives a plain list.
        return list(engine.evaluate(query, options, ranker))

    tokenizer_name = tokenizer.__class__.__name__
    ranker_name = ranker.__class__.__name__
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer_name}.")
    print(f"Ranker is {ranker_name}.")
    simple_repl("query", run_query)

Пример #16

0

Показать файл

 def test_index_shingled_mesh_corpus(self):
     """Query a MeSH index built with a ShingleGenerator(3) tokenizer."""
     tokenizer = in3120.ShingleGenerator(3)
     corpus = in3120.InMemoryCorpus("../data/mesh.txt")
     index = in3120.InMemoryInvertedIndex(
         corpus, ["body"], self.__normalizer, tokenizer)
     engine = in3120.SimpleSearchEngine(corpus, index)
     # (query, expectation tuple handed to the verification helper).
     cases = (
         ("orGAnik kEMmistry", (10, 8.0, [4408, 4410, 4411, 16980, 16981])),
         ("synndrome", (10, 7.0, [1275])),
     )
     for query, expected in cases:
         # A fresh options dict per query, as each call gets its own literal.
         options = {"match_threshold": 0.1, "hit_count": 10}
         self.__process_query_verify_matches(query, engine, options, expected)

Пример #17

0

Показать файл

 def test_china_example_from_textbook(self):
     """Classify a buffer with a two-category classifier and check both scores."""
     import math
     china = in3120.InMemoryCorpus()
     china_bodies = [
         "Chinese Beijing Chinese",
         "Chinese Chinese Shanghai",
         "Chinese Macao",
     ]
     for document_id, body in enumerate(china_bodies):
         china.add_document(
             in3120.InMemoryDocument(document_id, {"body": body}))
     not_china = in3120.InMemoryCorpus()
     not_china.add_document(
         in3120.InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
     training_set = {"china": china, "not china": not_china}
     classifier = in3120.NaiveBayesClassifier(
         training_set, ["body"], self.__normalizer, self.__tokenizer)
     results = list(
         classifier.classify("Chinese Chinese Chinese Tokyo Japan"))
     self.assertEqual(len(results), 2)
     # Scores are exponentiated before comparison, to four decimal places.
     expected = (("china", 0.0003), ("not china", 0.0001))
     for result, (category, probability) in zip(results, expected):
         self.assertEqual(result["category"], category)
         self.assertAlmostEqual(math.exp(result["score"]), probability, 4)

Пример #18

0

Показать файл

 def test_synthetic_corpus(self):
     """Run threshold and hit-count queries against a generated corpus.

     Words are the three-letter products of "bcd", "aei" and "jkl"; each
     document is one combination-with-replacement of three such words.
     """
     from itertools import product, combinations_with_replacement
     corpus = in3120.InMemoryCorpus()
     words = ("".join(letters) for letters in product("bcd", "aei", "jkl"))
     texts = (" ".join(triple)
              for triple in combinations_with_replacement(words, 3))
     for text in texts:
         # The next document id is the current corpus size.
         corpus.add_document(
             in3120.InMemoryDocument(corpus.size(), {"a": text}))
     index = in3120.InMemoryInvertedIndex(
         corpus, ["a"], self.__normalizer, self.__tokenizer)
     engine = in3120.SimpleSearchEngine(corpus, index)
     epsilon = 0.0001
     just_above = 2 / 3 + epsilon
     just_below = 2 / 3 - epsilon
     # (query, match_threshold, hit_count, expectation tuple for the helper).
     cases = [
         ("baj BAJ    baj", 1.0, 27, (27, 9.0, [0])),
         ("baj caj", 1.0, 100, (27, None, None)),
         ("baj caj daj", just_above, 100, (79, None, None)),
         ("baj caj", just_above, 100, (100, 3.0, [0, 9, 207, 2514])),
         ("baj cek dil", 1.0, 10, (1, 3.0, [286])),
         ("baj cek dil", 1.0, 10, (1, None, None)),
         ("baj cek dil", just_above, 80,
          (79, 3.0, [13, 26, 273, 286, 377, 3107, 3198])),
         ("baj xxx yyy", just_above, 100, (0, None, None)),
         ("baj xxx yyy", just_below, 100, (100, None, None)),
     ]
     for query, threshold, hit_count, expected in cases:
         options = {"match_threshold": threshold, "hit_count": hit_count}
         self.__process_query_verify_matches(query, engine, options, expected)

Пример #19

0

Показать файл

 def test_mesh_corpus(self):
     """Run one two-term query over the MeSH index at two match thresholds."""
     corpus = in3120.InMemoryCorpus("../data/mesh.txt")
     index = in3120.InMemoryInvertedIndex(
         corpus, ["body"], self.__normalizer, self.__tokenizer)
     engine = in3120.SimpleSearchEngine(corpus, index)
     query = "polluTION Water"
     # (match_threshold, expectation tuple handed to the verification helper).
     cases = (
         (0.1, (10, [25274, 25275, 25276])),
         (1.0, (3, [25274, 25275, 25276])),
     )
     for threshold, expected in cases:
         options = {"match_threshold": threshold, "hit_count": 10}
         self.__process_two_term_query_verify_matches(
             query, engine, options, expected)

Пример #20

0

Показать файл

 def test_multiple_fields(self):
     """Check the term frequency of 'test' when only two of three fields are indexed."""
     fields = {
         'felt1': 'Dette er en test. Test, sa jeg. TEST!',
         'felt2': 'test er det',
         'felt3': 'test TEsT',
     }
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(in3120.InMemoryDocument(0, fields))
     # 'felt2' is left out of the index on purpose.
     indexed_fields = ['felt1', 'felt3']
     index = in3120.InMemoryInvertedIndex(
         corpus, indexed_fields, self.__normalizer, self.__tokenizer)
     postings = index.get_postings_iterator('test')
     first_posting = next(postings)
     self.assertEqual(first_posting.document_id, 0)
     self.assertEqual(first_posting.term_frequency, 5)

Пример #21

0

Показать файл

 def test_access_documents(self):
     """Check corpus size and the three ways of reaching its documents."""
     corpus = in3120.InMemoryCorpus()
     all_fields = [
         {"body": "this is a Test"},
         {"title": "prØve", "body": "en to tre"},
     ]
     for document_id, fields in enumerate(all_fields):
         corpus.add_document(in3120.InMemoryDocument(document_id, fields))
     self.assertEqual(corpus.size(), 2)
     expected_ids = [0, 1]
     # Iteration over the corpus.
     iterated = [document.document_id for document in corpus]
     self.assertListEqual(iterated, expected_ids)
     # Subscript access.
     subscripted = [corpus[i].document_id for i in range(corpus.size())]
     self.assertListEqual(subscripted, expected_ids)
     # Explicit get_document lookup.
     fetched = [
         corpus.get_document(i).document_id for i in range(corpus.size())
     ]
     self.assertListEqual(fetched, expected_ids)

Пример #22

0

Показать файл

 def test_language_detection_trained_on_some_news_corpora(self):
     """Train on four per-language corpora and classify one sentence per language."""
     language_codes = ["en", "no", "da", "de"]
     training_set = {
         code: in3120.InMemoryCorpus(f"../data/{code}.txt")
         for code in language_codes
     }
     classifier = in3120.NaiveBayesClassifier(
         training_set, ["body"], self.__normalizer, self.__tokenizer)
     # (buffer to classify, category list handed to the verification helper).
     samples = (
         ("Vil det riktige språket identifiseres? Dette er bokmål.", ["no"]),
         ("I don't believe that the number of tokens exceeds a billion.",
          ["en"]),
         ("De danske drenge drikker snaps!", ["da"]),
         ("Der Kriminalpolizei! Haben sie angst?", ["de"]),
     )
     for buffer, top_categories in samples:
         self.__classify_buffer_and_verify_top_categories(
             buffer, classifier, top_categories)

Пример #23

0

Показать файл

 def test_access_postings(self):
     """Check term extraction, posting lists and document frequencies."""
     corpus = in3120.InMemoryCorpus()
     bodies = ["this is a Test", "test TEST prØve"]
     for document_id, body in enumerate(bodies):
         corpus.add_document(
             in3120.InMemoryDocument(document_id, {"body": body}))
     index = in3120.InMemoryInvertedIndex(
         corpus, ["body"], self.__normalizer, self.__tokenizer)

     def as_pairs(postings):
         # Reduce postings to (document_id, term_frequency) for comparison.
         return [(p.document_id, p.term_frequency) for p in postings]

     self.assertListEqual(
         list(index.get_terms("PRøvE wtf tesT")), ["prøve", "wtf", "test"])
     self.assertListEqual(as_pairs(index["prøve"]), [(1, 1)])
     self.assertListEqual(as_pairs(index.get_postings_iterator("wtf")), [])
     self.assertListEqual(as_pairs(index["test"]), [(0, 1), (1, 2)])
     for term, expected_frequency in (("wtf", 0), ("prøve", 1), ("test", 2)):
         self.assertEqual(
             index.get_document_frequency(term), expected_frequency)

Пример #24

0

Показать файл

 def test_memory_usage(self):
     """Check that building a SuffixArray does not allocate excessively.

     Two tracemalloc snapshots are taken around the construction of the
     engine. Every per-file statistic attributed to the source file that
     defines in3120.SuffixArray must show a size difference of at most
     2,000,000 bytes.
     """
     import tracemalloc
     import inspect
     corpus = in3120.InMemoryCorpus()
     corpus.add_document(
         in3120.InMemoryDocument(0, {
             "a": "o  o\n\n\no\n\no",
             "b": "o o\no   \no"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(1, {
             "a": "ba",
             "b": "b bab"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(2, {
             "a": "o  o O o",
             "b": "o o"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(3, {
             "a": "oO" * 10000,
             "b": "o"
         }))
     corpus.add_document(
         in3120.InMemoryDocument(4, {
             "a": "cbab o obab O ",
             "b": "o o " * 10000
         }))
     tracemalloc.start()
     try:
         snapshot1 = tracemalloc.take_snapshot()
         engine = in3120.SuffixArray(corpus, ["a", "b"], self.__normalizer,
                                     self.__tokenizer)
         self.assertIsNotNone(engine)
         # Taken while the engine is still referenced, so its allocations
         # are part of the second snapshot.
         snapshot2 = tracemalloc.take_snapshot()
     finally:
         # Switch tracing off even when construction or the assertion
         # fails; otherwise tracing would stay enabled for later tests.
         tracemalloc.stop()
     # Resolve the defining file once instead of once per statistic.
     suffix_array_file = inspect.getfile(in3120.SuffixArray)
     for statistic in snapshot2.compare_to(snapshot1, "filename"):
         if statistic.traceback[0].filename == suffix_array_file:
             self.assertLessEqual(statistic.size_diff, 2000000,
                                  "Memory usage seems excessive.")

Пример #25

0

Показать файл

    def test_document_at_a_time_traversal_mesh_corpus(self):
        """Check which documents and postings a two-term query touches.

        The corpus and the inverted index are wrapped in logging proxies.
        After one query the test checks that only document 25274 was fetched
        from the corpus and that the postings were consumed in one of two
        accepted orders.
        """
        from typing import Iterator, List, Tuple, Set

        class AccessLoggedCorpus(in3120.Corpus):
            """Corpus proxy that records every id passed to get_document."""

            def __init__(self, wrapped: in3120.Corpus):
                self.__wrapped = wrapped
                self.__accesses = set()

            def __iter__(self):
                return iter(self.__wrapped)

            def size(self) -> int:
                return self.__wrapped.size()

            def get_document(self, document_id: int) -> in3120.Document:
                self.__accesses.add(document_id)
                return self.__wrapped.get_document(document_id)

            def get_history(self) -> Set[int]:
                return self.__accesses

        class AccessLoggedIterator(Iterator[in3120.Posting]):
            """Posting iterator that appends (term, document_id) per posting."""

            def __init__(self, term: str, accesses: List[Tuple[str, int]],
                         wrapped: Iterator[in3120.Posting]):
                self.__term = term
                self.__accesses = accesses
                self.__wrapped = wrapped

            def __next__(self):
                posting = next(self.__wrapped)
                self.__accesses.append((self.__term, posting.document_id))
                return posting

        class AccessLoggedInvertedIndex(in3120.InvertedIndex):
            """Index proxy whose posting iterators share a single access log."""

            def __init__(self, wrapped: in3120.InvertedIndex):
                self.__wrapped = wrapped
                self.__accesses = []

            def get_terms(self, buffer: str) -> Iterator[str]:
                return self.__wrapped.get_terms(buffer)

            def get_postings_iterator(self,
                                      term: str) -> Iterator[in3120.Posting]:
                inner = self.__wrapped.get_postings_iterator(term)
                return AccessLoggedIterator(term, self.__accesses, inner)

            def get_document_frequency(self, term: str) -> int:
                return self.__wrapped.get_document_frequency(term)

            def get_history(self) -> List[Tuple[str, int]]:
                return self.__accesses

        plain_corpus = in3120.InMemoryCorpus("../data/mesh.txt")
        logged_corpus = AccessLoggedCorpus(plain_corpus)
        # The index is built over the plain corpus; only the engine sees the
        # logging wrappers.
        logged_index = AccessLoggedInvertedIndex(
            in3120.InMemoryInvertedIndex(plain_corpus, ["body"],
                                         self.__normalizer, self.__tokenizer))
        engine = in3120.SimpleSearchEngine(logged_corpus, logged_index)
        ranker = in3120.BrainDeadRanker()
        query = "Water  polluTION"
        options = {"match_threshold": 0.5, "hit_count": 1, "debug": False}
        matches = list(engine.evaluate(query, options, ranker))
        self.assertIsNotNone(matches)

        # Only the document in the result set should be accessed.
        fetched_ids = logged_corpus.get_history()
        self.assertListEqual(list(fetched_ids), [25274])

        # The two accepted orderings differ only in their first two entries
        # and in the relative order of the entries for documents 25275 and
        # 25276; everything else is shared.
        shared_run = [
            ('pollution', 789),
            ('pollution', 790),
            ('pollution', 8079),
            ('water', 8635),
            ('pollution', 23837),
            ('water', 9379),
            ('water', 23234),
            ('water', 25265),
            ('pollution', 25274),
            ('water', 25266),
            ('water', 25267),
            ('water', 25268),
            ('water', 25269),
            ('water', 25270),
            ('water', 25271),
            ('water', 25272),
            ('water', 25273),
            ('water', 25274),
        ]
        shared_tail = [
            ('water', 25277),
            ('water', 25278),
            ('water', 25279),
            ('water', 25280),
            ('water', 25281),
        ]
        # Document-at-a-time ordering if evaluated as "water pollution".
        ordering1 = [
            ('water', 3078),
            ('pollution', 788),
        ] + shared_run + [
            ('water', 25275),
            ('pollution', 25275),
            ('water', 25276),
            ('pollution', 25276),
        ] + shared_tail
        # Document-at-a-time ordering if evaluated as "pollution water".
        ordering2 = [
            ('pollution', 788),
            ('water', 3078),
        ] + shared_run + [
            ('pollution', 25275),
            ('water', 25275),
            ('pollution', 25276),
            ('water', 25276),
        ] + shared_tail

        posting_history = logged_index.get_history()
        # Strict. Advanced implementations might fail.
        follows_known_ordering = (posting_history == ordering1
                                  or posting_history == ordering2)
        self.assertTrue(follows_known_ordering)

Python InMemoryCorpus примеры использования