Пример #1
0
 def test_init(self):
     """Check the word counts a bag computes when built from plain text."""
     # Three different words: each one is expected to be counted once.
     distinct = BagOfWords("cat dog cow")
     self.assertDictEqual(distinct.values, {"cat": 1, "dog": 1, "cow": 1})

     # "Cat" and "cat" are expected to be tallied under the same "cat" key.
     repeated = BagOfWords("Cat dog cat")
     self.assertDictEqual(repeated.values, {"cat": 2, "dog": 1})
Пример #2
0
 def test_init_with_symbols_in_str(self):
     """Check that punctuation and stray spaces stay out of the counts."""
     # Leading space, plus ",", "!" and "." attached to the words.
     punctuated = BagOfWords(" cat, dog! cow.")
     self.assertDictEqual(punctuated.values,
                          {"cat": 1, "dog": 1, "cow": 1})

     # Repeated marks and a trailing space; the text is passed by keyword.
     noisy = BagOfWords(text="cat dog?? cat!!! ")
     self.assertDictEqual(noisy.values, {"cat": 2, "dog": 1})
Пример #3
0
 def test_union(self):
     """Check that the union of two bags adds up the counts of each word."""
     left = BagOfWords("cat dog cow fish cat cat fish")
     right = BagOfWords("dog grape banana peach")
     merged = left.union(right)

     # "dog" occurs once in each input, so the union is expected to hold 2.
     expected = {
         "banana": 1, "cat": 3, "cow": 1, "dog": 2,
         "fish": 2, "grape": 1, "peach": 1,
     }
     self.assertDictEqual(merged.values, expected)
Пример #4
0
 def test_document_len(self):
     """Check ``document_len`` on plain, intersected and united bags."""
     first = BagOfWords("cat dog cow fish cat cat fish")
     second = BagOfWords("dog grape banana peach")

     # Expected values match the total number of words, repeats included.
     self.assertEqual(first.document_len(), 7)
     self.assertEqual(second.document_len(), 4)

     shared = first.intersection(second)
     self.assertEqual(shared.document_len(), 1)

     combined = first.union(second)
     self.assertEqual(combined.document_len(), 11)
Пример #5
0
 def test_str(self):
     """Check that ``str()`` of a bag looks like a dict of word counts."""
     rendered = str(BagOfWords("cat dog cow"))

     self.assertTrue(rendered.startswith("{"))
     # Each pair is looked up on its own, so entry order is not tested.
     for pair in ("'cat': 1", "'dog': 1", "'cow': 1"):
         self.assertIn(pair, rendered)
     self.assertTrue(rendered.endswith("}"))
Пример #6
0
 def setUp(self):
     """Build a fresh ``Indexer`` holding every text in ``self.texts``."""
     self.indexer = Indexer()
     for raw in self.texts:
         # Stemming and stop-word filtering are switched off for these bags.
         document = BagOfWords(raw.strip(),
                               enable_stemming=False,
                               filter_stopwords=False)
         self.indexer.index(document)
Пример #7
0
 def test_from_values_dict(self):
     """Check that ``from_values_dict`` exposes the given counts as values."""
     cases = (
         {"cat": 1, "dog": 1, "cow": 1},
         {"cat": 2, "dog": 1},
     )
     for counts in cases:
         # Compare against an independent copy taken before the call, not
         # against the very object handed to the constructor.
         expected = dict(counts)
         bag = BagOfWords.from_values_dict(counts)
         self.assertDictEqual(bag.values, expected)
Пример #8
0
 def test_to_dict(self):
     """Check that ``to_dict`` reports the source text and the word counts."""
     exported = BagOfWords(" cat, dog! cow.").to_dict()

     # The text is expected back verbatim, punctuation and spaces included.
     expected = {
         "text": " cat, dog! cow.",
         "values": {"cat": 1, "dog": 1, "cow": 1},
     }
     self.assertDictEqual(exported, expected)
Пример #9
0
    def test_search_5(self):
        """Check that searching for "foo bar" yields no results."""
        query = BagOfWords("foo bar",
                           enable_stemming=False,
                           filter_stopwords=False)

        # Round the scores, then order the hits from best to worst score.
        hits = [(text, round(score, 5))
                for text, score in self.indexer.search(query, 10)]
        hits.sort(key=lambda hit: hit[1], reverse=True)

        self.assertEqual(len(hits), 0)
Пример #10
0
def main(args):
    """Index every line of a text file and write the index to disk.

    Reads ``args.texts`` as UTF-8, turns each stripped line into a
    ``BagOfWords`` (stop-word filtering disabled) and adds it to a new
    ``Indexer``. The index is then dumped to ``args.index`` plus
    ``".json.gz"`` via ``gzip.open`` when ``args.zip`` is truthy, or plus
    ``".json"`` via ``io.open`` otherwise.

    Returns:
        0, usable as a process exit status.
    """
    indexer = Indexer()

    with io.open(args.texts, encoding="utf-8") as texts_file:
        for raw_line in texts_file:
            document = BagOfWords(raw_line.strip(), filter_stopwords=False)
            indexer.index(document)

    # Pick the opener and the matching file extension together.
    if args.zip:
        opener, suffix = gzip.open, ".json.gz"
    else:
        opener, suffix = io.open, ".json"

    with opener(args.index + suffix, mode="wb") as index_file:
        indexer.dump(index_file)
    return 0
Пример #11
0
    def test_search_4(self):
        """Check the two results, and their ranking, for the query "life ñu"."""
        query = BagOfWords("life ñu",
                           enable_stemming=False,
                           filter_stopwords=False)

        # Round the scores, then order the hits from best to worst score.
        hits = [(text, round(score, 5))
                for text, score in self.indexer.search(query, 10)]
        hits.sort(key=lambda hit: hit[1], reverse=True)

        self.assertEqual(len(hits), 2)
        # The second fixture text is expected to outrank the first one.
        best_text = hits[0][0]
        self.assertEqual(best_text, self.texts[1])
        runner_up_text = hits[1][0]
        self.assertEqual(runner_up_text, self.texts[0])
Пример #12
0
    def test_index_creation(self):
        """Check that indexing ``self.texts`` produces the expected index."""
        # Lift unittest's cap on the length of failure diffs.
        self.maxDiff = None

        indexer = Indexer()
        for raw in self.texts:
            document = BagOfWords(raw.strip(),
                                  enable_stemming=False,
                                  filter_stopwords=False)
            indexer.index(document)
        actual = indexer.to_dict()

        expected = self.expected
        self.assertSequenceEqual(expected["docs_index"],
                                 actual["docs_index"])
        self.assertDictEqual(expected["terms_index"], actual["terms_index"])
Пример #13
0
    def test_dump(self):
        """Check that a dumped index reads back as the expected JSON."""
        indexer = Indexer()
        for raw in self.texts:
            document = BagOfWords(raw.strip(),
                                  enable_stemming=False,
                                  filter_stopwords=False)
            indexer.index(document)

        buffer = StringIO()
        indexer.dump(buffer)
        # Rewind so the JSON can be parsed from the start of the buffer.
        buffer.seek(0)
        loaded = json.load(buffer)

        expected = self.expected
        self.assertSequenceEqual(expected["docs_index"],
                                 loaded["docs_index"])
        self.assertDictEqual(expected["terms_index"], loaded["terms_index"])
Пример #14
0
 def test_from_dict(self):
     """Check ``from_dict`` on a complete payload and on incomplete ones."""
     payload = {
         "text": "cat dog cow",
         "values": {"cat": 1, "dog": 1, "cow": 1},
     }
     bag = BagOfWords.from_dict(payload)
     self.assertEqual(bag.text, "cat dog cow")
     self.assertDictEqual(bag.values, {"cat": 1, "dog": 1, "cow": 1})

     # An empty dict, or one lacking "values" or "text", must be rejected.
     incomplete_payloads = (
         {},
         {"text": "blablabla"},
         {"values": {"a": 1, "b": 1}},
     )
     for incomplete in incomplete_payloads:
         with self.assertRaises(ValueError):
             BagOfWords.from_dict(incomplete)
Пример #15
0
def main(args):
    """Load a saved index and print search results for each query line.

    The index is read from ``args.index`` plus ``".json.gz"`` via
    ``gzip.open`` when ``args.zip`` is truthy, or plus ``".json"`` via
    ``io.open`` otherwise. Each non-blank, stripped line of the UTF-8 file
    ``args.texts`` becomes a ``BagOfWords`` query (stop-word filtering
    disabled) that is passed to ``Indexer.search`` together with
    ``args.limit``. For every hit the score and the first ``args.show``
    characters of the result are printed.

    Returns:
        0, usable as a process exit status.
    """
    indexer = Indexer()

    # Pick the opener and the matching file extension together.
    if args.zip:
        opener, suffix = gzip.open, ".json.gz"
    else:
        opener, suffix = io.open, ".json"
    with opener(args.index + suffix) as index_file:
        indexer.load(index_file)

    query_rule = ">" * 80
    with io.open(args.texts, encoding="utf-8") as queries:
        for raw_line in queries:
            query_text = raw_line.strip()
            if not query_text:
                # Blank lines are not queries.
                continue
            query = BagOfWords(query_text, filter_stopwords=False)
            print(query_rule)
            print("Query:\n    {}".format(query.text))
            print(query_rule)
            for text, score in indexer.search(query, args.limit):
                snippet = text[0:args.show].encode("utf-8")
                print("{}:\n    {}".format(score, snippet))
            print("<" * 80)
            print("\n\n")
    return 0
Пример #16
0
 def test_len(self):
     """Check that ``len()`` of a bag is its number of distinct words."""
     empty = BagOfWords("")
     self.assertEqual(len(empty), 0)

     distinct = BagOfWords("cat dog cow")
     self.assertEqual(len(distinct), 3)

     # "cat" appears twice but is expected to count once towards the length.
     repeated = BagOfWords(text="cat dog cat")
     self.assertEqual(len(repeated), 2)
Пример #17
0
 def test_intersection(self):
     """Check that intersecting two bags keeps only the shared word."""
     left = BagOfWords("cat dog cow fish cat cat fish")
     right = BagOfWords("dog grape banana peach")

     # "dog" is the only word present in both texts.
     shared = left.intersection(right)
     self.assertDictEqual(shared.values, {"dog": 1})
Пример #18
0
 def test_iter(self):
     """Check that iterating a bag yields ``(word, count)`` pairs."""
     # Pairs are sorted before comparing, so iteration order is not tested.
     empty_pairs = sorted(iter(BagOfWords("")))
     self.assertSequenceEqual(empty_pairs, [])

     distinct_pairs = sorted(iter(BagOfWords("cat cow dog")))
     self.assertSequenceEqual(distinct_pairs,
                              [("cat", 1), ("cow", 1), ("dog", 1)])

     repeated_pairs = sorted(iter(BagOfWords(text="cat dog cat")))
     self.assertSequenceEqual(repeated_pairs, [("cat", 2), ("dog", 1)])