示例#1
0
文件: tests.py 项目: sankosk/SIW
 def test_iter(self):
     """Iterating a bag yields (word, count) pairs; compared in sorted order.
     """
     empty_pairs = sorted(iter(BagOfWords("")))
     self.assertSequenceEqual(empty_pairs, [])

     distinct_pairs = sorted(iter(BagOfWords("cat cow dog")))
     self.assertSequenceEqual(
         distinct_pairs, [("cat", 1), ("cow", 1), ("dog", 1)])

     repeated_pairs = sorted(iter(BagOfWords(text="cat dog cat")))
     self.assertSequenceEqual(repeated_pairs, [("cat", 2), ("dog", 1)])
示例#2
0
文件: tests.py 项目: sankosk/SIW
 def test_init(self):
     """Building a bag from text counts each word, ignoring letter case.
     """
     single_counts = BagOfWords("cat dog cow").values
     self.assertDictEqual(single_counts, {"cat": 1, "dog": 1, "cow": 1})

     mixed_case_counts = BagOfWords("Cat dog cat").values
     self.assertDictEqual(mixed_case_counts, {"cat": 2, "dog": 1})
示例#3
0
文件: tests.py 项目: sankosk/SIW
 def test_init_with_symbols_in_str(self):
     """Punctuation around words is discarded when the text is counted.
     """
     punctuated = BagOfWords(" cat, dog! cow.")
     self.assertDictEqual(punctuated.values, {"cat": 1, "dog": 1, "cow": 1})

     repeated_marks = BagOfWords(text="cat dog?? cat!!! ")
     self.assertDictEqual(repeated_marks.values, {"cat": 2, "dog": 1})
示例#4
0
文件: tests.py 项目: sankosk/SIW
 def test_union(self):
     """The union of two bags adds up the counts of every word in either.
     """
     left = BagOfWords("cat dog cow fish cat cat fish")
     right = BagOfWords("dog grape banana peach")

     merged = left.union(right)
     expected_counts = {
         "banana": 1,
         "cat": 3,
         "cow": 1,
         "dog": 2,  # present in both bags: 1 + 1
         "fish": 2,
         "grape": 1,
         "peach": 1,
     }
     self.assertDictEqual(merged.values, expected_counts)
示例#5
0
文件: tests.py 项目: sankosk/SIW
 def test_document_len(self):
     """document_len() counts word occurrences, not distinct words.
     """
     first = BagOfWords("cat dog cow fish cat cat fish")
     second = BagOfWords("dog grape banana peach")

     self.assertEqual(first.document_len(), 7)
     self.assertEqual(second.document_len(), 4)

     shared = first.intersection(second)
     self.assertEqual(shared.document_len(), 1)

     combined = first.union(second)
     self.assertEqual(combined.document_len(), 11)
示例#6
0
 def test_init_with_symbols_in_str(self):
     """Test initialization with strings that contain punctuation symbols.
     """
     punctuated = BagOfWords(" cat, dog! cow.")
     self.assertDictEqual(punctuated.values, {"cat": 1, "dog": 1, "cow": 1})

     repeated_marks = BagOfWords(text="cat dog?? cat!!! ")
     self.assertDictEqual(repeated_marks.values, {"cat": 2, "dog": 1})
示例#7
0
 def test_init_with_str(self):
     """Test initialization with strings.
     """
     positional_bag = BagOfWords("cat dog cow")
     self.assertDictEqual(
         positional_bag.values, {"cat": 1, "dog": 1, "cow": 1})

     # Same constructor through the ``text`` keyword, with mixed case input.
     keyword_bag = BagOfWords(text="Cat dog cat")
     self.assertDictEqual(keyword_bag.values, {"cat": 2, "dog": 1})
示例#8
0
文件: tests.py 项目: sankosk/SIW
 def setUp(self):
     """Create ``self.indexer`` and index one bag per entry of ``self.texts``.
     """
     self.indexer = Indexer()
     for raw_text in self.texts:
         # Stemming and stopword filtering are switched off for these bags.
         self.indexer.index(
             BagOfWords(
                 raw_text.strip(),
                 enable_stemming=False,
                 filter_stopwords=False))
示例#9
0
文件: tests.py 项目: sankosk/SIW
 def test_str(self):
     """str() of a bag looks like a dict literal listing each word count.
     """
     rendered = str(BagOfWords("cat dog cow"))

     self.assertTrue(rendered.startswith("{"))
     # Only membership is checked, not the order of the entries.
     for fragment in ("'cat': 1", "'dog': 1", "'cow': 1"):
         self.assertIn(fragment, rendered)
     self.assertTrue(rendered.endswith("}"))
示例#10
0
    def test_document_len(self):
        """Test the size of the document, not the size of the vector.
        """
        first = BagOfWords("cat dog cow fish cat cat fish")
        second = BagOfWords("dog grape banana peach")

        self.assertEqual(first.document_len(), 7)
        self.assertEqual(second.document_len(), 4)

        shared = first.intersection(second)
        self.assertEqual(shared.document_len(), 1)

        combined = first.union(second)
        self.assertEqual(combined.document_len(), 11)
示例#11
0
文件: tests.py 项目: sankosk/SIW
    def test_search_4(self):
        """Searching "life ñu" returns texts[1] ahead of texts[0].
        """
        query = BagOfWords(
            "life ñu", enable_stemming=False, filter_stopwords=False)

        # Round scores to 5 decimals, then order by score, best first.
        ranked = [(text, round(score, 5))
                  for text, score in self.indexer.search(query, 10)]
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        self.assertEqual(len(ranked), 2)
        best_text, _ = ranked[0]
        self.assertEqual(best_text, self.texts[1])
        second_text, _ = ranked[1]
        self.assertEqual(second_text, self.texts[0])
示例#12
0
 def test_str(self):
     """Test the conversion to string.
     """
     rendered = str(BagOfWords("cat dog cow"))

     self.assertTrue(rendered.startswith("{"))
     # Only membership is checked, not the order of the entries.
     for fragment in ("'cat': 1", "'dog': 1", "'cow': 1"):
         self.assertIn(fragment, rendered)
     self.assertTrue(rendered.endswith("}"))
示例#13
0
文件: tests.py 项目: sankosk/SIW
 def test_to_dict(self):
     """to_dict() exposes the original text alongside the word counts.
     """
     source_text = " cat, dog! cow."
     serialized = BagOfWords(source_text).to_dict()

     expected = {
         "text": source_text,
         "values": {"cat": 1, "dog": 1, "cow": 1},
     }
     self.assertDictEqual(serialized, expected)
示例#14
0
文件: tests.py 项目: sankosk/SIW
 def test_from_values_dict(self):
     """from_values_dict() builds a bag whose values equal the given counts.
     """
     three_words = BagOfWords.from_values_dict(
         {"cat": 1, "dog": 1, "cow": 1})
     self.assertDictEqual(
         three_words.values, {"cat": 1, "dog": 1, "cow": 1})

     repeated_word = BagOfWords.from_values_dict({"cat": 2, "dog": 1})
     self.assertDictEqual(repeated_word.values, {"cat": 2, "dog": 1})
示例#15
0
 def test_init_with_dict(self):
     """Test initialization with dictionaries.
     """
     three_words = BagOfWords(values={"cat": 1, "dog": 1, "cow": 1})
     self.assertDictEqual(
         three_words.values, {"cat": 1, "dog": 1, "cow": 1})

     repeated_word = BagOfWords(values={"cat": 2, "dog": 1})
     self.assertDictEqual(repeated_word.values, {"cat": 2, "dog": 1})
示例#16
0
文件: index.py 项目: sankosk/SIW
def main(args):
    """Index every line of the texts file and dump the index to disk.

    Reads ``args.texts`` as UTF-8, indexes one bag per stripped line, and
    writes the result to ``args.index`` plus a ``.json`` suffix, or
    ``.json.gz`` through gzip when ``args.zip`` is truthy.

    Returns:
        0 once the index has been written.
    """
    indexer = Indexer()

    with io.open(args.texts, encoding="utf-8") as texts_file:
        for raw_line in texts_file:
            indexer.index(BagOfWords(raw_line.strip(), filter_stopwords=False))

    # Pick the opener and the matching file suffix together.
    if args.zip:
        opener, suffix = gzip.open, ".json.gz"
    else:
        opener, suffix = io.open, ".json"

    with opener(args.index + suffix, mode="wb") as index_file:
        indexer.dump(index_file)
    return 0
示例#17
0
文件: tests.py 项目: sankosk/SIW
    def test_index_creation(self):
        """Indexing every text yields the expected docs and terms indexes.
        """
        self.maxDiff = None

        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(
                BagOfWords(
                    raw_text.strip(),
                    enable_stemming=False,
                    filter_stopwords=False))
        snapshot = indexer.to_dict()

        expected_docs = self.expected["docs_index"]
        self.assertSequenceEqual(expected_docs, snapshot["docs_index"])
        expected_terms = self.expected["terms_index"]
        self.assertDictEqual(expected_terms, snapshot["terms_index"])
示例#18
0
文件: tests.py 项目: sankosk/SIW
    def test_dump(self):
        """Dumping the index and re-reading it as JSON gives the expected data.
        """
        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(
                BagOfWords(
                    raw_text.strip(),
                    enable_stemming=False,
                    filter_stopwords=False))

        # Dump into an in-memory stream, then rewind before parsing it back.
        stream = StringIO()
        indexer.dump(stream)
        stream.seek(0)
        reloaded = json.load(stream)

        expected_docs = self.expected["docs_index"]
        self.assertSequenceEqual(expected_docs, reloaded["docs_index"])
        expected_terms = self.expected["terms_index"]
        self.assertDictEqual(expected_terms, reloaded["terms_index"])
示例#19
0
    def test_dump(self):
        """Test that the generated JSON file is correct.
        """
        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(
                BagOfWords(
                    raw_text.strip(),
                    enable_stemming=False,
                    filter_stopwords=False))

        # Dump into an in-memory stream, then rewind before parsing it back.
        stream = StringIO()
        indexer.dump(stream)
        stream.seek(0)
        reloaded = json.load(stream)

        expected_docs = self.expected["docs_index"]
        self.assertSequenceEqual(expected_docs, reloaded["docs_index"])
        expected_words = self.expected["words_index"]
        self.assertDictEqual(expected_words, reloaded["words_index"])
示例#20
0
文件: tests.py 项目: sankosk/SIW
 def test_from_dict(self):
     """from_dict() restores text and values, and rejects incomplete dicts.
     """
     payload = {
         "text": "cat dog cow",
         "values": {"cat": 1, "dog": 1, "cow": 1},
     }
     restored = BagOfWords.from_dict(payload)
     self.assertEqual(restored.text, "cat dog cow")
     self.assertDictEqual(restored.values, {"cat": 1, "dog": 1, "cow": 1})

     # Empty, text-only and values-only dicts must each raise ValueError.
     incomplete_payloads = (
         {},
         {"text": "blablabla"},
         {"values": {"a": 1, "b": 1}},
     )
     for incomplete in incomplete_payloads:
         with self.assertRaises(ValueError):
             BagOfWords.from_dict(incomplete)
示例#21
0
    def test_index_creation(self):
        """Test the creation of the index.

        This test uses the following example as its model:
        https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Example_of_tf%E2%80%93idf
        """
        self.maxDiff = None

        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(
                BagOfWords(
                    raw_text.strip(),
                    enable_stemming=False,
                    filter_stopwords=False))

        expected_docs = self.expected["docs_index"]
        self.assertSequenceEqual(expected_docs, indexer.docs_index)
        expected_words = self.expected["words_index"]
        self.assertDictEqual(expected_words, indexer.words_index)
示例#22
0
文件: search.py 项目: sankosk/SIW
def main(args):
    """Load a saved index and print search results for every query line.

    The index is read from ``args.index`` plus a ``.json`` suffix, or
    ``.json.gz`` through gzip when ``args.zip`` is truthy. Each non-empty
    stripped line of ``args.texts`` is used as a query. At most
    ``args.limit`` results are requested per query, and each result is
    printed truncated to ``args.show`` characters.

    Returns:
        0 once every query has been processed.
    """
    indexer = Indexer()

    # Pick the opener and the matching file suffix together.
    if args.zip:
        opener, suffix = gzip.open, ".json.gz"
    else:
        opener, suffix = io.open, ".json"
    with opener(args.index + suffix) as index_file:
        indexer.load(index_file)

    with io.open(args.texts, encoding="utf-8") as queries_file:
        for raw_line in queries_file:
            query_text = raw_line.strip()
            if not query_text:
                # Blank lines carry no query.
                continue

            bag = BagOfWords(query_text, filter_stopwords=False)
            header_rule = ">" * 80
            print(header_rule)
            print("Query:\n    {}".format(bag.text))
            print(header_rule)
            for result, score in indexer.search(bag, args.limit):
                snippet = result[0:args.show].encode("utf-8")
                print("{}:\n    {}".format(score, snippet))
            print("<" * 80)
            print("\n\n")
    return 0
示例#23
0
    def test_score(self):
        """Test the scores of a word for each document.

        This test uses the following example as its model:
        https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Example_of_tf%E2%80%93idf
        """
        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(
                BagOfWords(
                    raw_text.strip(),
                    enable_stemming=False,
                    filter_stopwords=False))

        def ranked(word):
            # Score entries for ``word``, sorted in descending order.
            entries = indexer.score(
                word, enable_stemming=False, filter_stopwords=False)
            return sorted(entries, reverse=True)

        # Tests the score of "example"
        example_scores = ranked("example")
        self.assertAlmostEqual(example_scores[0][0], 0.129, places=3)
        self.assertEqual(example_scores[0][1], 1)

        # Tests the score of "this"
        this_scores = ranked("this")
        self.assertAlmostEqual(this_scores[0][0], 0, places=3)
        self.assertEqual(this_scores[0][1], 1)
        self.assertAlmostEqual(this_scores[1][0], 0, places=3)
        self.assertEqual(this_scores[1][1], 0)

        # Tests the score of "sample"
        sample_scores = ranked("sample")
        self.assertAlmostEqual(sample_scores[0][0], 0.060, places=3)
        self.assertEqual(sample_scores[0][1], 0)
示例#24
0
文件: tests.py 项目: sankosk/SIW
 def test_len(self):
     """len() of a bag is its number of distinct words.
     """
     empty_bag = BagOfWords("")
     self.assertEqual(len(empty_bag), 0)

     distinct_bag = BagOfWords("cat dog cow")
     self.assertEqual(len(distinct_bag), 3)

     # "cat" appears twice but counts as one entry.
     repeated_bag = BagOfWords(text="cat dog cat")
     self.assertEqual(len(repeated_bag), 2)
示例#25
0
文件: tests.py 项目: sankosk/SIW
    def test_search_5(self):
        """Searching "foo bar" returns no results.
        """
        query = BagOfWords(
            "foo bar", enable_stemming=False, filter_stopwords=False)

        # Round scores to 5 decimals, then order by score, best first.
        ranked = [(text, round(score, 5))
                  for text, score in self.indexer.search(query, 10)]
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        self.assertEqual(len(ranked), 0)
示例#26
0
 def test_len(self):
     """Test the size of the vector.
     """
     default_bag = BagOfWords()
     self.assertEqual(len(default_bag), 0)

     distinct_bag = BagOfWords("cat dog cow")
     self.assertEqual(len(distinct_bag), 3)

     # "cat" appears twice but counts as one entry.
     repeated_bag = BagOfWords(text="cat dog cat")
     self.assertEqual(len(repeated_bag), 2)
示例#27
0
 def test_intersection(self):
     """Test the intersection of two bag-of-words.
     """
     left = BagOfWords("cat dog cow fish cat cat fish")
     right = BagOfWords("dog grape banana peach")

     # "dog" is the only word the two texts share.
     shared = left.intersection(right)
     self.assertDictEqual(shared.values, {"dog": 1})
示例#28
0
文件: tests.py 项目: sankosk/SIW
 def test_intersection(self):
     """The intersection of two bags keeps only the words present in both.
     """
     left = BagOfWords("cat dog cow fish cat cat fish")
     right = BagOfWords("dog grape banana peach")

     # "dog" is the only word the two texts share.
     shared = left.intersection(right)
     self.assertDictEqual(shared.values, {"dog": 1})