Exemplo n.º 1
0
    def setUp(self):
        """ Builds the fixtures shared by the tests.

        Feeds one French sentence to a freshly created Classifier, then
        keeps a reference to the classifier's dictionary database and a
        VectorItem built from ("googl", "1").

        """
        classifier = Classifier(CleanTextUtil("french"))
        classifier.add_text(u"Comment Google classe les pages Internet")

        # Only the dictionary database is kept; the classifier itself is
        # not stored on the test instance.
        self.dictionary_db = classifier.dictionary_db
        self.vi = VectorItem("googl", "1")
Exemplo n.º 2
0
class TestClassifier(unittest.TestCase):
    """ Tests the Classifier class.

    Every test starts from a fresh Classifier configured with a French
    CleanTextUtil (see setUp); rm_data_dir() is called after each test.

    """
    def setUp(self):
        """ Creates the classifier under test. """
        self.c = Classifier(CleanTextUtil("french"))

    def tearDown(self):
        """ Calls rm_data_dir() once the test is over.

        NOTE(review): presumably this deletes the data written by the
        classifier so tests stay independent -- confirm in rm_data_dir.

        """
        rm_data_dir()

    def _first_word_info(self):
        """ Returns the value of the first entry of the dictionary db.

        A new cursor is opened on each call, so the entry is re-read from
        the database every time.

        """
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # generator method .next() only exists on Python 2.
        _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
        return word_info

    def test_add_text(self):
        """ Tests add_text.

        Add a text to the classifier:
         1- Verify that the number of texts equals 1.
         2- Verify that the words stored are equal to the words wanted.

        """
        flux1_text = (
            u"Comment Google classe les pages Internet "
            u"Bientôt une sphère pour remplacer souris et écrans tactiles ? "
            u"Le clip kitsch du couple présidentiel chinois"
        )

        # Expected dictionary keys, listed in sorted order: the list
        # comparison below requires the cursor to yield them in that order.
        flux1_text_wanted = [
            "bient", "chinois", "class", "clip", "comment", "coupl",
            "cran", "googl", "internet", "kitsch", "le", "pag", "pr",
            "re", "remplac", "sidentiel", "sour", "sph", "tactil",
        ]

        self.c.add_text(flux1_text)

        text_nb = int(self.c.classifier_state_db.get("text_nb"))
        self.assertEqual(text_nb, 1)  # 1

        cursor = self.c.dictionary_db.cursor()
        words = [word for word, _ in kc_util.gen_db(cursor)]
        self.assertEqual(words, flux1_text_wanted)  # 2

    def test_set_idf(self):
        """ Tests set_idf.

        Add two texts:
         1- Verify idf equals 0.0

        Set idf:
         2- Verify idf not equals 0.0

        """
        self.c.add_text("foo")
        self.c.add_text("bar")  # important for idf

        self.assertEqual(self._first_word_info().idf, 0.0)  # 1

        self.c.set_idf()

        self.assertNotEqual(self._first_word_info().idf, 0.0)  # 2

    def test_set_idf_tfidf_norm(self):
        """ Tests set_idf followed by set_tfidf_norm.

        Add two texts:
         1- Verify idf equals 0.0

        Add a vector:
         2- Verify the vector norm equals '0.0'

        Set idf:
         3- Verify idf not equals 0.0

        Set tfidf norm:
         4- Verify the vector norm not equals '0.0'

        """
        text, vector_1 = "foo", "foo_1"
        self.c.add_text(text)
        self.c.add_text("bar")  # important for idf

        self.assertEqual(self._first_word_info().idf, 0.0)  # 1

        self.c.add_vector(vector_1, text)

        # The norm is compared against the string '0.0', not a float.
        norm = self.c.vectors_norm_db.get(vector_1)
        self.assertEqual(norm, '0.0')  # 2

        self.c.set_idf()

        self.assertNotEqual(self._first_word_info().idf, 0.0)  # 3

        self.c.set_tfidf_norm()

        norm = self.c.vectors_norm_db.get(vector_1)
        self.assertNotEqual(norm, '0.0')  # 4

    def test_add_vector(self):
        """ Tests add_vector.

        Add a text.

        1- Check that there is no vector yet.

        Add a vector:
         2- Check that the vector now exists.

        """
        text, vector_1 = "foo", "foo_1"
        self.c.add_text(text)

        vector = self.c.get_vector(vector_1)
        self.assertIsNone(vector)  # 1

        self.c.add_vector(vector_1, text)

        vector = self.c.get_vector(vector_1)
        self.assertIsInstance(vector, Vector)  # 2

    def test_rm_vector(self):
        """ Tests rm_vector.

        Add a text.
        Add a vector:
         1- Check that the vector exists.
        Remove the vector:
         2- Check that the vector doesn't exist anymore.

        """
        text, vector_1 = "foo", "foo_1"
        self.c.add_text(text)
        self.c.add_vector(vector_1, text)

        vector = self.c.get_vector(vector_1)
        self.assertIsNotNone(vector)  # 1

        self.c.rm_vector(vector_1)

        vector = self.c.get_vector(vector_1)
        self.assertIsNone(vector)  # 2

    def test_get_vector(self):
        """ Tests get_vector.

        Add a text.
        Add a vector.
         1- Check that the vector exists with "get_vector".

        Get an unknown vector.
         2- Check that the vector is None.

        """
        text, vector_1 = "foo", "foo_1"

        self.c.add_text(text)
        self.c.add_vector(vector_1, text)

        vector = self.c.get_vector(vector_1)
        self.assertIsNotNone(vector)  # 1

        vector = self.c.get_vector("unknown vector")
        self.assertIsNone(vector)  # 2

    def test_get_vectors(self):
        """ Tests get_vectors.

        Add two texts.
        Add two vectors.

        With get_vectors:
         1- Verify the names match those returned by get_vectors_name.
         2- Verify the objects match by comparing their tags.

        """
        text_1, vector_1, tag_1 = "foo", "foo_1", "SPORT"
        text_2, vector_2, tag_2 = "bar", "bar_1", "BUSINESS"

        self.c.add_text(text_1)
        self.c.add_text(text_2)

        self.c.add_vector(vector_1, text_1, tag_1)
        self.c.add_vector(vector_2, text_2, tag_2)

        # Sets are compared so the checks do not depend on iteration order.
        names = [name for name, _ in self.c.get_vectors()]
        self.assertEqual(set(names), set(self.c.get_vectors_name()))  # 1

        tags = [vector.tag for _, vector in self.c.get_vectors()]
        self.assertEqual({tag_1, tag_2}, set(tags))  # 2

    def test_get_vectors_name(self):
        """ Tests get_vectors_name.

        Add two texts.
        Add two vectors.
        Get vectors names:
         1- Verify the names are the ones that were added.

        """
        text_1, vector_1 = "foo", "foo_1"
        text_2, vector_2 = "bar", "bar_1"

        self.c.add_text(text_1)
        self.c.add_text(text_2)

        self.c.add_vector(vector_1, text_1)
        self.c.add_vector(vector_2, text_2)

        names = self.c.get_vectors_name()
        self.assertEqual({vector_1, vector_2}, set(names))  # 1

    def test_idf(self):
        """ Placeholder for the idf test.

        """
        # TODO: implement this test and the placeholder tests below.
        # They are reported as skipped instead of silently passing, so the
        # missing coverage stays visible in the test report.
        self.skipTest("test_idf is not implemented yet")

    def test_tf_idf(self):
        """ Placeholder for the tf_idf test.

        """
        self.skipTest("test_tf_idf is not implemented yet")

    def test_vector_tfidf_norm(self):
        """ Placeholder for the vector_tfidf_norm test.

        """
        self.skipTest("test_vector_tfidf_norm is not implemented yet")

    def test_scalar_product(self):
        """ Placeholder for the scalar_product test.

        """
        self.skipTest("test_scalar_product is not implemented yet")

    def test_cosine_sim(self):
        """ Placeholder for the cosine_sim test.

        """
        self.skipTest("test_cosine_sim is not implemented yet")

    def test_kNN(self):
        """ Placeholder for the kNN test.

        """
        self.skipTest("test_kNN is not implemented yet")

    def test_get_category(self):
        """ Placeholder for the get_category test.

        """
        self.skipTest("test_get_category is not implemented yet")

    def test_print_words_structure(self):
        """ Placeholder for the print_words_structure test.

        """
        self.skipTest("test_print_words_structure is not implemented yet")

    def test_print_vector(self):
        """ Placeholder for the print_vector test.

        """
        self.skipTest("test_print_vector is not implemented yet")

    def test_print_vectors(self):
        """ Placeholder for the print_vectors test.

        """
        self.skipTest("test_print_vectors is not implemented yet")