示例#1
0
    def test_set_idf_tfidf_norm(self):
        """ Tests set_idf followed by set_tfidf_norm.

        Add two texts and one vector:
         1- Verify idf equals 0.0
         2- Verify the vector norm equals '0.0'

        Update idf, then the tf-idf norm:
         3- Verify idf not equals 0.0
         4- Verify the vector norm not equals '0.0'

        """
        text, vector_1 = "foo", "foo_1"
        self.c.add_text(text)
        self.c.add_text("bar") # important for idf

        # Only the first entry of the dictionary is inspected.
        _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
        self.assertEqual(word_info.idf, 0.0) # 1

        self.c.add_vector(vector_1, text)

        # The norm is compared as a string, as returned by the norm database.
        norm = self.c.vectors_norm_db.get(vector_1)
        self.assertEqual(norm, '0.0') # 2

        self.c.set_idf()

        _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
        self.assertNotEqual(word_info.idf, 0.0) # 3

        self.c.set_tfidf_norm()

        norm = self.c.vectors_norm_db.get(vector_1)
        self.assertNotEqual(norm, '0.0') # 4
示例#2
0
    def test_add_text(self):
        """ Tests add_text.

        Add a text to the classifier:
         1- Verify that the number of texts equals 1.
         2- Verify that the words stored in the dictionary equal the
            expected words.

        """
        flux1_text = (
            u"Comment Google classe les pages Internet "
            u"Bientôt une sphère pour remplacer souris et écrans tactiles ? "
            u"Le clip kitsch du couple présidentiel chinois"
        )

        # Expected dictionary keys, in the order the dictionary yields them.
        flux1_text_wanted = [
            "bient", "chinois", "class", "clip", "comment", "coupl", 
            "cran", "googl", "internet", "kitsch", "le", "pag", "pr", 
            "re", "remplac", "sidentiel", "sour", "sph", "tactil"
        ]

        self.c.add_text(flux1_text)

        self.assertEqual(int(self.c.classifier_state_db.get("text_nb")), 1) # 1

        words = [word for word, _ in kc_util.gen_db(self.c.dictionary_db.cursor())]
        self.assertEqual(words, flux1_text_wanted) # 2
示例#3
0
    def get_items(self, name):
        """ Returns feed's items with the generator.

        The item database of the feed is opened read-only and closed when
        the generator finishes or is closed. Errors raised while opening or
        iterating are logged, not propagated. Nothing is yielded when the
        feed is None.

        Args:
            name (str): Name of the feed.

        Yields:
            tuple (str, Item): A generator of tuple (item id, item obj).

        """
        feed = self.get_feed(name)
        if feed is None:
            # Unknown feed: nothing to iterate and nothing worth logging.
            return

        items_db = None
        try:
            items_db = kc.DB()
            # NOTE(review): the result of open() is ignored; if this DB type
            # reports failures through its return value, check it here.
            items_db.open(feed.item_db_filename, kc.DB.OREADER)

            for item in kc_util.gen_db(items_db.cursor()):
                yield item

        except Exception as er:
            logging.error(er)

        finally:
            # items_db is still None when kc.DB() itself raised; closing it
            # unconditionally would raise a NameError that hides that error.
            if items_db is not None:
                items_db.close()
示例#4
0
    def set_idf(self):
        """ Recomputes the inverse document frequency (idf) of every word.

        For each entry of the dictionary database, the idf attribute is
        refreshed from the entry's number attribute and the entry is
        written back pickled under the same key.

        """
        entries = kc_util.gen_db(self.dictionary_db.cursor())
        for key, info in entries:
            info.idf = self.idf(info.number)
            serialized = pickle.dumps(info)
            self.dictionary_db.replace(key, serialized)
示例#5
0
 def print_words_structure(self):
     """ Dumps the main dictionary to standard output.

     Prints a header line, then the value of each dictionary entry,
     each followed by an empty line.

     """
     print("Dictionary words:")
     entries = kc_util.gen_db(self.dictionary_db.cursor())
     for _, word_info in entries:
         print(word_info)
         print("")
示例#6
0
    def get_vectors(self):
        """ Gives access to the stored vectors.

        Returns:
            The iterable built by kc_util.gen_db over a cursor of the
            vectors database; it yields tuple (str, Vector), i.e. the
            name of the vector and the vector object.

        """
        vectors_cursor = self.vectors_db.cursor()
        return kc_util.gen_db(vectors_cursor)
示例#7
0
    def test_set_idf(self):
        """ Tests set_idf.

        Add two texts:
         1- Verify idf equals 0.0

        Update idf:
         2- Verify idf not equals 0.0

        """
        self.c.add_text("foo")
        self.c.add_text("bar") # important for idf

        # Only the first entry of the dictionary is inspected.
        _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
        self.assertEqual(word_info.idf, 0.0) # 1

        self.c.set_idf()

        _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
        self.assertNotEqual(word_info.idf, 0.0) # 2
示例#8
0
    def get_feeds(self):
        """ Iterates over the stored feeds.

        Any exception raised while creating the cursor or iterating is
        logged and ends the iteration instead of propagating.

        Yields:
            tuple (str, Feed): A generator of tuple (feed name, feed obj).

        """
        try:
            feeds = kc_util.gen_db(self.feeds_db.cursor())
            for entry in feeds:
                yield entry

        except Exception as error:
            logging.error(error)