def test_set_idf_tfidf_norm(self):
    """
    Tests set_idf and set_tfidf_norm.

    Add two texts and one vector:
        1- Verify idf equals 0.0
        2- Verify vector norm equals '0.0'
    Update idf, then the tf-idf norm:
        3- Verify idf not equals 0.0
        4- Verify vector norm not equals '0.0'
    """
    text, vector_1 = "foo", "foo_1"
    self.c.add_text(text)
    self.c.add_text("bar")  # important for idf

    # Built-in next() works on Python 2.6+ and Python 3, unlike the
    # Python 2-only generator method .next().
    _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
    self.assertEqual(word_info.idf, 0.0)  # 1

    self.c.add_vector(vector_1, text)
    norm = self.c.vectors_norm_db.get(vector_1)
    self.assertEqual(norm, '0.0')  # 2

    self.c.set_idf()
    _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
    self.assertNotEqual(word_info.idf, 0.0)  # 3

    self.c.set_tfidf_norm()
    norm = self.c.vectors_norm_db.get(vector_1)
    self.assertNotEqual(norm, '0.0')  # 4
def test_add_text(self):
    """
    Tests add_text.

    Add a text to the classifier:
        1- Verify that the number of texts equals 1.
        2- Verify that the words stored in the dictionary equal the
           wanted words.
    """
    flux1_text = (
        u"Comment Google classe les pages Internet "
        u"Bientôt une sphère pour remplacer souris et écrans tactiles ? "
        u"Le clip kitsch du couple présidentiel chinois"
    )
    flux1_text_wanted = [
        "bient", "chinois", "class", "clip", "comment", "coupl", "cran",
        "googl", "internet", "kitsch", "le", "pag", "pr", "re", "remplac",
        "sidentiel", "sour", "sph", "tactil"
    ]

    self.c.add_text(flux1_text)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(int(self.c.classifier_state_db.get("text_nb")), 1)  # 1

    # Keys of the dictionary database, in cursor order.
    words = [word for word, _ in kc_util.gen_db(self.c.dictionary_db.cursor())]
    self.assertEqual(words, flux1_text_wanted)  # 2
def get_items(self, name):
    """
    Returns feed's items with the generator.

    Nothing is yielded when ``get_feed`` returns None for ``name``.
    Errors raised while opening or reading the items database are
    logged and not propagated.

    Args:
        name (str): Name of the feed.

    Yields:
        tuple (str, Item): A generator of tuple (item id, item obj).
    """
    feed = self.get_feed(name)
    if feed is None:
        # Unknown feed: nothing to iterate and no database to open.
        return

    # Created before the try block so the finally clause below can never
    # reference an unbound name.
    items_db = kc.DB()
    try:
        items_db.open(feed.item_db_filename, kc.DB.OREADER)
        for item in kc_util.gen_db(items_db.cursor()):
            yield item
    except Exception as er:
        logging.error(er)
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early by the consumer.
        items_db.close()
def set_idf(self):
    """
    Recomputes the inverse document frequency (idf) of every word in the
    dictionary and writes the updated, pickled entry back.
    """
    entries = kc_util.gen_db(self.dictionary_db.cursor())
    for key, info in entries:
        # idf is derived from the entry's `number` attribute.
        info.idf = self.idf(info.number)
        serialized = pickle.dumps(info)
        self.dictionary_db.replace(key, serialized)
def print_words_structure(self):
    """
    Dumps the main dictionary to standard output: a header line, one line
    per stored value, then an empty line.
    """
    print("Dictionary words:")
    records = kc_util.gen_db(self.dictionary_db.cursor())
    for _key, entry in records:
        # Only the value is printed; the key is ignored.
        print(entry)
    print("")
def get_vectors(self):
    """
    Gives access to the stored vectors.

    Returns:
        The iterable built by kc_util.gen_db over a cursor on
        vectors_db; it yields tuple (str, Vector): name of the vector
        and vector object.
    """
    cursor = self.vectors_db.cursor()
    return kc_util.gen_db(cursor)
def test_set_idf(self):
    """
    Tests set_idf.

    Add two texts:
        1- Verify idf equals 0.0
    Update idf:
        2- Verify idf not equals 0.0
    """
    self.c.add_text("foo")
    self.c.add_text("bar")  # important for idf

    # Built-in next() works on Python 2.6+ and Python 3, unlike the
    # Python 2-only generator method .next().
    _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
    self.assertEqual(word_info.idf, 0.0)  # 1

    self.c.set_idf()
    _, word_info = next(kc_util.gen_db(self.c.dictionary_db.cursor()))
    self.assertNotEqual(word_info.idf, 0.0)  # 2
def get_feeds(self):
    """
    Iterates over the stored feeds.

    Any exception raised while reading feeds_db is logged and ends the
    iteration instead of propagating.

    Yields:
        tuple (str, Feed): A generator of tuple (feed name, feed obj).
    """
    try:
        entries = kc_util.gen_db(self.feeds_db.cursor())
        for entry in entries:
            yield entry
    except Exception as er:
        logging.error(er)