def test_constructor_with_big_file_wikicorpus(self):
    """Train an EsaModel on the wiki tf-idf corpus and round-trip it on disk.

    Loads the tf-idf corpus, the token dictionary and the article titles
    from fixed paths under /vagrant/data, builds an EsaModel with 15
    clusters, prints it, saves it, loads it back and prints the copy.
    The test makes no assertions; it only fails if one of these steps
    raises.
    """
    model_path = '/vagrant/data/wiki_cesa.model'

    # Fixtures: tf-idf corpus, token dictionary, article titles.
    corpus = MmCorpus('/vagrant/data/wiki_tfidf_corpus.mm')
    dictionary = Dictionary.load("/vagrant/data/wiki_wordids.dict")
    titles = DocumentTitles.load("/vagrant/data/wiki_articles.txt")

    # The feature count passed to the model is the dictionary size.
    vocabulary_size = len(dictionary)
    trained = EsaModel(
        corpus,
        num_clusters=15,
        document_titles=titles,
        num_features=vocabulary_size,
    )
    print(trained)

    # Persist the model and read it back from the same path.
    trained.save(model_path)
    reloaded = EsaModel.load(model_path)
    print(reloaded)
def test_constructor_with_file_wikicorpus(self):
    """Train a CosineEsaModel using a user's read/unread articles as test corpus.

    Loads the tf-idf fixtures from disk, connects to the database described
    by ``self.config_['database']``, splits one user's ranked articles into
    read and unread ids, samples them through ``get_samples`` and feeds the
    result to CosineEsaModel. Every line of the trained model is printed,
    then the model is saved, loaded back and printed again. The test makes
    no assertions; it only fails if one of these steps raises.
    """
    model_path = '/vagrant/data/test_cesa.model'

    # NOTE(review): the loaded tf-idf model is never referenced again in
    # this test; the load is kept so the same files are touched as before.
    tfidf_model = tfidfmodel.TfidfModel.load("/vagrant/data/test_tfidf.model")
    feature_extractor = TfidfFeatureExtractor("/vagrant/data/test")

    # Fixtures: tf-idf corpus, token dictionary, article titles.
    corpus = MmCorpus('/vagrant/data/test_tfidf_corpus.mm')
    dictionary = Dictionary.load("/vagrant/data/test_wordids.dict")
    titles = DocumentTitles.load("/vagrant/test_articles.txt")

    # Connect to the mongo database using the test configuration.
    db_settings = self.config_['database']
    connect(
        db_settings['db-name'],
        username=db_settings['user'],
        password=db_settings['passwd'],
        port=db_settings['port'],
    )

    # Collect the ids of the user's ranked articles, then split them into
    # read and unread sets.
    user = User.objects(email=u"*****@*****.**").first()
    ranked_ids = (
        ranked.article.id
        for ranked in RankedArticle.objects(user_id=user.id).only("article")
    )
    known_ids = {
        article.id
        for article in Article.objects(id__in=ranked_ids).only("id")
    }
    read_ids = {
        feedback.article.id
        for feedback in ReadArticleFeedback.objects(user_id=user.id).only("article")
    }
    unread_ids = known_ids.difference(read_ids)

    # Sample the test articles; the sample matrix is unpacked as
    # (samples, features).
    samples, targets = get_samples(feature_extractor, read_ids, unread_ids)
    n_samples, n_features = samples.shape
    message = "Training with %d samples, %d features, %d marks" % (
        n_samples, n_features, len(targets))
    logger.debug(message)

    # Train the model with the sampled articles as test corpus.
    trained = CosineEsaModel(
        corpus,
        document_titles=titles,
        test_corpus=samples,
        test_corpus_targets=targets,
        num_test_corpus=len(targets),
        num_best_features=15,
        num_features=len(dictionary),
    )

    for row in trained:
        print(repr(row))

    # Persist the model and read it back from the same path.
    trained.save(model_path)
    reloaded = CosineEsaModel.load(model_path)
    print(reloaded)