Пример #1
0
    def test_constructor_with_big_file_wikicorpus(self):
        """Train an EsaModel on the wiki TF-IDF corpus and round-trip it.

        The TF-IDF corpus, the dictionary and the article titles are read
        from fixed files under ``/media/sdc1/test_dump/result``. The model is
        built with 15 clusters and one feature per dictionary entry, printed,
        saved, loaded back from the same path and printed again. Nothing is
        asserted; the method only exercises construction, save and load.
        """
        # Location the trained model is written to and read back from.
        model_path = '/media/sdc1/test_dump/result/wiki_cesa.model'

        # Inputs, loaded in the order: corpus, dictionary, titles.
        corpus = MmCorpus('/media/sdc1/test_dump/result/wiki_tfidf_corpus.mm')
        dictionary = Dictionary.load("/media/sdc1/test_dump/result/wiki_wordids.dict")
        titles = DocumentTitles.load("/media/sdc1/test_dump/result/wiki_articles.txt")

        # The dictionary is only needed for its size.
        vocabulary_size = len(dictionary)

        # Train the ESA model.
        model = EsaModel(corpus,
                         num_clusters=15,
                         document_titles=titles,
                         num_features=vocabulary_size)
        print(model)

        # Persist the model, then load it again to check the round trip runs.
        model.save(model_path)
        reloaded = EsaModel.load(model_path)
        print(reloaded)
Пример #2
0
    def test_constructor_with_file_wikicorpus(self):
        """Train a CosineEsaModel on the small test corpus and round-trip it.

        Steps:

        1. Load the TF-IDF model, feature extractor, TF-IDF corpus,
           dictionary and article titles from fixed files under
           ``/media/sdc1/test_dump/result``.
        2. Connect to the mongo database named in ``self.config_['database']``
           and look up one user by a hard-coded e-mail address.
        3. Split that user's ranked articles into read and unread ids and
           sample them with ``get_samples`` to obtain ``X`` and ``y``.
        4. Train a ``CosineEsaModel`` with 15 best features, print it, save
           it, load it back and print the loaded model.

        Nothing about the resulting model is asserted.

        Raises:
            AssertionError: if no user matches the hard-coded e-mail address.
        """
        # The loaded model is not referenced again in this method, so the
        # result is not bound to a name; the call itself is kept so that a
        # missing or unreadable model file still fails the test.
        # NOTE(review): confirm whether this load is still needed at all.
        tfidfmodel.TfidfModel.load("/media/sdc1/test_dump/result/test_tfidf.model")
        extractor = TfidfFeatureExtractor("/media/sdc1/test_dump/result/test")

        # Load tf-idf corpus, dictionary and article titles.
        tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')
        id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")
        document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

        # Connect to the mongo database.
        db_config = self.config_['database']
        connect(db_config['db-name'],
                username=db_config['user'],
                password=db_config['passwd'],
                port=db_config['port'])

        # Load articles as test corpus.
        user = User.objects(email=u"*****@*****.**").first()
        if user is None:
            # Fail with a clear message instead of an AttributeError on
            # ``user.id`` further down.
            raise AssertionError("test user not found in the database")

        ranked_article_ids = (ranked.article.id
                              for ranked
                              in RankedArticle.objects(user_id=user.id).only("article"))
        all_article_ids = Set(article.id
                              for article
                              in Article.objects(id__in=ranked_article_ids).only("id"))

        read_article_ids = Set(feedback.article.id
                               for feedback
                               in ReadArticleFeedback.objects(user_id=user.id).only("article"))

        # Everything ranked for the user that has no read feedback.
        unread_article_ids = all_article_ids - read_article_ids

        # Sample test articles.
        X, y = get_samples(extractor, read_article_ids, unread_article_ids)

        num_samples, num_sample_features = X.shape
        # Arguments are passed separately so the message is only formatted
        # when debug logging is enabled.
        logger.debug("Traning with %d samples, %d features, %d marks",
                     num_samples, num_sample_features, len(y))

        # Train the ESA model.
        esa_model = CosineEsaModel(tfidf_corpus,
                                   document_titles=document_titles,
                                   test_corpus=X,
                                   test_corpus_targets=y,
                                   num_test_corpus=len(y),
                                   num_best_features=15,
                                   num_features=len(id2token))

        print(esa_model)

        # Persist the model, then load it again to check the round trip runs.
        esa_model.save('/media/sdc1/test_dump/result/test_cesa.model')

        tmp_esa = CosineEsaModel.load('/media/sdc1/test_dump/result/test_cesa.model')
        print(tmp_esa)