Exemplo n.º 1
0
    def test_constructor_with_file_wikicorpus(self):
        """Smoke-test training, saving and reloading an ``EsaModel``.

        Loads the tf-idf corpus, the dictionary and the article titles from
        fixture files under ``/media/sdc1/test_dump/result/``, trains an
        ``EsaModel`` with 15 clusters, saves it to ``wiki_esa.model`` in the
        same directory and loads it back again.

        Both the trained and the reloaded model are printed. Nothing is
        asserted, so the test only fails when one of the steps raises.
        """
        # Single definition of the model file so that save() and load()
        # cannot drift apart.
        model_path = '/media/sdc1/test_dump/result/wiki_esa.model'

        # load tf-idf corpus
        tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

        # load dictionary; only its length is used (as num_features below)
        id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

        # load article titles
        document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

        # train esa model
        esa_model = EsaModel(tfidf_corpus, num_clusters=15,
                             document_titles=document_titles,
                             num_features=len(id2token))

        # print(x) with one argument produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print(esa_model)

        # round-trip the model through disk
        esa_model.save(model_path)

        tmp_esa = EsaModel.load(model_path)
        print(tmp_esa)
    """LDA Model creation"""

    # Build the LDA model from mm_tfidf with NUM_TOPICS topics, passing
    # id2token as the id -> word mapping. mm_tfidf, id2token and NUM_TOPICS
    # are defined outside this fragment (presumably the tf-idf corpus and its
    # dictionary -- TODO confirm).
    # NOTE(review): update_every, chunksize and passes are handed straight to
    # LdaModel; check its documentation before changing them.
    lda = models.LdaModel(corpus=mm_tfidf, id2word=id2token,
                          num_topics=NUM_TOPICS, update_every=1,
                          chunksize=10000,
                          passes=2)

    # Save the trained model; like every other output of this fragment, the
    # file name is derived from options.prefix.
    lda.save(options.prefix + '_lda.model')

    # Save the corpus as LDA vectors in Matrix Market format: the result of
    # lda[mm_tfidf] is written to <prefix>_lda_corpus.mm.
    # NOTE(review): progress_cnt=10000 presumably sets how often progress is
    # reported -- confirm against the MmCorpus.serialize documentation.
    corpora.MmCorpus.serialize(options.prefix + '_lda_corpus.mm', lda[mm_tfidf],
                               progress_cnt=10000)

    # Init a reader on the file just written; it is the input corpus of the
    # ESA model built below.
    mm_lda = corpora.MmCorpus(options.prefix + '_lda_corpus.mm')

    """ESA Model creation"""

    # Load the document titles from <prefix>_articles.txt.
    article_titles = DocumentTitles.load(options.prefix + "_articles.txt")

    # Build the ESA model on the LDA corpus. num_features is NUM_TOPICS, the
    # same value given as num_topics to the LDA model above; num_clusters is
    # hard-coded to 10000.
    esa = EsaModel(mm_lda, num_clusters=10000, document_titles=article_titles,
                   num_features=NUM_TOPICS)

    # Save the ESA model as <prefix>_esa_on_lda.model.
    esa.save(options.prefix + "_esa_on_lda.model")

    logger.info("finished transforming")