예제 #1
0
    def test_constructor_with_file_wikicorpus(self):
        """Train an EsaModel on the wiki test dump and round-trip it via save/load.

        Reads the TF-IDF corpus, the dictionary and the article titles from a
        fixed result directory, trains the model, saves it, loads it back and
        prints both models. No assertions are made: the test only fails if
        one of these steps raises.
        """
        # All fixtures and the saved model live in the same directory.
        result_dir = '/media/sdc1/test_dump/result/'

        # load tf-idf corpus
        tfidf_corpus = MmCorpus(result_dir + 'test_tfidf_corpus.mm')

        # load dictionary; only its size is used here, as the feature count
        id2token = Dictionary.load(result_dir + 'test_wordids.dict')

        # load article titles
        document_titles = DocumentTitles.load(result_dir + 'test_articles.txt')

        # train esa model
        esa_model = EsaModel(tfidf_corpus, num_clusters=15,
                             document_titles=document_titles,
                             num_features=len(id2token))

        # Single-argument print(x) behaves the same as the Python 2 print
        # statement and is also valid Python 3.
        print(esa_model)

        # save the model and load it back from the same path
        model_path = result_dir + 'wiki_esa.model'
        esa_model.save(model_path)

        tmp_esa = EsaModel.load(model_path)
        print(tmp_esa)
    """LDA Model creation"""

    # Train an LDA model on the TF-IDF corpus. mm_tfidf, id2token and
    # NUM_TOPICS are defined outside this excerpt.
    # NOTE(review): update_every/chunksize/passes presumably follow gensim's
    # LdaModel semantics (online update per 10000-document chunk, two passes
    # over the corpus) -- confirm against the installed gensim version.
    lda = models.LdaModel(corpus=mm_tfidf, id2word=id2token,
                          num_topics=NUM_TOPICS, update_every=1,
                          chunksize=10000,
                          passes=2)

    # Persist the trained model under the shared output prefix.
    lda.save(options.prefix + '_lda.model')

    # Apply the LDA model to the TF-IDF corpus (lda[mm_tfidf]) and write the
    # resulting vectors to disk in Matrix Market format.
    corpora.MmCorpus.serialize(options.prefix + '_lda_corpus.mm', lda[mm_tfidf],
                               progress_cnt=10000)

    # Re-open the serialized LDA corpus; it is the input of the ESA model
    # built below.
    mm_lda = corpora.MmCorpus(options.prefix + '_lda_corpus.mm')

    """ESA Model creation"""

    # Load the document titles from "<prefix>_articles.txt".
    article_titles = DocumentTitles.load(options.prefix + "_articles.txt")

    # Build the ESA model on top of the LDA corpus. num_features is set to
    # NUM_TOPICS, the same value passed as num_topics to the LDA model above.
    esa = EsaModel(mm_lda, num_clusters=10000, document_titles=article_titles,
                   num_features=NUM_TOPICS)

    # Persist the ESA model under the shared output prefix.
    esa.save(options.prefix + "_esa_on_lda.model")

    logger.info("finished transforming")
예제 #3
0
                MM_BOW, id2word=CORPUS.dictionary, normalize=True)
            TF_IDF.save(TF_IDF_PATH)
        else:
            TF_IDF = models.TfidfModel.load(TF_IDF_PATH)
        # Per-language location of the TF-IDF corpus in Matrix Market format.
        TF_IDF_CORPUS_PATH = os.path.join(
            OPTIONS.prefix, language + "_tfidf_corpus.mm")
        # Serialize the TF-IDF-transformed bag-of-words corpus only when the
        # file is missing; otherwise the existing file is reused as-is.
        if not os.path.exists(TF_IDF_CORPUS_PATH):
            corpora.MmCorpus.serialize(
                TF_IDF_CORPUS_PATH, TF_IDF[MM_BOW], progress_cnt=10000)
        # Open the corpus from disk in either case (fresh or pre-existing).
        MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH)
        LOGGER.info("Finished %s-TF-IDF Model Generation", language)

        # Build the per-language ESA model on the TF-IDF corpus unless a saved
        # model already exists at ESA_PATH.
        ESA_PATH = os.path.join(
            OPTIONS.prefix, language + "_esa_on_tfidf.model")
        if not os.path.exists(ESA_PATH):
            ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)
            ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES)
            ESA.save(ESA_PATH)
        LOGGER.info("Finished %s-ESA Model Generation", language)

        # For English, additionally build a reduced model limited to
        # NUM_TOPICS concepts.
        if language == 'en':
            SMALL_EN_ESA_PATH = os.path.join(
                OPTIONS.prefix, "small_en_esa_on_tfidf.model")
            if not os.path.exists(SMALL_EN_ESA_PATH):
                # Load the titles here as well: the branch above is skipped
                # when the full model already exists, which would otherwise
                # leave ARTICLE_TITLES undefined (or stale) at this point.
                ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)
                ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES,
                               num_concepts=NUM_TOPICS)
                ESA.save(SMALL_EN_ESA_PATH)
            LOGGER.info("Finished small en-ESA Model Generation")


        LOGGER.info("Finished ALL Transforming Activity")