Exemplo n.º 1
0
    def test_constructor_with_file_wikicorpus(self):
        """Build an ESA model from files on disk, save it and load it back.

        The tf-idf corpus, the dictionary and the article titles are read
        from fixed paths. Both the freshly trained model and the reloaded
        copy are printed; the test makes no assertions.
        """
        model_path = '/media/sdc1/test_dump/result/wiki_esa.model'

        # tf-idf corpus in matrix market format
        corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

        # the lda corpus is currently not loaded:
        # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

        # dictionary and article titles
        word_ids = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")
        titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

        # train the esa model; num_features is set to the dictionary size
        trained = EsaModel(corpus,
                           num_clusters=15,
                           document_titles=titles,
                           num_features=len(word_ids))
        print(trained)

        # round trip through save/load
        trained.save(model_path)
        reloaded = EsaModel.load(model_path)
        print(reloaded)
Exemplo n.º 2
0
 def __init__(self, prefix):
     """
     Load the dictionary and the tfidf, lda and esa models.

     prefix is the common path prefix of the stored files; the suffixes
     "_wordids.dict", "_tfidf.model", "_lda.model" and "_esa_on_lda.model"
     are appended to it to obtain the individual file names.
     """
     message = "Load dictionary, tfidf model, lda model and esa model with prefix %s" % prefix
     logger.info(message)

     # Derive every file name from the shared prefix first ...
     dictionary_path = prefix + "_wordids.dict"
     tfidf_path = prefix + "_tfidf.model"
     lda_path = prefix + "_lda.model"
     esa_path = prefix + "_esa_on_lda.model"

     # ... then load the artifacts in the same order as before.
     self.dictionary = corpora.Dictionary.load(dictionary_path)
     self.tfidf_model = models.TfidfModel.load(tfidf_path)
     self.lda_model = models.LdaModel.load(lda_path)
     self.esa_model = EsaModel.load(esa_path)
    """LDA Model creation"""

    # Train the LDA model on the tf-idf corpus.
    lda = models.LdaModel(
        corpus=mm_tfidf,
        id2word=id2token,
        num_topics=NUM_TOPICS,
        update_every=1,
        chunksize=10000,
        passes=2)

    # Persist the trained model.
    lda_model_path = options.prefix + '_lda.model'
    lda.save(lda_model_path)

    # Write the corpus, transformed by the LDA model, in matrix market format.
    lda_corpus_path = options.prefix + '_lda_corpus.mm'
    corpora.MmCorpus.serialize(
        lda_corpus_path, lda[mm_tfidf], progress_cnt=10000)

    # Reopen the serialized LDA corpus for reading.
    mm_lda = corpora.MmCorpus(lda_corpus_path)

    """ESA Model creation"""

    # Titles of the documents.
    titles_path = options.prefix + "_articles.txt"
    article_titles = DocumentTitles.load(titles_path)

    # Build the ESA model on top of the LDA vectors and store it.
    esa = EsaModel(
        mm_lda,
        num_clusters=10000,
        document_titles=article_titles,
        num_features=NUM_TOPICS)
    esa_model_path = options.prefix + "_esa_on_lda.model"
    esa.save(esa_model_path)

    logger.info("finished transforming")
Exemplo n.º 4
0
 # Read the input document into one string.
 logger.info("Load text file %s", options.text)

 try:
     # 'text_file' instead of 'file' so the Python 2 builtin is not shadowed.
     with open(options.text, "r") as text_file:
         # Lines keep their line endings and are additionally joined by a space.
         doc = " ".join(text_file)
 except Exception as e:
     # Report the reason for the failure before terminating with status 1.
     logger.error("Could not load document from %s: %s", options.text, e)
     sys.exit(1)

 # Load dictionary, tfidf model, lda model and esa model; all file names
 # are derived from the common prefix.
 logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s",
             options.prefix)
 dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict")
 tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model")
 lda_model = models.LdaModel.load(options.prefix + "_lda.model")
 esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model")

 # Turn the document text into a list of tokens.
 logger.info("Lemmatize document.")
 tokens = utils.lemmatize(doc)

 # Bag-of-words representation of the token list.
 logger.info("Create bag-of-words representation from document.")
 doc_bow = dictionary.doc2bow(tokens)

 # tfidf representation of the bag-of-words.
 logger.info("Transform to tfidf.")
 doc_tfidf = tfidf_model[doc_bow]

 # lda representation of the tfidf vector (the transformation itself
 # follows after this log line).
 logger.info("Transform to lda")
Exemplo n.º 5
0
                MM_BOW, id2word=CORPUS.dictionary, normalize=True)
            TF_IDF.save(TF_IDF_PATH)
        else:
            TF_IDF = models.TfidfModel.load(TF_IDF_PATH)
        # Serialize the TF-IDF-transformed corpus unless the file already
        # exists, then open it from disk.
        TF_IDF_CORPUS_PATH = os.path.join(
            OPTIONS.prefix, language + "_tfidf_corpus.mm")
        if not os.path.exists(TF_IDF_CORPUS_PATH):
            corpora.MmCorpus.serialize(
                TF_IDF_CORPUS_PATH, TF_IDF[MM_BOW], progress_cnt=10000)
        MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH)
        LOGGER.info("Finished %s-TF-IDF Model Generation", language)

        # Build and save the ESA model on the TF-IDF corpus unless a saved
        # model file already exists.
        ESA_PATH = os.path.join(
            OPTIONS.prefix, language + "_esa_on_tfidf.model")
        if not os.path.exists(ESA_PATH):
            ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)
            ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES)
            ESA.save(ESA_PATH)
        LOGGER.info("Finished %s-ESA Model Generation", language)

        # For English, additionally build a model limited to NUM_TOPICS
        # concepts.
        if language == 'en':
            SMALL_EN_ESA_PATH = os.path.join(
                OPTIONS.prefix, "small_en_esa_on_tfidf.model")
            if not os.path.exists(SMALL_EN_ESA_PATH):
                # The titles must be loaded here as well: the branch above
                # is skipped when the full model file already exists, which
                # would leave ARTICLE_TITLES unassigned (or holding a value
                # from an earlier assignment).
                ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)
                ESA = EsaModel(
                    MM_TF_IDF, document_titles=ARTICLE_TITLES,
                    num_concepts=NUM_TOPICS)
                ESA.save(SMALL_EN_ESA_PATH)
            LOGGER.info("Finished small en-ESA Model Generation")


        LOGGER.info("Finished ALL Transforming Activity")