예제 #1
0
    def test_constructor_with_file_wikicorpus(self):
        """Train an EsaModel on the on-disk tf-idf corpus and round-trip it.

        Loads the tf-idf corpus, the dictionary and the article titles from
        the fixed test-dump directory, builds an ``EsaModel`` with 15
        clusters, saves it and loads it back.  The trained and the reloaded
        model are only printed; nothing is asserted, so this test fails only
        if one of the steps raises.
        """
        # All fixtures and the saved model live in the same directory.
        result_dir = '/media/sdc1/test_dump/result/'

        # load tf-idf corpus
        tfidf_corpus = MmCorpus(result_dir + 'test_tfidf_corpus.mm')

        # NOTE: the lda corpus (test_lda_corpus.mm) is not loaded here.

        # load dictionary
        id2token = Dictionary.load(result_dir + 'test_wordids.dict')

        # load article titles
        document_titles = DocumentTitles.load(result_dir + 'test_articles.txt')

        # train esa model; the feature count is the dictionary size
        esa_model = EsaModel(tfidf_corpus, num_clusters=15,
                             document_titles=document_titles,
                             num_features=len(id2token))

        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(esa_model)

        # save the model and load it back from the same path
        model_path = result_dir + 'wiki_esa.model'
        esa_model.save(model_path)

        tmp_esa = EsaModel.load(model_path)
        print(tmp_esa)
예제 #2
0
파일: extractors.py 프로젝트: eric011/nyan
 def __init__(self, prefix):
     """
     Load the dictionary and the tfidf, lda and esa models found under prefix.

     prefix is the common path prefix of the model files; the suffixes
     "_wordids.dict", "_tfidf.model", "_lda.model" and "_esa_on_lda.model"
     are appended to it to build the individual file names.
     """
     # Pass prefix as a logging argument so the message is only formatted
     # when the record is actually emitted.
     logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s", prefix)
     self.dictionary = corpora.Dictionary.load(prefix + "_wordids.dict")
     self.tfidf_model = models.TfidfModel.load(prefix + "_tfidf.model")
     self.lda_model = models.LdaModel.load(prefix + "_lda.model")
     self.esa_model = EsaModel.load(prefix + "_esa_on_lda.model")
예제 #3
0
파일: esa_sample.py 프로젝트: JOSMANC/nyan
 # Read the input document, then load the models and transform the document
 # step by step (tokens -> bag-of-words -> tfidf -> lda).
 logger.info("Load text file %s", options.text)

 try:
     # The lines keep their trailing newlines and are joined with a space,
     # exactly as before.
     with open(options.text, "r") as text_file:
         doc = " ".join(text_file.readlines())
 except Exception as error:
     # Script boundary: report why loading failed, then exit with an error.
     logger.error("Could not load document from %s: %s", options.text, error)
     sys.exit(1)

 # load dictionary, tfidf model, lda model, esa model
 logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s",
             options.prefix)
 dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict")
 tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model")
 lda_model = models.LdaModel.load(options.prefix + "_lda.model")
 esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model")

 # create list of tokens from doc
 logger.info("Lemmatize document.")
 tokens = utils.lemmatize(doc)

 # create bow of doc from token list
 logger.info("Create bag-of-words representation from document.")
 doc_bow = dictionary.doc2bow(tokens)

 # create tfidf representation from bag-of-words
 logger.info("Transform to tfidf.")
 doc_tfidf = tfidf_model[doc_bow]

 # create lda representation from tfidf
 logger.info("Transform to lda")