def test_constructor_with_file_wikicorpus(self):
    """Train an EsaModel from the on-disk test dump, save it and load it back.

    The test performs no assertions; it prints the freshly trained model
    and the reloaded model so the two can be compared by eye.
    """
    # Inputs: the tf-idf corpus, the word-id dictionary and the article titles.
    # (The LDA corpus from the same dump is not loaded by this test.)
    corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')
    vocabulary = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")
    titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    # Train the ESA model; the feature count is the dictionary size.
    vocabulary_size = len(vocabulary)
    trained = EsaModel(
        corpus,
        num_clusters=15,
        document_titles=titles,
        num_features=vocabulary_size,
    )
    # A single parenthesized argument prints the same text under the
    # Python 2 print statement.
    print(trained)

    # Round-trip the model through the same file.
    model_path = '/media/sdc1/test_dump/result/wiki_esa.model'
    trained.save(model_path)
    reloaded = EsaModel.load(model_path)
    print(reloaded)
def __init__(self, prefix):
    """Load the dictionary and the tf-idf, LDA and ESA models.

    prefix -- common path prefix of the stored artifacts. The suffixes
              "_wordids.dict", "_tfidf.model", "_lda.model" and
              "_esa_on_lda.model" are appended to it to locate each file.
    """
    message = "Load dictionary, tfidf model, lda model and esa model with prefix %s" % prefix
    logger.info(message)

    # Derive every artifact path from the shared prefix.
    dictionary_path = prefix + "_wordids.dict"
    tfidf_path = prefix + "_tfidf.model"
    lda_path = prefix + "_lda.model"
    esa_path = prefix + "_esa_on_lda.model"

    # Load in the same order as the paths above.
    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.tfidf_model = models.TfidfModel.load(tfidf_path)
    self.lda_model = models.LdaModel.load(lda_path)
    self.esa_model = EsaModel.load(esa_path)
# Step 1: read the whole input document into one string. The lines returned
# by readlines() are joined with a single space.
logger.info("Load text file %s" % options.text)
try:
    with open(options.text, "r") as handle:
        doc = " ".join(handle.readlines())
except Exception:
    # Any failure while reading is fatal: report the path and exit with status 1.
    logger.error("Could not load document from %s" % options.text)
    sys.exit(1)

# Step 2: load the dictionary and the tf-idf, LDA and ESA models; all of them
# are stored under the same path prefix.
logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s" % options.prefix)
dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict")
tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model")
lda_model = models.LdaModel.load(options.prefix + "_lda.model")
esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model")

# Step 3: lemmatize the raw text into a list of tokens.
logger.info("Lemmatize document.")
tokens = utils.lemmatize(doc)

# Step 4: map the tokens to a bag-of-words vector via the dictionary.
logger.info("Create bag-of-words representation from document.")
doc_bow = dictionary.doc2bow(tokens)

# Step 5: reweight the bag-of-words vector with the tf-idf model.
logger.info("Transform to tfidf.")
doc_tfidf = tfidf_model[doc_bow]

# Step 6: announce the LDA transformation.
logger.info("Transform to lda")