def __init__(self, prefix):
    """Set up the extractor by loading the persisted models stored under *prefix*.

    Loads the gensim dictionary, the tf-idf model and the ESA model; the
    LDA model is currently not loaded (kept commented out below).
    """
    Extractor.__init__(self)
    logger.info(
        "Load dictionary, tfidf model, lda model and esa model with prefix %s"
        % prefix)
    dictionary_path = prefix + "_wordids.dict"
    tfidf_path = prefix + "_tfidf.model"
    esa_path = prefix + "_esa1000_on_tfidf.model"
    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.tfidf_model = models.TfidfModel.load(tfidf_path)
    # LDA model is intentionally disabled for now:
    # self.lda_model = models.LdaModel.load(prefix + "_lda.model")
    self.esa_model = EsaModel.load(esa_path)
def load_esa_models(self):
    """Populate ``self.esas`` with one ESA model per active language."""
    logger.info("Loading ESA models")
    # Only Spanish is enabled at the moment; the remaining languages are
    # kept here, disabled, for reference:
    # 'en': EsaModel.load(self.prefix + self.target_prefix + "en_esa_on_tfidf.model"),
    # 'de': EsaModel.load(self.prefix + "de_esa_on_tfidf.model"),
    # 'nl': EsaModel.load(self.prefix + "nl_esa_on_tfidf.model")
    spanish_model = EsaModel.load(self.prefix + "es_esa_on_tfidf.model")
    self.esas = {'es': spanish_model}
def test_constructor_with_big_file_wikicorpus(self):
    """Train an EsaModel on the full wiki tf-idf corpus, save it and reload it.

    Reads the corpus, dictionary and article titles from /vagrant/data and
    writes the trained model to /vagrant/data/wiki_cesa.model.
    """
    # Load tf-idf corpus (matrix-market format).
    tfidf_corpus = MmCorpus('/vagrant/data/wiki_tfidf_corpus.mm')
    # LDA corpus is not used by this test:
    # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')
    # Token-id <-> word mapping.
    id2token = Dictionary.load("/vagrant/data/wiki_wordids.dict")
    # One title per wiki article (the ESA concept labels).
    document_titles = DocumentTitles.load("/vagrant/data/wiki_articles.txt")
    # Train the ESA model.
    esa_model = EsaModel(tfidf_corpus, num_clusters=15,
                         document_titles=document_titles,
                         num_features=len(id2token))
    # FIX: single-argument print() behaves identically on Python 2 and 3;
    # the bare Python-2-only "print x" statement is a SyntaxError on 3.
    print(esa_model)
    esa_model.save('/vagrant/data/wiki_cesa.model')
    # Round-trip: make sure the saved model can be loaded back.
    tmp_esa = EsaModel.load('/vagrant/data/wiki_cesa.model')
    print(tmp_esa)
# Script: train a small English ESA model on the tf-idf wiki corpus,
# persist it, and regenerate the wiki mappings afterwards.
from config import CONFIG
import os
from gensim import models
from gensim import corpora
from nyan.shared_modules.feature_extractor.esa.esamodel import EsaModel
from nyan.shared_modules.feature_extractor.esa.document_titles import DocumentTitles
from wikiextract.mappings import generate_mappings

# Corpus language and number of ESA concepts to keep.
language = "en"
NUM_TOPICS = 2000

# Trained tf-idf model.
# NOTE(review): TF_IDF is loaded but not referenced again in this script —
# confirm whether it is needed or can be dropped.
TF_IDF_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf.model")
TF_IDF = models.TfidfModel.load(TF_IDF_PATH)

# Corpus already transformed to tf-idf vectors, in matrix-market format.
TF_IDF_CORPUS_PATH = os.path.join(CONFIG['prefix'], language + "_tfidf_corpus.mm")
MM_TF_IDF = corpora.MmCorpus(TF_IDF_CORPUS_PATH)

# Article titles, one per corpus document (the ESA concept labels).
ARTICLES_PATH = os.path.join(CONFIG['prefix'], language + "_articles.txt")
ARTICLE_TITLES = DocumentTitles.load(ARTICLES_PATH)

# Train the ESA model, save it, then reload it from disk (round-trip).
SMALL_EN_ESA_PATH = os.path.join(CONFIG['prefix'],
                                 "en_esa%d_on_tfidf.model" % NUM_TOPICS)
SMALL_EN_ESA = EsaModel(MM_TF_IDF, document_titles=ARTICLE_TITLES,
                        num_features=50000, num_concepts=NUM_TOPICS,
                        lang='en')
SMALL_EN_ESA.save(SMALL_EN_ESA_PATH)
SMALL_EN_ESA = EsaModel.load(SMALL_EN_ESA_PATH)

# Regenerate mappings
generate_mappings()
def run(self):
    """Train the LDA model on the tf-idf corpus and build an ESA model on top.

    Reads ``<self.prefix>_wordids.dict`` and ``<self.prefix>_tfidf_corpus.mm``
    (produced by the earlier corpus-cleaning / bag-of-words / tf-idf steps,
    which were previously present here as commented-out code), then writes
    ``<self.prefix>_lda.model``, ``<self.prefix>_lda_corpus.mm`` and
    ``<self.prefix>_esa_on_lda.model``.
    """
    self.logger.info("Starting...")

    # Token-id <-> word mapping built by the preprocessing step.
    id2token = corpora.Dictionary.load(self.prefix + "_wordids.dict")

    # Corpus already transformed to tf-idf vectors (matrix-market format).
    mm_tfidf = corpora.MmCorpus(self.prefix + '_tfidf_corpus.mm')

    # --- LDA model creation ---
    # Online training: update every chunk, two passes over the corpus.
    lda = models.LdaModel(corpus=mm_tfidf, id2word=id2token,
                          num_topics=NUM_TOPICS, update_every=1,
                          chunksize=10000, passes=2)
    lda.save(self.prefix + '_lda.model')

    # Save the LDA-transformed corpus in matrix-market format.
    # BUG FIX: the original used the stale script-level ``options.prefix``
    # here (and in the two loads below), which is undefined in this method;
    # use ``self.prefix`` consistently, as the rest of the method does.
    corpora.MmCorpus.serialize(self.prefix + '_lda_corpus.mm', lda[mm_tfidf],
                               progress_cnt=10000)
    mm_lda = corpora.MmCorpus(self.prefix + '_lda_corpus.mm')

    # --- ESA model creation ---
    # One title per corpus document: these become the ESA concept labels.
    article_titles = DocumentTitles.load(self.prefix + "_articles.txt")
    esa = EsaModel(mm_lda, num_clusters=10000,
                   document_titles=article_titles,
                   num_features=NUM_TOPICS)
    esa.save(self.prefix + "_esa_on_lda.model")

    self.logger.info("finished transforming")