def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'
    for idioma in self.output()['langs'].keys():
        dicc_path = self.input()['dict']['langs'][idioma].path
        corp_path = self.input()['corp']['langs'][idioma].path
        print('==============================')
        print('Running LDA for %s at cleaning level %s' % (idioma, kind))
        print('==============================')
        # Load dictionary and corpus
        dicc = corpora.Dictionary.load(dicc_path)
        corpus = corpora.MmCorpus(corp_path)
        # Run LDA for this language, once per topic count
        for n_topics in self.output()['langs'][idioma].keys():
            print('Number of topics: ' + str(n_topics))
            if self.by_chunks:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics,
                               update_every=self.update_e,
                               chunksize=self.chunk_size,
                               passes=self.n_passes)
            else:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics,
                               passes=1)
            lda.save(self.output()['langs'][idioma][n_topics].path)
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus, num_topics=self._topics,
                             id2word=self._corpus.dictionary,
                             passes=self._passes)
        self._dictionary = self._corpus.dictionary
        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        return np.array([v for k, v in self._lda.__getitem__(
            self._dictionary.doc2bow(common.filter(words)), eps=0)])
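# A minimal, self-contained sketch of the load()/topic_vector() pattern used
# by the wrapper above, written against gensim directly; the file names here
# are hypothetical placeholders, not paths from the original project.
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

lda = LdaModel.load('lda.model')          # model saved by an earlier train()
dictionary = Dictionary.load('lda.dict')  # the matching vocabulary
bow = dictionary.doc2bow('already tokenized words'.split())
# minimum_probability=0 requests (effectively) every topic, so the result
# has a fixed length and can be treated as a dense topic vector
vector = [prob for _, prob
          in lda.get_document_topics(bow, minimum_probability=0)]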
def train_lda(recipe_file, num_topics, output_file):
    corpus = RecipeCorpus(recipe_file)
    corpora.MmCorpus.serialize(output_file + '.corpus.mm', corpus)
    lda = LdaModel(corpus, id2word=corpus.dictionary,
                   num_topics=int(num_topics), distributed=False)
    lda.save(output_file)
    return lda
def create_models(df):
    '''creates/saves two LDA models (one genre, one subgenre)
    in a folder called lda_models'''
    df = get_all_genres()  # note: the incoming df argument is replaced here
    id2word = corpora.Dictionary(df.genres)
    word2id = {v: k for k, v in id2word.items()}  # built but unused below
    corpus = [id2word.doc2bow(genres) for genres in df.genres]
    # captures subgenres with 50 categories
    subgenre_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=50,
                              random_state=100, update_every=1, passes=5,
                              alpha='auto', per_word_topics=True)
    # captures main genres with 10 categories
    genre_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=10,
                           random_state=100, update_every=1, passes=5,
                           alpha='auto', per_word_topics=True)
    subgenre_model.save('lda_models/subgenre.model')
    genre_model.save('lda_models/genre.model')
def trainModel():
    """ Train a model """
    if args.mode == 'Random':
        return args.topics, 0
    # need to train on dump
    files = [f"{args.input}/{f}" for f in os.listdir(args.input)
             if os.path.isfile(os.path.join(args.input, f))]
    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])
        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct
    if args.mode == 'loadLDA':
        return (LdaModel.load("./models/LDAdump.model"),
                Dictionary.load("./models/LDAdump.dct"))
class Model:
    def __init__(self, num_categories=20):
        self.num_categories = num_categories
        self.ldamodel = None

    def create_model(self, doc_matrix, term_dictionary, model_path,
                     save_model=True, language='language_na'):
        """
        Creates an LDA model based on a set of documents
        :param doc_matrix: bag-of-words corpus to train on
        :param term_dictionary: id -> term mapping for the corpus
        :param model_path: base directory for saved models
        :param save_model: whether to save the trained model to disk
        :param language: language tag used in the save path
        :return LDA model:
        """
        self.language = language
        start = time()
        self.ldamodel = LdaModel(doc_matrix, num_topics=self.num_categories,
                                 id2word=term_dictionary, passes=50)
        if save_model:
            self.save_model(model_path=os.path.join(
                model_path, 'models', self.language,
                '%s_%s_category_lda.model' % (language,
                                              str(self.num_categories))))
        logging.info('Training lasted: {:.2f}s'.format(time() - start))
        return self.ldamodel

    def load_model(self, model_path='lda.model'):
        """
        Loads a pretrained LDA model
        :param model_path: path to the saved model
        :return LDA model:
        """
        return LdaModel.load(model_path)

    def save_model(self, model_path):
        """
        Saves trained LDA model
        :param model_path: full path for the saved model file
        """
        if not os.path.isdir('models'):
            os.mkdir('models')
        if not os.path.isdir(os.path.join('models', self.language)):
            os.mkdir(os.path.join('models', self.language))
        self.ldamodel.save(model_path)
        logging.info("Model Saved")
def fetch_model(dictionary):
    corpus = my_corpus  # module-level corpus; the dictionary arg is unused here
    lda = LdaModel(corpus, num_topics=100, update_every=1,
                   chunksize=1000, passes=15)
    # lda = LdaModel(corpus, num_topics=100, id2word=dictionary,
    #                update_every=1, chunksize=1000, passes=50)
    lda.save('Topic/lda.loc')
    return lda
def main(vocab_file, inv_vocab_file, infiles):
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)
    # start with an empty model, then update it online, one corpus at a time
    lda = LdaModel(id2word=inv_vocab, num_topics=200)
    for f in infiles:
        tc = TweetCorpus(f, vocab)
        lda.update(tc)
    lda.save('topics.lda')
def cross_val_topics_50p(no_of_topics, corpus, dictionary, validation_50p):
    model = LdaModel(corpus, num_topics=no_of_topics, id2word=dictionary,
                     passes=50, alpha='auto', eval_every=2000)
    globals()['ldamodel_%dt_50p_autoalpha_val' % no_of_topics] = model
    model.save('lda_{}t_50p_autoalpha_val.model'.format(no_of_topics))
    validation_50p[no_of_topics] = model.print_topics(num_topics=-1,
                                                      num_words=50)
def perform_lda(x):
    texts = x.values.tolist()
    words = []
    for text in texts:
        words.append([w for w in text[0].split()])  # w avoids shadowing x
    dictionary = corpora.Dictionary(words)
    corpus = [dictionary.doc2bow(word) for word in words]
    ldamodel = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=50)
    ldamodel.save('lda.model')
    print(ldamodel.print_topics(num_topics=2, num_words=15))
    return corpus
def generate_lda_model():
    corpus = pre_process_data()
    dictionary = corpora.Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in corpus]
    model = LdaModel(bow_corpus, num_topics=10, id2word=dictionary, passes=3)
    model.save(lda_model_file)
    return model
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        # assumption: a sensible topic count grows with the log of corpus size
        num_topics = int(math.log(len(bow_corpus)) + 1)
        lda_model = LdaModel(bow_corpus, num_topics=num_topics,
                             id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
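# Quick illustration (not from the original code) of how the log heuristic
# above scales with corpus size; math.log is the natural log:
import math
for n_docs in (100, 1000, 10000, 100000):
    print(n_docs, int(math.log(n_docs) + 1))  # -> 5, 7, 10, 12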
def fetch_model(dictionary):
    print("Fetching LDA Model... ")
    try:
        lda = LdaModel.load('Topic/lda.tm')
        print("LDA Model loaded!")
    except IOError:
        print("Model not found, building LDA...")
        corpus = MyCorpus()
        # lda = LdaModel(corpus, num_topics=50, update_every=1,
        #                chunksize=1000, passes=15)
        lda = LdaModel(corpus, num_topics=50, id2word=dictionary,
                       update_every=1, chunksize=1000, passes=50)
        print("LDA Built!")
        lda.save('Topic/lda.tm')
    return lda
def train_LDA_model(ntopics, dictionary, doc_term_matrix, output_path=None):
    '''Trains an LDA model with the given number of topics and, optionally,
    saves it to output_path.'''
    print(f"Training LDA model with {ntopics} topics")
    model = LdaModel(doc_term_matrix, num_topics=ntopics, id2word=dictionary,
                     passes=20, eval_every=1, iterations=50)
    if output_path:
        print(f"Saving in {output_path}")
        model.save(output_path)
    return model
def gensim_lda(d):
    from gensim import corpora, models
    from gensim.models.ldamodel import LdaModel
    list_doc = []
    for i in range(0, len(d)):
        list_doc = list_doc + d[i]
    dictionary = corpora.Dictionary(list_doc)
    # start with an untrained model and feed it one batch at a time
    model = LdaModel(num_topics=20, id2word=dictionary)
    for i in range(0, len(d)):
        print('Generating corpus and updating model', i)
        corpus = [dictionary.doc2bow(doc) for doc in d[i]]
        model.update(corpus)
    model.save('model_20')
    print(model.show_topics(num_topics=20, num_words=10))
def lda_model(tokenized_corpus, vec_size):
    """Create (or load a cached) LDA model.

    Args:
        tokenized_corpus: tokenized documents
        vec_size: number of topics (vector size)
    """
    dictionary = corpora.HashDictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    temp_file = os.path.join(NLP_PATH, "lda_model")
    if os.path.isfile(temp_file):
        LDA = LdaModel.load(temp_file)  # LdaMulticore.load(temp_file)
    else:
        LDA = LdaModel(corpus, id2word=dictionary, num_topics=vec_size)
        LDA.save(temp_file)
    return LDA, corpus
def build_all():
    """ build and save a bunch of models to evaluate """
    with sqlite3.connect('../database/chat.db') as conn:
        # get vocabulary
        MIN_OCCURRENCE = 100
        vocab = Dictionary([
            pd.read_sql(
                'select word from words where freq >= {}'.format(
                    MIN_OCCURRENCE), conn)['word'].tolist()
        ])
        # models for different number of topics
        N_EPOCHS = 10
        for n_topics in range(5, 26, 5):
            # one model per each aggregation style
            for style, sql in zip(['basic', 'user', 'user_day_room'],
                                  get_model_sql()):
                # init model
                lda_model = LdaModel(id2word=vocab, num_topics=n_topics,
                                     alpha='auto', per_word_topics=True)
                # do training
                print('training model_{0}_{1}'.format(style, n_topics))
                for epoch in range(N_EPOCHS):
                    print('\tepoch', epoch, '...', end='\r')
                    for chunk in pd.read_sql(sql, conn, chunksize=10000):
                        chunk_corpora = [
                            vocab.doc2bow(text)
                            for text in chunk['lemma'].str.split(' ').tolist()
                        ]
                        lda_model.update(chunk_corpora)
                    print('\tepoch', epoch, '... done!')
                # Save model to disk.
                lda_model.save("saved_models/model_{0}_{1}".format(
                    style, n_topics))
def ldamodel(doc_clean, n_topics, n_words, description,
             tfidfmodel=False, unseen_docs=None):
    doc_clean = [min_char(doc).split() for doc in doc_clean]
    dictionary = corpora.Dictionary(doc_clean)
    # Convert the list of documents (corpus) into a document-term matrix
    # using the dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    compute_coherence_values(dictionary=dictionary, corpus=corpus,
                             texts=doc_clean, start=2, limit=40, step=6)
    if tfidfmodel:
        tfidf = TfidfModel(corpus, id2word=dictionary, smartirs='ntc')
        corpus = tfidf[corpus]
    # num_topics was hard-coded to 16 in the original; use the n_topics
    # argument so the printing loop below stays in range
    ldamodel = LdaModel(corpus, num_topics=n_topics, id2word=dictionary,
                        random_state=1, passes=50, per_word_topics=True)
    print("# LDA topics")
    for i in range(0, n_topics):
        temp = ldamodel.show_topic(i, n_words)
        terms = []
        for term in temp:
            terms.append(term)
        print("Topic #" + str(i) + ": ",
              ", ".join([t + '*' + str(p) for t, p in terms]))
    print('Bound: ', ldamodel.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ', ldamodel.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for i, unseen_doc in enumerate(corpus_new):
            topic = None
            score = 0
            inference_doc = ldamodel[unseen_doc]
            print(unseen_docs[i])
            for index, tmpScore in inference_doc[0]:
                if tmpScore > score:
                    score = tmpScore
                    topic = ldamodel.print_topic(index, 5)
            print("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is",
              ldamodel.log_perplexity(corpus_new))
    print_result(ldamodel, doc_clean, corpus, n_topics, description)
    pickle.dump(corpus, open(description + '.pkl', 'wb'))
    dictionary.save(description + 'dictionary.gensim')
    ldamodel.save(description + '_ldamodel.gensim')
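# compute_coherence_values() is called above but never defined in this
# snippet; the following is a minimal sketch of what such a helper usually
# does (an assumption about the missing code, not the original
# implementation):
from gensim.models import CoherenceModel, LdaModel

def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    for k in range(start, limit, step):
        m = LdaModel(corpus, num_topics=k, id2word=dictionary, random_state=1)
        cm = CoherenceModel(model=m, texts=texts, dictionary=dictionary,
                            coherence='c_v')
        print(k, cm.get_coherence())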
def main(argv):
    if len(argv) < 4:
        print('python train_lda.py group_id num_topics passes')
        sys.exit(1)
    group_id = argv[1]
    num_topics = int(argv[2])
    passes = int(argv[3])
    log.info('Prepare corpus for group: %s' % group_id)
    base_path = 'tables/' + group_id + '/'
    model_base_path = 'ldamodels/' + group_id + '/'
    # build dict and corpus
    indicator = 'title-comment'
    source_path = base_path + 'corpus-topic-comment'
    corpus_path = model_base_path + 'corpus-' + indicator + '-' + group_id + '.mm'
    dict_path = model_base_path + 'dict-' + indicator + '-' + group_id + '.dict'
    log.info('Building the dict...')
    build_dict_corpus(source_path, corpus_path, dict_path)
    log.info('Loading dict from pre-saved file...')
    dictionary = corpora.Dictionary.load(dict_path)
    log.info('Done')
    # dictionary.save_as_text(base_path + 'text-dict.txt')
    log.info('Build a lda model...')
    log.info('Loading corpus from pre-saved .mm file...')
    mmcorpus = corpora.MmCorpus(corpus_path)
    log.info('Done')
    log.info('Training lda model...')
    model = LdaModel(mmcorpus, num_topics=num_topics, id2word=dictionary,
                     passes=passes)
    model_path = model_base_path + indicator + '-' + group_id + '.ldamodel'
    model.save(model_path)
    log.info('Done.')
    model = LdaModel.load(model_path)
    # note: older gensim used show_topics(topics=...); current API is num_topics
    model.show_topics(num_topics=num_topics, num_words=10, log=True)
def generate_model():
    np.set_printoptions(precision=2)
    corpus = []
    corpus += load_expo_cdc()
    corpus += load_lago()
    corpus += load_news()
    corpus += load_news_ic()
    corpus += load_palestras()
    corpus = preprocessing(corpus)
    dictionary = corpora.Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in corpus]
    dictionary.save(DICT)
    corpora.MmCorpus.serialize(BOW_CORPUS, bow_corpus)
    # replicate the corpus 8x so the classifier below sees more samples
    bow2 = np.concatenate((bow_corpus, bow_corpus), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    TOPICS = 20
    model = LdaModel(bow2, id2word=dictionary, num_topics=TOPICS,
                     iterations=100, passes=15)
    model.save(MODEL)
    lda_corpus = [model[vector] for vector in bow2]
    lda_dense = gensim.matutils.corpus2dense(lda_corpus,
                                             num_terms=TOPICS).transpose()
    """
    tfidf = models.TfidfModel(bow_corpus)
    tfidf_corpus = [tfidf[vector] for vector in bow_corpus]
    tfidf_dense = gensim.matutils.corpus2dense(
        tfidf_corpus, num_terms=len(dictionary)).transpose()
    """
    classifier = LogisticRegression()
    labels = load_labels()
    # replicate the labels 8x to match the replicated corpus
    labels2 = labels
    labels2 += labels2
    labels2 += labels2
    labels2 += labels2
    classifier.fit(lda_dense, labels2)
    joblib.dump(classifier, CLASSIFIER, compress=9)
    # print "LDA results"
    probs = classifier.predict_proba(lda_dense)
def SNAP_generateLDAForTopic(self, topic, numTopics=5):
    if topic == 'all':
        topics = ['syria', 'ufo', 'movie', 'celebrity', 'russia']  # bieber, cyrus
        for t in topics:
            for nt in [5, 10]:
                self.SNAP_generateLDAForTopic(t, nt)
        return
    id2word = self.SNAP_id2word()
    mmPath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          'snap_data',
                          "gensim_snap_mmcorpus_%s.mm" % topic)
    outPath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'snap_data',
                           "gensim_snap_lda_%s_%d" % (topic, numTopics))
    mm = MmCorpus(mmPath)
    lda = LdaModel(corpus=mm, id2word=id2word, num_topics=numTopics,
                   update_every=1, chunksize=10000, passes=1)
    lda.save(outPath)
    return
def build_lda_model(corpus, dictionary, num_topics=10):
    file_name = None
    if corpus is None:
        corpus = get_corpus()
    if dictionary is None:
        dictionary = get_dictionary()
    if num_topics == 10:
        file_name = LDA_FILE_10
    elif num_topics == 30:
        file_name = LDA_FILE_30
    elif num_topics == 60:
        file_name = LDA_FILE_60
    elif num_topics == 120:
        file_name = LDA_FILE_120
    else:
        raise ValueError("bad number of topics")
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                   update_every=1, chunksize=100, passes=1)
    lda.save(file_name)
    for topic in range(10):
        print("Topic {0}: {1}".format(topic, lda.print_topic(topic)))
    return lda
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3)
                          for j in range(5)]
        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()
        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(
                    fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')
        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)
        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')
        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary,
                       num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')
        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)
    return lda, dictionary
for idx, doc in enumerate(allmydocs):
    # if idx > num_docs:
    #     break
    doc = doc.lower()
    # note: the original pattern was ' |, |\n|: |(|)' — the unescaped parens
    # formed an empty capture group; they are escaped here
    doc = re.split(r' |, |\n|: |\(|\)', doc)
    doc = [elt for elt in doc if elt is not None]
    tokens = []
    for words in doc:
        cleaned = ''.join([i for i in words if i.isalpha()])
        if cleaned not in stop_words and 2 < len(cleaned):
            tokens.append(cleaned)
    cleaned_docs.append(tokens[:])

# Create a corpus from a list of texts
common_dictionary = Dictionary(cleaned_docs)
common_corpus = [common_dictionary.doc2bow(text) for text in cleaned_docs]
random.shuffle(common_corpus)
train = common_corpus[:int(len(common_corpus) * 0.8)]
test = common_corpus[int(len(common_corpus) * 0.8):]
# note: the model is trained and evaluated on the full corpus, not the split
lda = LdaModel(common_corpus, num_topics=25, iterations=10000, eval_every=2,
               chunksize=10000, passes=10)
perplex = lda.log_perplexity(common_corpus)
print('perplex', perplex)

# Save model to disk.
temp_file = datapath("model")
lda.save(temp_file)
class CustomLda(object):
    def __init__(self, data=None, dictionary=None):
        """initialize; data should be provided, except when unpickling a
        class object, where it is not needed"""
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text)
                           for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        self.distributed = None
        self.chunksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self, num_topics, iterations=1500, random_state=1,
              distributed=False, chunksize=2000, passes=1, update_every=1,
              alpha='symmetric', eta=None, decay=0.5, offset=1.0,
              eval_every=10, gamma_threshold=0.001, minimum_probability=0.01,
              ns_conf=None, minimum_phi_value=0.01, per_word_topics=False,
              workers=1):
        """train lda model; if workers > 1, goes multicore"""
        self.distributed = distributed
        self.chunksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers
        if self.workers > 1:
            # note: the original hard-coded workers=3 here despite taking a
            # workers argument; self.workers is used instead
            self.model = LdaMulticore(
                workers=self.workers,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,
                # distributed=self.distributed,
                chunksize=self.chunksize,
                passes=self.passes,
                # update_every=self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                # ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,
                distributed=self.distributed,
                chunksize=self.chunksize,
                passes=self.passes,
                update_every=self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """build a coherence model on top of the trained model;
        type can be 'u_mass' or 'c_v'"""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()

    def get_coherence(self, coherence_type='u_mass'):
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        return self.model.get_topic_terms(num, topn=topn)

    def get_perplexity(self):
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model, self.corpus,
                                       self.dictionary, sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def unpickle(filename):
        """read class instance from file"""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """predict the topic of each document in a list of strings"""
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
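# Illustrative usage of CustomLda (toy data; all names below are
# placeholders, not part of the original project):
docs = [['human', 'computer', 'interface'],
        ['graph', 'trees', 'minors'],
        ['human', 'trees', 'computer']]
lda = CustomLda(data=docs)
lda.train(num_topics=2, iterations=50, passes=5)
print(lda.get_coherence('u_mass'))
lda.save_lda('custom_lda.model')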
from gensim.models.ldaseqmodel import LdaSeqModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk import word_tokenize
from tqdm import tqdm
from csv import DictReader
from collections import defaultdict
import pprint

pp = pprint.PrettyPrinter(indent=4)

id2word = corpora.Dictionary.load('tokens.dict')
mm = corpora.MmCorpus('messages.mm')
# note: despite the name, this trains a plain LdaModel, not an LdaSeqModel
ldaseq = LdaModel(corpus=mm, id2word=id2word, num_topics=15)
pp.pprint(ldaseq.print_topics())
ldaseq.save("lda_model")
import logging

import gensim
from gensim.test.utils import datapath
from gensim.models.ldamodel import LdaModel

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

id2word = gensim.corpora.Dictionary.load_from_text(
    'WikiCorpus/wiki_wordids.txt')
mm = gensim.corpora.MmCorpus('WikiCorpus/wiki_tfidf.mm')
print(mm)
lda = LdaModel(corpus=mm, id2word=id2word, num_topics=130,
               update_every=1, passes=1)
model_location = datapath("D:/HazMat/Projects/ML/Models/model_130")
lda.save(model_location)
class Topics:
    def __init__(self, data, num_topics, iterations):
        '''initializes a Topics instance and tokenizes the input document'''
        # self.filename = filename[0:-4]
        # if not os.path.exists(self.output_directory):
        #     os.makedirs(self.output_directory)
        self.wholeDocument = data
        if isinstance(data, dict):
            self.Document = data
            self.text = ''
            for key in data:
                self.text += data[key] + ' '
        else:
            self.text = data
        self.languageModel = spacy.load('en')
        self.lemmatizer = WordNetLemmatizer()
        self.en_stop = set(nltk.corpus.stopwords.words('english')).union(
            set(['the', 'The']))
        self.num_topics = num_topics
        self.iterations = iterations
        self.tokenize()
        self.vocab_size = len(self.document)

    def tokenize(self):
        '''collects all the sentences in the document after:
        1- lemmatizing every word
        2- removing stop words
        3- removing short words
        '''
        self.document = self.languageModel(self.text)
        self.sentences = []
        for sent in self.document.sents:
            sentence = [
                self.lemmatizer.lemmatize(word.lower())
                for word in sent.text.split()
                if word.lower() not in self.en_stop and len(word) >= 3
            ]
            self.sentences.append(sentence)

    def tokenize_testDoc(self, testDoc):
        '''returns all the sentences in a test document,
        tokenized the same way as tokenize()'''
        text = testDoc
        document = self.languageModel(text)
        sentences = []
        for sent in document.sents:
            sentence = [
                self.lemmatizer.lemmatize(word.lower())
                for word in sent.text.split()
                if word.lower() not in self.en_stop and len(word) > 3
            ]
            sentences.append(sentence)
        return sentences

    def get_model(self):
        '''builds an LDA model for the whole document being summarized
        and prints it'''
        dictionary = corpora.Dictionary(self.sentences)
        corpus = [dictionary.doc2bow(sentence) for sentence in self.sentences]
        self.ldamodel = LdaModel(corpus, num_topics=self.num_topics,
                                 id2word=dictionary, passes=self.iterations,
                                 random_state=0)
        self.print_model()

    def print_model(self):
        '''collects topic distributions over all words in the original
        document into a dict'''
        topics = self.ldamodel.print_topics(num_words=self.vocab_size)
        self.topics_dict = {}
        for topic in topics:
            topic_nb = topic[0]
            distribution = topic[1].split('+')
            topic_dict = {}
            for word_prob in distribution:
                word_prob_ = word_prob.split('*')
                if len(word_prob_) > 1:
                    topic_dict[word_prob_[1][1:-2]] = float(word_prob_[0])
            self.topics_dict[topic_nb] = topic_dict

    def save_model(self):
        '''saves an LDA model
        (note: output_directory and filename must be set elsewhere)'''
        self.ldamodel.save(self.output_directory + self.filename +
                           '_ldaModel.gensim')

    def load_model(self):
        '''loads an already existing LDA model'''
        self.ldamodel = LdaModel.load(self.output_directory + self.filename +
                                      '_ldaModel.gensim')

    def get_topic_dist(self, doc):
        '''gets the topic distribution for every paragraph of the
        original document'''
        paragraph = self.tokenize_testDoc(doc)
        # reuse the dictionary built from the original document; for our
        # case this dict should cover all the paragraphs
        dictionary = corpora.Dictionary(self.sentences)
        corpus = [dictionary.doc2bow(sentence) for sentence in paragraph]
        dists = []
        for i, cor in enumerate(corpus):
            dist = self.ldamodel[cor]
            dists.append(dist)
        return dists

    def get_coverage(self):
        '''finds how much each topic is covered in every paragraph
        of the document'''
        self.coverage = {}
        for paragraph in list(self.Document.keys()):
            if len(self.Document[paragraph]) != 0:
                self.coverage[paragraph] = self.get_paragraph_coverage(
                    paragraph)
            else:
                self.coverage[paragraph] = np.zeros(
                    [self.num_topics]).tolist()
        return self.coverage

    def get_paragraph_coverage(self, paragraph_key):
        '''finds topic coverage per paragraph by averaging over topic
        coverage per sentence'''
        paragraph = self.Document[paragraph_key]
        dists = self.get_topic_dist(paragraph)
        np_dist = []
        for dist in dists:
            np_sent_dist = []
            if len(dist) == self.num_topics:
                for tup in dist:
                    np_sent_dist.append(tup[1])
                np_dist.append(np_sent_dist)
            else:
                np_sent_dist = np.zeros([self.num_topics])
                for tup in dist:
                    np_sent_dist[int(tup[0])] = tup[1]
                np_dist.append(np_sent_dist.tolist())
        paragraph_coverage = np.average(np.array(np_dist), axis=0)
        return paragraph_coverage.tolist()
run_id = "ldaU_K{K}_a{alpha_frac}-K_b{beta}_iter{iter}.gensim".format(
    K=num_topics, alpha_frac=alpha_frac, beta=beta, iter=num_iterations)
print(run_id)
output_file = output_file_template.format(run_id=run_id)

# Train and save
print('Training...')
model = LdaModel(corpus,
                 alpha=alpha, eta=beta,
                 id2word=dictionary, num_topics=num_topics,
                 iterations=num_iterations)
# model = LdaMulticore(corpus,
#                      alpha=alpha, eta=beta,
#                      id2word=dictionary, num_topics=num_topics,
#                      iterations=num_iterations, workers=2)
print('Done training.')
model.save(output_file)

# Print top 10 words in topics, if desired
if print_topics:
    topics = model.show_topics(num_topics=100, formatted=False)
    for topic in topics:
        for tup in topic[1]:
            print(tup[0] + ": " + str(tup[1]))
        print('\n')

# Evaluate perplexity: log_perplexity returns the per-word bound,
# so perplexity is 2^(-bound)
ll = model.log_perplexity(test_corpus)
print("LL: " + str(ll))
print("Perp: " + str(np.exp2(-ll)))
def train_lda(corpus, vocab_dict, n_topics, n_iter, save_model):
    lda = LdaModel(corpus, num_topics=n_topics, id2word=vocab_dict,
                   passes=n_iter, minimum_probability=1e-3)
    lda.save(save_model)
    return lda
data = pd.read_csv('nyt.csv')
text_clean = []
for text in data['News_content']:
    text_clean.append(pptext(text).split())
print(text_clean[:3])

dictionary = Dictionary(text_clean)
corpus = [dictionary.doc2bow(text) for text in text_clean]
pickle.dump(corpus, open('topicModels//corpus2.pkl', 'wb'))
dictionary.save('topicModels//dictionary2.gensim')

ldamodel = LdaModel(corpus, num_topics=15, id2word=dictionary, passes=15)
ldamodel.save('topicModels//model15.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

# Visualization works only in a Jupyter Notebook
# (start with `jupyter notebook` or `jupyter console`)
dictionary = Dictionary.load('topicModels//dictionary2.gensim')
corpus = pickle.load(open('topicModels//corpus2.pkl', 'rb'))
ldamd = LdaModel.load('topicModels//model15.gensim')
lda_display = pyLDAvis.gensim.prepare(ldamd, corpus, dictionary,
                                      sort_topics=False)
pyLDAvis.display(lda_display)
dictionary = gensim.corpora.Dictionary.load_from_text(
    bz2file.BZ2File('wiki_wordids.txt.bz2'))
corpus = gensim.corpora.MmCorpus('wiki_tfidf.mm')

with open('mcdi_word.csv', 'r') as f:  # 'rb' was Python 2; csv wants text mode
    reader = csv.reader(f)
    wlist = []
    for row in reader:
        wlist.append(row)

idlist = []
for row in wlist:
    idrow = []
    for key in dictionary.items():  # (token_id, token) pairs
        if key[1] in row:
            idrow.append(key[0])
    idlist.append(idrow)

a = 0.05
ntopic = 75
# boost the prior of the seed words in their topics
eta_arr = ones((ntopic, len(dictionary))) * 0.5  # needs: from numpy import ones
for x in range(0, len(idlist)):
    for id in idlist[x]:
        eta_arr[x, id] *= 100000

# note: the original computed eta_arr but never passed it to the model;
# presumably the intent was to seed the topics, i.e. eta=eta_arr
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=ntopic,
               eta=eta_arr)
topiclist = lda.print_topics(num_topics=75, num_words=50)
lda.save('wiki_file_75.model')
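# Optional spot check (a sketch, not in the original): with eta seeded as
# above, the boosted words should surface near the top of their topics.
for t in range(3):
    print(t, [w for w, _ in lda.show_topic(t, topn=10)])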
# result_stem = [stemmer.stem(item) for item in result_lower]
result_clean = [
    item for item in result_lower
    if '\'' not in item and '_' not in item and len(item) > 1
]
result = [item for item in result_clean if item not in sw]
filew.extend(result)
resultlist.append(filew)

print(resultlist[0])
dictionary = corpora.Dictionary(resultlist)
corpus = [dictionary.doc2bow(text) for text in resultlist]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50)
topiclist = lda.print_topics(num_topics=50, num_words=50)
lda.save('childs_file_50.model')
'''
aniP = lda.get_document_topics(animal)
print aniP
rlist = []
for topic in topiclist:
    rlist.append(topic[1].split("+"))
nlist = []
for n in rlist:
    xlist = []
    for k in n:
        k = k.split("*")
        k = k[1].encode('utf-8')
        k = k.split('\"')[1]
class TextProcessor:
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = \
            self.dictionary = None
        self.dictPath = conf.get_filename_via_tpl(
            'model', model_type='tfidf', n_users=n_users,
            n_samples=n_samples, model_filename='dict')
        self.tfIdfPath = conf.get_filename_via_tpl(
            'model', model_type='tfidf', n_users=n_users,
            n_samples=n_samples, model_filename='tfidf')
        self.lsiPath = conf.get_filename_via_tpl(
            'model', model_type='lsi', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='lsi_model')
        self.ldaPath = conf.get_filename_via_tpl(
            'model', model_type='lda', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='lda_model')
        self.w2vPath = conf.get_filename_via_tpl(
            'model', model_type='w2v', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='w2vmodel')
        self.w2vVecPath = conf.get_filename_via_tpl(
            'model', model_type='w2v', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None
            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)
            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error('The %s model doesn\'t exist. Please train the '
                         'model before loading it.' % model_type)
        finally:
            return model

    def tf_idf_transform(self, doc):
        """Perform tf-idf transformation on doc."""
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)
        conf.mk_dir(self.tfIdfPath)
        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)
        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)
        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl(
            'tfidf', n_users=self.nUsers, postfix='mm',
            n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.'
                    % (np.array(tfidf_corpus).shape, tfidf_corpus_path))
        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)
        self.lsiModel = LsiModel(corpus=corpus_tf_idf, num_topics=self.nDims,
                                 id2word=self.dictionary)
        conf.mk_dir(self.lsiPath)
        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)
        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl(
            'lsi', n_users=self.nUsers, n_samples=self.nSamples,
            n_dims=self.nDims, postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.'
                    % (np.array(lsi_corpus).shape, lsi_corpus_path))
        return lsi_corpus

    def lda_transform(self, corpus_tf_idf, train_separated=False,
                      is_update=False):
        """Init an lda model with nDims topics, fit it with corpus_tf_idf,
        and transform the corpus.

        :param corpus_tf_idf: corpus that has been transformed into a
            tf-idf matrix.
        :param train_separated: whether to train on the whole corpus at
            once or on parts of it separately.
        :param is_update: whether to construct a new model or update an
            existing one.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)
        if is_update:
            # An ldaModel was trained before; update it with more corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]
        if train_separated:
            # corpus = []
            # spacing = 10000
            # for i in range(int(len(corpus_tf_idf) / spacing)):
            #     corpus.append(corpus_tf_idf[i * spacing: i])
            # self.ldaModel = LdaModel()
            pass
        self.ldaModel = LdaModel(corpus=corpus_tf_idf, num_topics=self.nDims,
                                 id2word=self.dictionary)
        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('lda model has been saved in %s' % self.ldaPath)
        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl(
            'lda', n_users=self.nUsers, n_samples=self.nSamples,
            n_dims=self.nDims, postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.'
                    % (np.array(lda_corpus).shape, lda_corpus_path))
        return lda_corpus

    def w2v_transform(self, sentences):
        """Perform word2vec on texts and obtain a w2v model.

        :param sentences: each item is the list of words of one text.
        :return: w2v model.
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)
        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)
        # Construct a w2v corpus by summing the word vectors of each sentence
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(map(lambda m, n: m + n, vec,
                                   self.w2vModel[word]))
            w2v_corpus.append(vec)
        w2v_corpus_path = conf.get_filename_via_tpl(
            'w2v', n_users=self.nUsers, n_samples=self.nSamples,
            n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)
        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s.' % w2v_corpus_path)
        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(conf.get_filename_via_tpl(
                    'tfidf', n_users=self.nUsers, postfix='mm',
                    n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(conf.get_filename_via_tpl(
                    model_type, n_users=self.nUsers, n_samples=self.nSamples,
                    n_dims=self.nDims, postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(
                    conf.get_filename_via_tpl(model_type,
                                              n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims),
                    dtype=np.float, delimiter=',')
            logger.info('%s corpus with a shape of %s has been loaded.'
                        % (model_type, np.array(corpus).shape))
            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus, self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float)
        return data
if len(sys.argv) != 2:
    print('Usage: {0} rcv1_data_dir'.format(sys.argv[0]))
    raise SystemExit(1)

data_dir = sys.argv[1]
mapping_file = data_dir + '/token_id_idf'
dictionary_file = data_dir + '/id_token_df'
token_file = data_dir + '/tokens'
lda_file = data_dir + '/lda_model'

print('creating dictionary...')
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file, dictionary_file, N)
dictionary = Dictionary.load_from_text(dictionary_file)

print('creating corpus...')
corpus = SimpleLowCorpus(token_file, dictionary)

print('training model...')
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus, id2word=dictionary, num_topics=200)
print('done!')
print('\n' * 3)
print('======final topics======')
# note: older gensim used show_topics(topics=..., topn=...); the current
# API is num_topics/num_words
topics = lda.show_topics(num_topics=-1, num_words=4)
for i, topic in enumerate(topics):
    print(i, topic)
print('saving model...')
lda.save(lda_file)
from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel  # was missing in the original
import numpy as np
import time

corpusType = "sraa2_"
subDirectory = 'run_sraa'

t1 = time.time()
corpus = corpora.MmCorpus(subDirectory + '/' + corpusType + 'corpus.mm')
dictionary = corpora.dictionary.Dictionary.load(
    subDirectory + '/' + corpusType + 'dictionary.dict')
classes = np.loadtxt(subDirectory + '/' + corpusType + 'classes.dat',
                     dtype=int)
t2 = time.time()
print('data loaded ... seconds: ', t2 - t1)

ldaModel = LdaModel(corpus, num_topics=30, id2word=dictionary, passes=20)
ldaModel.save(subDirectory + '/' + corpusType + 'sraa.lda_model')
t3 = time.time()
print('ldaModel is finished... seconds:', t3 - t2)

tfidfModel = models.TfidfModel(corpus)
tfidfModel.save(subDirectory + '/' + corpusType + 'sraa.tfidf_model')
t4 = time.time()
print('tfidfModel is finished... seconds:', t4 - t3)
def create_models(db, lsi_num_topics=10, lda_num_topics=5, num_bars=None):
    """Create and save an lsi object using data in the database.
    Save this object, along with the dictionary and the corpus, to disk.
    """
    bars = db['bars']
    # note: the original query listed 'foursquare.tips' twice in one dict,
    # so the first {'$exists': True} entry was silently overwritten;
    # the two conditions are merged here
    query = {'nymag.review': {'$ne': None},
             'foursquare.tips': {'$exists': True, '$ne': None}}
    if num_bars is None:
        locations = bars.find(query)
    else:
        locations = bars.find(query).limit(num_bars)
    ignorechars = '''!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'''
    stopwords = get_stopwords()
    texts = []
    bar_idx_map = {}
    idx_bar_map = {}
    save_directory = "assets/"

    print("Fetching texts from database and tokenizing")
    for idx, location in enumerate(locations):
        bar_name = location['nymag']['name']
        bar_idx_map[bar_name] = idx
        idx_bar_map[int(idx)] = bar_name
        text = create_string_from_database(location)
        tokens = tokenize_document(text, stopwords, ignorechars)
        texts.append(tokens)

    # Do some cleaning
    print("Cleaning texts")
    texts = remove_words_appearing_once(texts)

    # Create the counter
    word_counts = Counter()
    for text in texts:
        word_counts.update(text)

    # Create and save the dictionary
    print("Creating dictionary")
    dictionary = corpora.Dictionary(texts)
    dictionary.save(save_directory + 'keywords.dict')

    # Create and save the corpus
    print("Creating Corpus matrix")
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(save_directory + 'corpus.mm', corpus)

    # Term Frequency, Inverse Document Frequency
    print("Applying TFIDF")
    tfidf = models.TfidfModel(corpus)
    tfidf.save(save_directory + "tfidf.model")

    # Map TFIDF on the corpus
    print("Mapping TFIDF on corpus")
    corpus_tfidf = tfidf[corpus]
    corpora.MmCorpus.serialize(save_directory + 'corpus_tfidf.mm',
                               corpus_tfidf)

    # Create the LSI
    print("Creating LSI with %s topics" % lsi_num_topics)
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                          num_topics=lsi_num_topics)
    lsi.save(save_directory + 'lsi.model')

    # Map LSI on the corpus
    corpus_lsi_tfidf = lsi[corpus_tfidf]
    corpora.MmCorpus.serialize(save_directory + 'corpus_lsi_tfidf.mm',
                               corpus_lsi_tfidf)

    # Create the index
    # index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    index = similarities.MatrixSimilarity(corpus_lsi_tfidf)
    index.save(save_directory + 'lsi_tfidf.index')

    # Create the LDA (on the raw corpus)
    print("Creating LDA with %s topics" % lda_num_topics)
    lda = LdaModel(corpus, num_topics=lda_num_topics, id2word=dictionary,
                   update_every=0, passes=30)
    # lda.show_topics(10, 20, formatted=False)
    lda.save(save_directory + 'lda.model')

    # Create the lda corpus
    corpus_lda = lda[corpus]
    corpora.MmCorpus.serialize(save_directory + 'corpus_lda.mm', corpus_lda)

    # Save some additional info
    with open(save_directory + 'bar_idx_map.json', 'w') as fp:
        json.dump(bar_idx_map, fp)
    with open(save_directory + 'idx_bar_map.json', 'w') as fp:
        json.dump(idx_bar_map, fp)
    with open(save_directory + 'model_info.txt', 'w') as fp:
        ts = time.time()
        st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        fp.write("LSI Model %s\n" % st)
        info = "Number of Docs: %s\n" % len(corpus)
        info += "Number of key words: %s\n" % len(dictionary)
        info += "Number of LSI Topics: %s\n" % lsi_num_topics
        info += "Number of LDA Topics: %s\n" % lda_num_topics
        fp.write(info)
        # Print Stop Words
        words_per_row = 10
        fp.write("\n\nStop Words:\n")
        for i, word in enumerate(stopwords):
            if i % words_per_row == 0:
                fp.write('\n')
            fp.write("%s " % word)
        # Print Corpus
        fp.write("\n\nWords Encountered:\n")
        for word, count in word_counts.most_common():
            idx = dictionary.token2id[word]
            line = "%s (index=%s) freq: %s \n" % (word, idx, count)
            fp.write(line)
# filename = 'data/lkmlLinusAll.txt'
with open(filename, encoding='utf-8') as f:
    documents = f.readlines()
texts = [[word for word in document.lower().split()
          if word not in STOPWORDS and word.isalnum()]
         for document in documents]

dictionary = corpora.Dictionary(texts)
dictionary.save('lkml.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('lkml.mm', corpus)

lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics,
               passes=passes)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=num_words))
lda.save('lkml.gensim')
# newlda = LdaModel.load('lkml.gensim', mmap='r')
# pp.pprint(lda.print_topics(num_words=num_words))

unseenText = 'data/lkmlSingleNewEmail.txt'
with open(unseenText, encoding='utf-8') as fnew:
    newdoc = fnew.read()
newcorpus = dictionary.doc2bow(newword for newword in newdoc.lower().split()
                               if newword not in STOPWORDS
                               and newword.isalnum())
pp.pprint(lda[newcorpus])
# How much does each bill fall under each topic?
def get_bill_topics(model):
    topics = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        # Proportion that document falls into each topic
        topics = pd.concat(
            [topics, pd.DataFrame(model[corpus][i]).set_index(0)], axis=1)
    topics = topics.transpose().reset_index(drop=True)
    # Integer index of dominant topic
    dominant_topic = topics.idxmax(axis=1).rename('dominant_topic')
    # Percentage that document represents dominant topic
    max_perc = topics.max(axis=1, skipna=True).rename('max_perc')
    return pd.concat([
        bills[['session', 'bill_id', 'title', 'text']],
        dominant_topic, max_perc, topics
    ], axis=1)

bill_topics = get_bill_topics(lda_model)
print('Exporting topic model...')
lda_model.save('models/topic_model_' + subject.lower())
print('Done!')
def make_lda_result():
    lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary,
                   passes=2, iterations=1000)
    # save LDA result
    lda.save(LDA_FILE)
def get_params(files):
    print("Converting data to features...")
    tweets = map(lambda f: open(f).read(), files)  # itertools.imap in Python 2
    features = [to_features(tweet) for tweet in tweets]
    # features = json.load(open("models/lda_features.json"))
    print("Converting features to bag of words...")
    dictionary = corpora.Dictionary(features)
    corpus = [dictionary.doc2bow(text) for text in features]
    # corpus = json.load(open("models/lda_corpus.json"))
    return corpus, features, dictionary


if __name__ == "__main__":
    print("Loading file names...")
    files = glob.glob("../tweets/*")
    corpus, features, dictionary = get_params(files)
    print("Creating LDA Model...")
    lda = LdaModel(corpus, id2word=dictionary, num_topics=30,
                   iterations=1000, alpha='auto', chunksize=50)
    lda_topic_distribution = [l for l in lda[corpus]]
    print("Saving model...")
    lda.save("lda_model_unigrams.dat")
    print("Saving distribution...")
    with open("lda_topic_distribution.json", 'w') as f:
        json.dump(lda_topic_distribution, f)
for text in newTexts:
    for token in text:
        newfrequency[token] += 1

# In[87]:
logger.info("Generating topics from LDA")
num_topics = 100
model = LdaModel(num_topics=num_topics, corpus=corpus, id2word=dictionary,
                 iterations=1500)
# model = LdaMulticore(num_topics=100, workers=3, corpus=corpus,
#                      id2word=dictionary, iterations=3000)
# model = HdpModel(corpus=corpus, id2word=dictionary)

# In[94]:
model.save('cache/model.pkl')

# In[96]:
cursor = db.movies.find({}, {"movieId": 1})
movieId = []
for doc in cursor:
    movieId.append(doc['movieId'])

# In[97]:
movieDict = {}
for i, val in enumerate(movieId):
    movieDict[val] = newTexts[i]
def train_lda(path):
    # Reference pattern from the gensim docs:
    # from gensim.test.utils import common_texts
    # from gensim.corpora.dictionary import Dictionary
    # from gensim.models.ldamodel import LdaModel
    # from gensim.test.utils import datapath
    # common_dictionary = Dictionary(common_texts)
    # common_corpus = [common_dictionary.doc2bow(text)
    #                  for text in common_texts]
    # lda = LdaModel(common_corpus, num_topics=5)
    # temp_file = datapath(path)
    # lda.save(temp_file)
    # lda = LdaModel.load(temp_file)
    documents = [
        "Amazon sells many things ",
        "Apple is releasing a new product ",
        "Microsoft announces Nokia acquisition ",
        'Julie loves me more than Linda loves me ',
        'Jane likes me more than Julie loves me'
    ]
    documents = [rm_special_chars(s) for s in documents]
    stoplist = [
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
        'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am',
        'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
        'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through',
        "don't", 'nor', 'me', 'were', 'her', 'more', 'himself', 'this',
        'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up',
        'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any',
        'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will',
        'on', 'does', 'yourselves', 'then', 'that', 'because', 'what',
        'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he',
        'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
        'which', 'those', 'i', 'after', 'few', 'whom', 'being', 'if',
        'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
        'further', 'was', 'here', 'than'
    ]
    texts = [[word for word in document.lower().split()
              if word not in stoplist]
             for document in documents]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    num_topics = 3
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                   update_every=1, random_state=100, chunksize=100,
                   passes=20, alpha='auto')
    temp_file = datapath(path + 'lda_model')
    lda.save(temp_file)
    lda = LdaModel.load(temp_file)
    dictionary.save(path + 'dict')
    return lda, dictionary
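# Illustrative follow-up (a sketch, not in the original): score an unseen
# document with the returned model. Assumes rm_special_chars from above and
# a writable path; the path and the query text are placeholders.
lda, dictionary = train_lda('/tmp/')
bow = dictionary.doc2bow('Apple launches a new product'.lower().split())
print(lda.get_document_topics(bow))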
word = term[1].strip().split()
data_list.append(word)

dic = corpora.Dictionary(data_list)  # construct dictionary
# for each text, a sparse bag-of-words vector
corpus = [dic.doc2bow(text) for text in data_list]
tfidf = models.TfidfModel(corpus)  # compute tf-idf statistics
corpus_tfidf = tfidf[corpus]  # tf-idf of each text -> sparse matrix

# 500 topics couldn't generate a model, so trying again with fewer;
# the max that worked here is 300
topic_nums = [10, 50, 80, 100, 150]
corpus_ldas = []
for t_num in topic_nums:
    lda = LdaModel(corpus_tfidf, id2word=dic, num_topics=t_num)  # train LDA
    lda.save('weibo_lda' + str(t_num) + '.model')
    corpus_ldas.append(lda[corpus_tfidf])
print("LDA is done!")

for corpus_lda in corpus_ldas:
    num = 0
    for doc in corpus_lda:
        wstr = ""
        for i in range(len(doc)):
            item = doc[i]
            wstr += str(item[0]) + "," + str(item[1])[0:7] + "/"
        fo.write(id_list[num] + "\t" + wstr[0:-1] + "\n")
        num += 1
fr.close()
fo.close()