def loadModelfromFile(modelPath, readOnly=False):
    if readOnly:
        lda_model = LdaModel.load(fname=modelPath, mmap='r')
        dictionary = Dictionary.load(fname=modelPath.replace('.topic', '.dict'), mmap='r')
    else:
        lda_model = LdaModel.load(fname=modelPath)
        dictionary = Dictionary.load(fname=modelPath.replace('.topic', '.dict'))
    print('loaded lda_model from {0} ok!'.format(modelPath))
    return lda_model, dictionary
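# Usage sketch (hypothetical paths): loadModelfromFile expects the dictionary to
# sit next to the model with the same basename and a '.dict' extension.
lda_model, dictionary = loadModelfromFile('models/news.topic', readOnly=True)
print(lda_model.show_topics(num_topics=5, num_words=10))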
def __init__(self, topics=10, worker=3, pretrained_model=None, dictionary=None):
    """Initialize LDA model training.

    Args:
        topics -- number of topics
        worker -- parallelism parameter, usually the number of cores minus one
        pretrained_model -- a pretrained model; since online updating is
            supported, the model from a previous training run can be loaded
        dictionary -- words must be converted to IDs during training, so the
            model is paired with a word-to-ID mapping dictionary

    Example:
        >>> lda = LDA(topics=20, worker=2, pretrained_model=model_file, dictionary=dictionary_file)
        >>> corpus = read_file(corpus_file)  # [['word1', 'word2'], ['word3', 'word4']]
        >>> lda.update(corpus)
        >>> lda.save(model_file, dictionary_file)
        >>> topics = lda.inference(['word5', 'word6'])
    """
    self._topics = topics
    self._workers = worker
    self._model = None
    self._common_dictionary = None
    if pretrained_model and dictionary:
        self._model = LdaModel.load(pretrained_model)
        self._common_dictionary = Dictionary.load(dictionary)
def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<->bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        # fit_words expects a word->weight mapping, so materialize the zip as a dict
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))
        wc.to_file(output_d + str(i) + '.png')
def trainModel():
    """Train a model"""
    if args.mode == 'Random':
        return args.topics, 0
    # need to train on dump
    files = [
        f"{args.input}/{f}" for f in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, f))
    ]
    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])
        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct
    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load("./models/LDAdump.dct")
def __init__(self, fnames, model=None, corpus=None, dictionary=None):
    """`fnames` is an array of files for [lda_model, distribution]"""
    self.reviews = open('data/electronics_topics_in.txt').readlines()

    print "Loading topic model..."
    if model is not None:
        print "Using argument model"
        self.lda = model
    else:
        self.lda = LdaModel.load(fnames[0])

    if corpus is not None:
        print "Using argument corpus and dictionary"
        self.corpus = corpus
        self.dictionary = dictionary
    else:
        print "Loading corpus and dictionary from file"
        self.corpus = load("data/models/electronics_tfidf_corpus.pkl")
        self.dictionary = load("data/models/electronics_dict.pkl")

    print "Loading review-topic distribution..."
    self.review_dist = [l for l in self.lda[self.corpus]]
    tmp = lambda dist: sorted(dist, key=lambda arr: arr[1], reverse=True)
    self.review_dist = map(lambda dist: tmp(dist), self.review_dist)

    print "processing topics"
    tmp = map(lambda t: re.sub(r"(\d*\.\d*\*)", "", t), self.lda.show_topics(-1))
    self.topics = map(lambda ts: re.sub(r"\s\+", ",", ts), tmp)
def load_model(self, model_type):
    model = None
    try:
        if model_type == 'tfidf':
            model = TfidfModel.load(self.tfIdfPath, mmap='r')
            self.tfIdfModel = model
        elif model_type == 'lsi':
            model = LsiModel.load(self.lsiPath, mmap='r')
            self.lsiModel = model
        elif model_type == 'lda':
            model = LdaModel.load(self.ldaPath, mmap='r')
            self.ldaModel = model
        elif model_type == 'w2v':
            model = Word2Vec.load(self.w2vPath, mmap='r')
            self.w2vModel = model
        else:
            logger.error('Model type error. Unexpected %s' % model_type)
            return None

        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        logger.info('%s model loaded completely.' % model_type)
    except IOError:
        logger.error("The %s model doesn't exist. Please train the model before loading it." % model_type)
    finally:
        return model
def display_perplexity_on_topic_num(start, step, limit):
    model_list = []
    pplxty_list = []
    names = locals()
    for num_topics in range(start, limit, step):
        print("############### current num:", num_topics, "###############")
        model_path = os.getcwd() + "\\Model\\topic_num_" + str(num_topics) + ".model"
        if not os.path.exists(model_path):
            # Modeling!!!!!
            print("Modeling in progress...")
            names['model' + str(num_topics)] = LdaModel(pubs_corpus,
                                                        num_topics=num_topics,
                                                        id2word=pubs_dictionary,
                                                        passes=10,
                                                        eval_every=1)
            names['model' + str(num_topics)].save(model_path)
        else:
            print("Model already exists.")
            names['model' + str(num_topics)] = LdaModel.load(model_path)
        model_list.append(names['model' + str(num_topics)])
        pplxty_value = perplexity(names['model' + str(num_topics)], pubs_corpus,
                                  pubs_dictionary, len(pubs_dictionary.keys()),
                                  num_topics)
        pplxty_list.append(pplxty_value)
    return model_list, pplxty_list
def load(self, path='default'):
    """
    :param path: the path of the trained model.
    :return:
    """
    if path == 'default':
        path = 'model'
    file_list = os.listdir(path)
    for file in file_list:
        if file.endswith('.model'):
            self.model_name = file.split('.')[0]
    if self.model_name == 'lda':
        self.model = LdaModel.load(str(path + '/lda.model'))
    if self.model_name == 'lsi':
        self.model = LsiModel.load(str(path + '/lsi.model'))
    if self.model_name == 'hdp':
        self.model = HdpModel.load(str(path + '/hdp.model'))
    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        self.num_topics = self.model.get_topics().shape[0]
    else:
        self.num_topics = self.model.num_topics
    # self.iterations = self.model.iterations

    with open(str(path + '/original_data.pickle'), 'rb') as f:
        self.original_data = pickle.load(f)
    with open(str(path + '/text.pickle'), 'rb') as f:
        self.text = pickle.load(f)
    with open(str(path + '/token.pickle'), 'rb') as f:
        self.token = pickle.load(f)
    with open(str(path + '/corpus.pickle'), 'rb') as f:
        self.corpus = pickle.load(f)

    path = path + '/result'
    with open(str(path + '/topic_key.pickle'), 'rb') as f:
        self.topic_key = pickle.load(f)
    with open(str(path + '/doc_topic.pickle'), 'rb') as f:
        self.doc_topic = pickle.load(f)
    with open(str(path + '/topic_doc.pickle'), 'rb') as f:
        self.topic_doc = pickle.load(f)
    with open(str(path + '/topic_sent.pickle'), 'rb') as f:
        self.topic_sent = pickle.load(f)

    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        self.num_topics = self.topic_doc.shape[0]
    else:
        self.num_topics = self.model.num_topics
def load(self, subfolder=None):
    if subfolder:
        sf = subfolder + '/'
    else:
        sf = ''
    self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
    self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)
def get_matrices():
    start = time.time()
    with open("matrix.json") as df:
        doc_term_matrix = json.load(df)
    ldamodel = Lda.load(MODEL_PATH)
    loaded = time.time()
    print "Doc-Term Matrix loaded in", loaded - start, "seconds"

    doc_topic_mtx = ldamodel[doc_term_matrix]
    topic_word_mtx = ldamodel.print_topics()

    array = []
    for i in range(len(doc_topic_mtx)):
        mp = {}
        for topic_id, topic_score in doc_topic_mtx[i]:
            mp[topic_id] = topic_score
        array.append(mp)

    topicwordarray = []
    for _, words in topic_word_mtx:
        topicwordarray.append(words)

    with open("doc_topic_mtx.json", "w") as df:
        json.dump(array, df)
    with open("topic_word_mtx.json", "w") as df:
        json.dump(topicwordarray, df)

    for i in topic_word_mtx:
        print i
    print "Doc-Topic and Topic-Word Matrices loaded in", time.time() - loaded, "seconds"
    return array, ldamodel
def prepare_for_analysis():
    import configparser
    config_parser = configparser.ConfigParser()
    config_parser.read("config.ini")
    config = config_parser['default']

    from corpus_compiler.tbmmcorpus import TbmmCorpus
    corpus = TbmmCorpus(metadata=True, config=config)

    corpus.load_tbmm_corpus("corpus-v0.1/tbmm_corpus.mm")
    corpus.prepare_metadata_to_description_dictionary()
    corpus.generate_word_counts()

    from gensim.models.ldamodel import LdaModel
    lda = LdaModel.load("tbmm_lda.model.passes_100")

    import matplotlib
    matplotlib.use('Agg')  # Must be before importing matplotlib.pyplot or pylab!

    topic_dist_matrix, label_vector = corpus.calculate_topic_distributions_of_all_documents(lda)
    for topic_no in range(1, 20):
        corpus.plot_topic_across_time(topic_no, topic_dist_matrix, label_vector)

    corpus.plot_word_freqs_given_a_regexp(r"^lokavt", keyword="lokavt")
    corpus.plot_word_freqs_given_a_regexp(r"^mebus", keyword="mebus")
def LDAmodel(X, passes=2, num_topics=10, workers=2, re_train=False):
    tokens = []
    for c in X:
        tokens.append(c.split())
    dictionary = gensim.corpora.Dictionary(tokens)
    bow_corpus = [dictionary.doc2bow(caption) for caption in tokens]

    if re_train:
        ldamodel = gensim.models.LdaMulticore(bow_corpus,
                                              num_topics=num_topics,
                                              id2word=dictionary,
                                              passes=passes,
                                              workers=workers)
        ldamodel.save("data/3_topic_modeling_weights/ldamodel_weights")
    else:
        ldamodel = LdaModel.load("data/3_topic_modeling_weights/ldamodel_weights")

    sent_topics_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[bow_corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the dominant topic and its percentage contribution for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                sent_topics_df = sent_topics_df.append(
                    pd.Series([str(int(topic_num)), round(prop_topic, 4)]),
                    ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['dominant_topic', 'perc_contribution']
    return sent_topics_df
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    if not os.path.exists(self.res_dir):
        print 'Creating results folder...'
        os.mkdir(self.res_dir)

    # Apply each model
    for idioma, modelos in self.input()['lda']['langs'].iteritems():
        corp_path = self.input()['corp']['langs'][idioma].path
        corpus = corpora.MmCorpus(corp_path)
        for n_topics, modelo in modelos.iteritems():
            model_path = modelo.path
            model = LdaModel.load(model_path)
            classification = []
            for doc in corpus:
                topic = model.get_document_topics(doc)
                classification.append(topic)
            print '--------------------------------------'
            print 'USER INFO: Classifying texts in %s with cleaning level "%s" and %d topics' % (idioma, kind, n_topics)
            model.print_topics(len(corpus), 5)
            with self.output()['langs'][idioma][n_topics]['doc_topics'].open('w') as f:
                pickle.dump(classification, f)
            with self.output()['langs'][idioma][n_topics]['topics'].open('w') as f:
                # the 5 is an editable parameter (number of topic words to show)
                pickle.dump(model.print_topics(n_topics, 5), f)
def calculate_topics(application_id: str) -> None:
    """Uses the latest topic model to assign a topic for each completely
    fetched account in the database."""
    with engine.begin() as connection:
        topic_model = models.topic_model.select_latest(application_id, SOURCES['TWITTER'], connection)
        if not topic_model:
            return
        accounts = list(models.account.select_multiple_complete(application_id, SOURCES['TWITTER'], connection))
        topic_model_path = get_topic_model_path(application_id)
        lda_model = LdaModel.load(os.path.join(topic_model_path, 'ldamodel'))
        dictionary = Dictionary.load(os.path.join(topic_model_path, 'dictionary'))
        documents = load_documents(accounts, connection)
        topic_iteration_id = models.topic_iteration.insert_one(topic_model['id'], connection)
        for account, document in zip(accounts, documents):
            bow = dictionary.doc2bow(document)
            weights = get_document_topic_weights(lda_model, bow)
            models.topic.insert_one(account['id'], weights, topic_iteration_id, connection)
        cluster_accounts(topic_iteration_id, connection)
def load_model(self, model_path='lda.model'):
    """
    Loads a pretrained LDA model.
    :param model_path: path to the saved model file
    :return: the loaded LDA model
    """
    return LdaModel.load(model_path)
def newsList_topicInfer(news_list, model_basepath='./model', reserved_word_path=''):
    dictionary_path = model_basepath + '/dictionary.pickle'
    with open(dictionary_path, 'rb') as fr:
        dictionary = pickle.load(fr)
    model_path = model_basepath + '/lda_100.model'
    lda = LdaModel.load(model_path, mmap='r')
    for news_json in news_list:
        news_title = news_json['title']
        news_content = news_json['content']
        if not news_content:
            news_content = news_title * 3
        all_content = news_title + news_content
        word_list = utils.wordcut4lda(all_content, reserved_word_path=reserved_word_path).split()
        topic_vec = [
            str(key[1])
            for key in lda.get_document_topics(dictionary.doc2bow(word_list), minimum_probability=0)
        ]
        news_json['topic'] = topic_vec
def main():
    logger.info(f'Loading data from {args.dataset_dir}')
    corpus = load_corpus(args.dataset_dir)

    model_path = os.path.join(args.dump_dir, 'lda.model')
    logger.info(f'Loading model from {model_path}')
    model = LdaModel.load(model_path)
    corpus_bow = (model.id2word.doc2bow(text['candidates']) for text in corpus)

    predictions_path = os.path.join(args.dump_dir, 'lda.prediction.jsonl')
    topic_ids = set()
    with open(predictions_path, 'w') as f:
        for tweet, tweet_bow in tqdm(zip(corpus, corpus_bow)):
            topics = model.get_document_topics(tweet_bow)
            topics = [(topic_id, topic_prob.item()) for topic_id, topic_prob in topics]
            tweet['topics'] = topics
            f.write(json.dumps(tweet) + '\n')
    logger.info(f'Predictions have been written to {predictions_path}')

    topics_path = os.path.join(args.dump_dir, 'lda.topics.txt')
    topics = model.show_topics(num_topics=model.num_topics, num_words=10, log=False, formatted=True)
    with open(topics_path, 'w') as f:
        for topic_no, topic in topics:
            f.write(f'Topic {topic_no}: {topic}\n')
    logging.info(f'Topics have been written to {topics_path}')
def convert_to_pyLDAvis(data_folder, basename, **opts):
    opts = extend(
        dict(R=50, mds='tsne', sort_topics=False,
             plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}),
        opts or {})
    target_folder = os.path.join(data_folder, basename)
    corpus_filename = os.path.join(target_folder, 'corpus.mm')
    model_filename = os.path.join(target_folder, 'gensim_model_{}.gensim.gz'.format(basename))
    lda = LdaModel.load(model_filename)
    corpus = MmCorpus(corpus_filename)
    data = pyLDAvis.gensim.prepare(lda, corpus, lda.id2word, **opts)
    pyLDAvis.save_html(data, os.path.join(target_folder, 'pyldavis.html'))
    return data
def getAllTopicTerms(self):
    """Return the full topic/word distribution."""
    # load dictionary and model
    self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
    self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))
    idto = self.dictionary.id2token
    allTiDistr = self.ldaModel.get_topics()
    return allTiDistr
def add_lda(x, corpus):
    train_lda = []
    lda = LdaModel.load('lda.model')
    for i in range(len(x)):
        top_topics = lda.get_document_topics(corpus[i], minimum_probability=0.0)
        # use j here to avoid shadowing the outer loop variable
        topic_vec = [top_topics[j][1] for j in range(2)]
        train_lda.append(topic_vec)
    return train_lda
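# Usage sketch (hypothetical names): the topic weights returned by add_lda can
# be appended to an existing feature matrix. Assumes 'lda.model' was trained
# with 2 topics and that bow_corpus is aligned row-for-row with X_train.
import numpy as np
lda_features = add_lda(X_train, bow_corpus)
X_train_aug = np.hstack([np.asarray(X_train), np.asarray(lda_features)])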
def model_play(fname):
    """Extract insights from a trained model."""
    start = time.time()
    ldamodel = Lda.load(fname)
    print "[INFO] Model loaded in", time.time() - start, "seconds"
    for i in range(ldamodel.num_topics):
        print "[INFO]", ldamodel.print_topic(i)
def plot_lda(application_id):
    """Saves an HTML file that visualizes the topic model."""
    topic_model_path = get_topic_model_path(application_id)
    lda_model = LdaModel.load(os.path.join(topic_model_path, 'ldamodel'))
    serialized_corpus = MmCorpus(os.path.join(topic_model_path, 'corpus.mm'))
    dictionary = Dictionary.load(os.path.join(topic_model_path, 'dictionary'))
    vis_data = gensim.prepare(lda_model, serialized_corpus, dictionary)
    pyLDAvis.save_html(vis_data, os.path.join(topic_model_path, 'lda_visualization.html'))
def getTopicTerms(self, topicId, topN):
    """Return the word distribution for a topic."""
    if self.ldaModel is None:
        self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
        self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))
    idto = self.dictionary.id2token
    tiDistr = self.ldaModel.get_topic_terms(topicId, topN)
    toDistr = [(idto[ti[0]], ti[1]) for ti in tiDistr]
    return toDistr
def class_model(m):
    lda = LdaModel.load("model/lda_model/lda_model_user_ml")
    dic = Dictionary.load('model/lda_model/lda_model_user_ml.id2word')
    topics = [tokenizer.find_topic(message, lda, dic) for message in m]
    return pd.Series(topics)
def __init__(self, connect_file, database, model_file):
    self._alphabet = 'abcdefghijklmnopqrstuvwxyz'
    self._dbConnect(connect_file, database)
    with open(model_file, 'rb') as mdlf:
        contents = json.load(mdlf)
    model = contents['model-path']
    dictionary = contents['dictionary-path']
    self.model = LdaModel.load(model)
    self.dictionary = Dictionary.load(dictionary)
def __init__(self, jobdesc_fname, jobtitle_fname):
    self.es = Elasticsearch([{'host': app.config['ES_HOST'], 'port': 9200, 'timeout': 120}])
    self.model = LdaModel.load(app.config['RCMDR_LDA_MODEL'])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config['RCMDR_JOB_LABELS']).read().strip().split('\n'))
    }
    self.jobdesc_fname = jobdesc_fname
    self.jobtitle_fname = jobtitle_fname
def load_model(self, dir_name="text_mining_models", file_name="gensim_model"):
    path = os.path.join(os.getcwd(), "..", dir_name, file_name)
    if os.path.isfile(path):
        return LdaModel.load(path)
    else:
        model = self.create_model()
        self.save_model()
        return model
def __init__(self, ac):
    with open('../TextMining/Topic/data.loc', 'rb') as f:
        load(f)  # read and discard the first pickled object
        self.data = load(f)
    with open('../TextMining/Topic/translator.loc', 'rb') as f:
        self.translator = load(f)
    self.index = similarities.MatrixSimilarity.load('../TextMining/Topic/index.loc')
    self.lda = LdaModel.load('../TextMining/Topic/lda.loc')
    self.dictionary = Dictionary().load("../TextMining/Topic/dic.loc")
    self.ac_terms = ac
def analyze(self, docs):
    # load dictionary and model
    self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
    self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))
    # convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above
    docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]
    docTopicDistr = self.getDocumentTopics(docTermMatrix)
    return docTopicDistr
def __init__(self, model_dir=os.path.join('models', 'gensim', 'lda'),
             stopwords_dir=os.path.join('resources', 'stopwords')):
    Service.__init__(self, 'topic-modeling', 'lda-gensim', ['parse'])
    self.models = {}
    self.stopwords = {}
    for name in os.listdir(model_dir):
        self.models[name] = LdaModel.load(os.path.join(model_dir, name, 'model'))
    for name in os.listdir(stopwords_dir):
        lang = name[:2]
        with open(os.path.join(stopwords_dir, name)) as f:
            self.stopwords[lang] = set([line.strip() for line in f.readlines()])
def inference(doc, model_loc):
    lda = LdaModel.load(model_loc + '/ldamodel.model')
    unseen_doc = px.prepareDoc(doc)
    dictionary = Dictionary.load(model_loc + '/dictionary.dic')
    unseen_corpus = dictionary.doc2bow(unseen_doc)
    topics = lda[unseen_corpus]
    return topics, lda
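# Usage sketch (hypothetical model directory): assumes px.prepareDoc tokenizes
# the raw text the same way the training corpus was tokenized.
topics, lda = inference('some unseen document text', 'models/run1')
for topic_id, prob in sorted(topics, key=lambda t: t[1], reverse=True):
    print(topic_id, round(float(prob), 3))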
def get_lda_model(regenmod, lmod):
    list_models = []
    if os.path.isfile(lmod) and not regenmod:
        model = LdaModel.load(lmod)
    else:
        model = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)
    list_models.append(model)
    return list_models
def save_top_words(model_file, output_file):
    lda = LdaModel.load(model_file)
    topics = lda.show_topics(-1, topn=20, formatted=False)
    topics = [[word for (_, word) in topic] for topic in topics]
    with open(output_file, 'w') as fp:
        for i, topic in enumerate(topics):
            # print topic
            line = 'Topic %d: %s\n' % (i + 1, ', '.join(topic))
            fp.write(line)
    return topics
def AuthorTopicStd():
    import nltk
    from gensim import corpora
    from gensim import matutils
    from gensim.models.ldamodel import LdaModel
    from nltk.corpus import stopwords
    from unidecode import unidecode

    TOPIC_FILE = './lda_topic.dump'
    LDA_FILE = './result.lda'
    DICTIONARY_FILE = './keywords.dict'

    with open(TOPIC_FILE, 'rb') as f:
        num_topics, topic_result = serializer.load(f)

    lda = LdaModel.load(LDA_FILE)
    dictionary = corpora.Dictionary.load(DICTIONARY_FILE)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w]{2,}')
    stopwords_set = set(stopwords.words())

    my_topic_cache_by_aid = [None, None]

    def calculator(aid, pid):
        if my_topic_cache_by_aid[0] == aid:
            my_topic = my_topic_cache_by_aid[1]
        else:
            my_keywords = []
            for ipid, iaid in paper_authors.get_by_aid(aid):
                paper = papers.get(ipid)
                if paper is None:
                    continue
                keywords = tokenizer.tokenize(unidecode(paper[Papers.IDX_TITLE]).lower())
                if not keywords:
                    continue
                my_keywords.extend(keywords)
            my_keywords = list(filter(lambda s: s not in stopwords_set, my_keywords))
            if not my_keywords:
                return np.nan
            my_topic = lda[dictionary.doc2bow(my_keywords)]
            my_topic_cache_by_aid[0] = aid
            my_topic_cache_by_aid[1] = my_topic
        my_topic_array = matutils.sparse2full(my_topic, num_topics)
        return np.std(my_topic_array)

    return calculator
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # assumption:
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
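# Usage sketch (hypothetical: USESAVED is assumed to be an enum-like constant
# holder and file_lda_model a module-level path; passing a value below
# USESAVED.lda_model forces retraining with the log-scaled topic heuristic).
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
lda_model = getLdaModel(bow_corpus, dictionary, useSavedTill=0)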
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def fetch_model(dictionary):
    print "Fetching LDA Model... ",
    try:
        lda = LdaModel.load('Topic/lda.tm')
        print "LDA Model loaded!"
    except IOError:
        print "Model not found, building LDA..."
        corpus = MyCorpus()
        # lda = LdaModel(corpus, num_topics=50, update_every=1, chunksize=1000, passes=15)
        lda = LdaModel(corpus, num_topics=50, id2word=dictionary, update_every=1, chunksize=1000, passes=50)
        print "LDA Built!"
        lda.save('Topic/lda.tm')
    return lda
def update(self, docs):
    # load dictionary and model
    self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
    self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))
    # convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above
    docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]
    numPasses = self.config.getIntConfig("train.num.pass")[0]
    self.ldaModel.update(docTermMatrix, passes=numPasses)
    docTopicDistr = self.getDocumentTopics(docTermMatrix)
    return docTopicDistr
def SNAP_ldaTopicsForTopic(self, topic, numTopics=10):
    if numTopics not in [5, 10, 20, 30]:
        print("[ERROR] Invalid numTopics")
        return
    inPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'snap_data',
        "gensim_snap_lda_%s_%d" % (topic, numTopics))
    lda = LdaModel.load(inPath)
    return lda.print_topics(numTopics)

##################
#
##################
def main():
    logformat = '%(asctime)s %(name)-12s: %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)
    results = []
    for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf):
        res = es.search(index='wiki4',
                        body={"query": {"match": {"_all": topics}}},
                        size=num_results_from_es)
        results.append({'topics': topics, 'result': res, 'topicid': topicid})
    results = add_keywords(results, kera)
    df = pd.DataFrame(results)
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
def get_lda_model(num_topics):
    file_name = None
    if num_topics == 10:
        file_name = LDA_FILE_10
    elif num_topics == 30:
        file_name = LDA_FILE_30
    elif num_topics == 60:
        file_name = LDA_FILE_60
    elif num_topics == 120:
        file_name = LDA_FILE_120
    else:
        raise ValueError("bad number of topics")
    return LdaModel.load(file_name)
def main():
    file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250'
    mod = LdaModel.load(file)
    dict = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab'
    vocab = Dictionary.load(dict)
    corpfile = 'f:/projects/comperio-text-analytics/models/topicmodel/mojo_lda_100.corp'
    corpus = gensim.corpora.MmCorpus(corpfile)

    print mod.show_topic(0)
    print mod.id2word
    mod.id2word = vocab
    print mod.show_topic(0)

    pydavis = pyLDAvis.gensim.prepare(mod, corpus, vocab)
    pyLDAvis.save_html(pydavis, 'pydavis_250_v2_3passes.html')
    pyLDAvis.show(pydavis)
def setup(files):
    # setup the output directory
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_dir = '../browser/json/' + base_model_name + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # load the topic model
    model = LdaModel.load(files.model)
    # load replacements used
    bug_to_id = json.loads(open(files.replacements).read())
    # invert to id<->bug map, ditching s. genus terms
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}
    # load the docsXwords and docsXtopics matrices (in sparse format)
    corpus = mmcorpus.MmCorpus(files.corpus)
    docsXwords_sparse = corpus2csc(corpus, num_terms=len(model.id2word.token2id)).T
    docsXtopics = mmcorpus.MmCorpus(files.docsXtopics)
    docsXtopics_sparse = corpus2csc(docsXtopics).T
    return docsXtopics_sparse, docsXwords_sparse, id_to_bug, model, output_dir
def read_model(self):
    self.dictionary = corpora.Dictionary.load(DICT)
    self.bow_corpus = corpora.MmCorpus(BOW_CORPUS)
    self.lda_model = LdaModel.load(MODEL)
    self.logit_classifier = joblib.load(CLASSIFIER)

    corpus = []
    corpus += load_expo_cdc()
    corpus += load_lago()
    corpus += load_news()
    corpus += load_news_ic()
    corpus += load_palestras()
    corpus = preprocessing(corpus)

    test_bow = [self.dictionary.doc2bow(text) for text in corpus]
    lda_corpus = [self.lda_model[bow] for bow in test_bow]
    lda_dense = gensim.matutils.corpus2dense(lda_corpus, num_terms=TOPICS).transpose()
    probs = self.logit_classifier.predict_proba(lda_dense)
def main(argv):
    if len(argv) < 4:
        print 'python train_lda.py group_id num_topics passes'
        sys.exit(1)
    group_id = argv[1]
    num_topics = int(argv[2])
    passes = int(argv[3])

    log.info('Prepare corpus for group: %s' % group_id)
    base_path = 'tables/' + group_id + '/'
    model_base_path = 'ldamodels/' + group_id + '/'

    # build dict and corpus
    #now = datetime.now()
    indicator = 'title-comment'
    source_path = base_path + 'corpus-topic-comment'
    corpus_path = model_base_path + 'corpus-' + indicator + '-' + group_id + '.mm'
    dict_path = model_base_path + 'dict-' + indicator + '-' + group_id + '.dict'
    log.info('Building the dict...')
    build_dict_corpus(source_path, corpus_path, dict_path)

    log.info('Loading dict from pre-saved file...')
    dictionary = corpora.Dictionary.load(dict_path)
    log.info('Done')
    #dictionary.save_as_text(base_path + 'text-dict.txt')

    log.info('Build a lda model...')
    log.info('Loading corpus from pre-saved .mm file...')
    mmcorpus = corpora.MmCorpus(corpus_path)
    log.info('Done')

    log.info('Training lda model...')
    model = LdaModel(mmcorpus, num_topics=num_topics, id2word=dictionary, passes=passes)
    model_path = model_base_path + indicator + '-' + group_id + '.ldamodel'
    model.save(model_path)
    log.info('Done.')

    model = LdaModel.load(model_path)
    model.show_topics(topics=num_topics, topn=10, log=True)
def __init__(self, fnames):
    """`fnames` is an array of files for [lda_model, distribution]"""
    print "Accumulating tweets..."
    files = glob.glob("data/relevant/*")
    self.tweets = map(lambda f: open(f).read(), files)

    print "Loading topic model..."
    self.lda = LdaModel.load(fnames[0])
    self.corpus, self.features, self.dictionary = get_params(files)

    print "Loading tweet distribution..."
    self.tweet_dist = [l for l in self.lda[self.corpus]]
    tmp = lambda dist: sorted(dist, key=lambda arr: arr[1], reverse=True)
    self.tweet_dist = map(lambda dist: tmp(dist), self.tweet_dist)
    # self.tweet_dist = json.load(open(fnames[1]))
    tmp = map(lambda t: re.sub(r"(\d*\.\d*\*)", "", t), self.lda.show_topics(-1))
    self.topics = map(lambda ts: re.sub(r"\s\+", ",", ts), tmp)
    self.topics.reverse()
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]
        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()
        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')
        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)
        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')
        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')
        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)
    return lda, dictionary
def update(self, name, n=500, method='FastICA'):
    settings = self._setstorage.load(encode_name(name))
    clusterer = Clusterer(settings)
    # load the models
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
    # get the input
    segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
    documents = [s.value for s in segments]
    # prepare args
    kwargs = {'dictionary': dictionary,
              'ngramtransformer': transformer,
              'ldamodel': ldamodel,
              'method': method}
    Xt = clusterer.fit_transform(documents, **kwargs)
    labels = clusterer.assign_labels(documents)
    data = self._make_data(Xt, labels, documents)
    return json.dumps({'result': 'OK', 'data': data})
def load(self):
    self._lda = LdaModel.load(self._model_file)
    self._dictionary = Dictionary.load(self._dict_file)
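# A matching save() counterpart, as a sketch (assumes self._lda and
# self._dictionary have been set and the same file paths are reused):
def save(self):
    self._lda.save(self._model_file)
    self._dictionary.save(self._dict_file)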
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Cluster segments')
    parser.add_argument('clustermodel', type=unicode, help='The clusterer model to use.')
    args = parser.parse_args()

    setstorage = MongoSettingsStorage()
    docstorage = MongoDocumentStorage()
    segstorage = MongoSegmentStorage()

    logger.info('Loading clusterer model')
    settings = setstorage.load(encode_name(args.clustermodel))
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
    logger.info('Clusterer model loaded!')

    kwargs = {'dictionary': dictionary,
              'ngramtransformer': transformer,
              'ldamodel': ldamodel,
              'method': 'LDA'}

    logger.info('Fitting clusterer')
    clusterer = Clusterer(settings)
    texts, labels = clusterer.get_training_data()
    clusterer.fit(texts, labels, **kwargs)
    logger.info('Fitting completed!')
    # TODO: implement get_params and set_params for the clusterer tool to allow
    # cross-validation for better score estimation
# Make / Load LDA result
def make_lda_result():
    lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=2, iterations=1000)
    # save LDA result
    lda.save(LDA_FILE)

if os.path.isfile(LDA_FILE):
    if input('Do you want to reload LDA result? (yes|otherwise)') == 'yes':
        make_lda_result()
else:
    make_lda_result()
lda = LdaModel.load(LDA_FILE)

# 4. Make and save the topic belief for each publication
with open(PUBLICATION_KEYWORDS_FILE, 'rb') as f:
    publication_keywords = serializer.load(f)

topic_result = dict()
for i, (pub_id, keywords) in enumerate(publication_keywords.items()):
    pub_topic = dict(lda[corpus[i]])
    if len(pub_topic) == 0:
        continue
    topic_belief = np.array([pub_topic.get(j, 0.0) for j in range(NUM_TOPICS)])
def __init__(self):
    self.model = LdaModel.load(settings.lda_model_name)
    self.dictionary = Dictionary.load_from_text(settings.wordids_txt)
def AuthorPaperTopicSim():
    import nltk
    from gensim import corpora
    from gensim import matutils
    from gensim.models.ldamodel import LdaModel
    from nltk.corpus import stopwords
    from unidecode import unidecode

    TOPIC_FILE = './lda_topic.dump'
    LDA_FILE = './result.lda'
    DICTIONARY_FILE = './keywords.dict'

    with open(TOPIC_FILE, 'rb') as f:
        num_topics, topic_result = serializer.load(f)

    lda = LdaModel.load(LDA_FILE)
    dictionary = corpora.Dictionary.load(DICTIONARY_FILE)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w]{2,}')
    stopwords_set = set(stopwords.words())

    my_topic_cache_by_aid = [None, None]

    def calculator(aid, pid):
        paper = papers.get(pid)
        if paper is None or paper[Papers.IDX_PUB_ID] is None:
            return np.nan
        publication = publications.get(paper[Papers.IDX_PUB_ID])
        pub_ori_id = publication[Publications.IDX_ORIGINAL_ID]
        if pub_ori_id not in topic_result:
            return np.nan
        publication_topic = topic_result[pub_ori_id]

        if my_topic_cache_by_aid[0] == aid:
            my_topic = my_topic_cache_by_aid[1]
        else:
            my_keywords = []
            for ipid, iaid in paper_authors.get_by_aid(aid):
                paper = papers.get(ipid)
                if paper is None:
                    continue
                keywords = tokenizer.tokenize(unidecode(paper[Papers.IDX_TITLE]).lower())
                if not keywords:
                    continue
                my_keywords.extend(keywords)
            my_keywords = list(filter(lambda s: s not in stopwords_set, my_keywords))
            if not my_keywords:
                return np.nan
            my_topic = lda[dictionary.doc2bow(my_keywords)]
            my_topic_cache_by_aid[0] = aid
            my_topic_cache_by_aid[1] = my_topic

        # Use Hellinger distance
        my_topic_array = matutils.sparse2full(my_topic, num_topics)
        sim = np.sqrt(0.5 * ((np.sqrt(my_topic_array) - np.sqrt(publication_topic)) ** 2).sum())
        return sim

    return calculator
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import time
import numpy as np
from scipy import sparse
from gensim import corpora
from gensim.models.ldamodel import LdaModel

corpusType = "all_"
subDirectory = 'run_sraa'

t1 = time.time()
corpus = corpora.MmCorpus(subDirectory + '/' + corpusType + 'corpus.mm')
# dictionary = corpora.dictionary.Dictionary.load(subDirectory + '' + corpusType + '/dictionary.dict')
classes = np.loadtxt(subDirectory + '/' + corpusType + 'classes.dat', dtype=int)
model = LdaModel.load(subDirectory + '/' + corpusType + 'sraa.lda_model')

numFeatures = model.num_topics
numData = len(corpus)
numNodes = numData + numFeatures + 2

sparseData = []
for data in corpus:
    sparseData.append(model[data])

A = sparse.lil_matrix((numNodes, numNodes))
# features: 0-numFeatures
# data: numFeatures-(numFeatures+numData)
# label: (numFeatures+numData), (numFeatures+numData+1)
# connect data to features
groups_users = json.load(open('data/app_gensim/groups_users_filt.txt'))
users_groups = json.load(open('data/app_gensim/users_groups_filt.txt'))
users_topics = json.load(open('data/app_gensim/users_topics_filt.txt'))
groups_topics = json.load(open('data/app_gensim/group_topics.txt'))
# member_data = pd.read_pickle('data/memfiltcleanfinal.pkl')
member_data = pd.read_pickle('data/app_final_filt_members_data.pkl')
# member_data['id'] = member_data['id'].apply(lambda x: str(x))
# member_data.set_index('id', inplace=True)
group_data = pd.read_pickle('data/app_final_groups_data.pkl')
loaded_model = graphlab.load_model('data/groups_model')
dictionary = corpora.Dictionary.load('data/app_gensim/dictionary.dict')
corpus = corpora.MmCorpus('data/app_gensim/corpus_tfidf.mm')
# loaded_model.get_similar_users(users=['68157442'], k=10)
lda = LdaModel.load('data/app_gensim/model.lda')
# group index --> {group_ind: [groupid, grouptext]}
lda_dict = json.load(open('data/app_gensim/lda_dict.txt'))
users_wanted = list(member_data.index.values)
member_data['score'] = member_data['n_connected'] + member_data['n_topics']
groups_topics = json.load(open('data/app_gensim/group_topics.txt'))
users_groups = json.load(open('data/app_gensim/users_groups_filt.txt'))
dfref = member_data.to_dict('dict')
users_sims = np.load("users_sims.npy")

def get_sim_score(user_id):
import sys
from operator import itemgetter

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from news.document import Tokenizer

if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)

data_dir = sys.argv[1]
dictionary_file = data_dir + '/id_token_df'
model_file = data_dir + '/lda_model'

print 'creating tokenizer...'
dictionary = Dictionary.load_from_text(dictionary_file)
tok = Tokenizer(dictionary)

print 'loading model...'
lda = LdaModel.load(model_file)

while True:
    text = raw_input('enter text (q to quit): ')
    if text == 'q':
        print 'bye!'
        break
    doc = tok.text2bow(text)
    topics = lda[doc]
    for topic, weight in sorted(topics, key=itemgetter(1), reverse=True):
        print weight, lda.show_topic(topic, topn=4)