def load_model(self, model_type):
    model = None
    try:
        if model_type == 'tfidf':
            model = TfidfModel.load(self.tfIdfPath, mmap='r')
            self.tfIdfModel = model
        elif model_type == 'lsi':
            model = LsiModel.load(self.lsiPath, mmap='r')
            self.lsiModel = model
        elif model_type == 'lda':
            model = LdaModel.load(self.ldaPath, mmap='r')
            self.ldaModel = model
        elif model_type == 'w2v':
            model = Word2Vec.load(self.w2vPath, mmap='r')
            self.w2vModel = model
        else:
            logger.error('Model type error. Unexpected %s' % model_type)
            return None

        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        logger.info('%s model loaded completely.' % model_type)
    except IOError:
        logger.error("The %s model doesn't exist. Please train the model before loading it." % model_type)
    finally:
        return model
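A minimal usage sketch for the loader above; the class name ModelLoader and its constructor arguments are assumptions, since only the method itself appears in this excerpt.

# Hypothetical usage; "ModelLoader" and the path keyword arguments are assumed.
engine = ModelLoader(tfIdfPath='models/tfidf.model', lsiPath='models/lsi.model',
                     ldaPath='models/lda.model', w2vPath='models/w2v.model',
                     dictPath='models/docs.dict')
lsi = engine.load_model('lsi')   # returns the model, or None if it is missing
if lsi is None:
    logger.warning('Train the LSI model before querying it.')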
def load(self, path='default'):
    """
    :param path: the path of the trained model.
    :return:
    """
    if path == 'default':
        path = 'model'
    file_list = os.listdir(path)
    for file in file_list:
        if file.endswith('.model'):
            self.model_name = file.split('.')[0]
            if self.model_name == 'lda':
                self.model = LdaModel.load(path + '/lda.model')
            if self.model_name == 'lsi':
                self.model = LsiModel.load(path + '/lsi.model')
            if self.model_name == 'hdp':
                self.model = HdpModel.load(path + '/hdp.model')

    self.id2word = self.model.id2word
    # HdpModel exposes no num_topics attribute, so infer it from the topic matrix.
    if self.model_name == 'hdp':
        self.num_topics = self.model.get_topics().shape[0]
    else:
        self.num_topics = self.model.num_topics
    # self.iterations = self.model.iterations

    for attr in ('original_data', 'text', 'token', 'corpus'):
        with open('%s/%s.pickle' % (path, attr), 'rb') as f:
            setattr(self, attr, pickle.load(f))

    result_path = path + '/result'
    for attr in ('topic_key', 'doc_topic', 'topic_doc', 'topic_sent'):
        with open('%s/%s.pickle' % (result_path, attr), 'rb') as f:
            setattr(self, attr, pickle.load(f))

    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        # For HDP the effective topic count comes from the topic-document matrix.
        self.num_topics = self.topic_doc.shape[0]
    else:
        self.num_topics = self.model.num_topics
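A hedged sketch of calling load(); the class name TopicModel is an assumption, and the directory layout (model file, pickles, and a result/ subfolder) is the one implied by the loader above.

# Hypothetical usage of load(); the class name is assumed.
tm = TopicModel()
tm.load()                  # reads from ./model by default
tm.load('runs/hdp_2020')   # or from another directory written by a previous save
print(tm.model_name, tm.num_topics)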
def representation(self):
    if not self.model:
        print("LOAD MODEL...")
        self.model = LsiModel.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.model'))
        self.dictionary = Dictionary.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.dic'))
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def __getitem__(self, modelo):
    '''
    Returns the requested model.
    Parameters:
        modelo (str) --> Identifier of the model: "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
    Returns:
        the requested model, if it exists
    '''
    if not os.path.isfile(self._arqs['modelos'][modelo]):
        print(f'The "{modelo}" model has not been implemented or built.')
        return None
    if modelo in ['tfidf', 'tfidf_pivot']:
        model = TfidfModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'lsi':
        model = LsiModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'lda':
        model = LdaModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'doc2vec':
        model = Doc2Vec.load(self._arqs['modelos'][modelo])
    return model
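A short usage sketch for this __getitem__ accessor; the instance name modelos is illustrative and assumes the surrounding class was built and its models trained elsewhere.

# Illustrative use of the dictionary-style model accessor above.
lsi = modelos['lsi']
if lsi is not None:
    for topic_id, topic in lsi.show_topics(num_topics=5, num_words=8, formatted=True):
        print(topic_id, topic)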
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100, id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")

    evaluate_classifier(clf, rcv1_test, rcv1_test_target, get_features=get_lsi_features)
def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_filepath(n_topics)

    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No LSA file exists but from_scratch is False')

        trigram_dictionary = self.lda_builder.get_corpus_dict()
        trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

        print('Building LSA model...')
        lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

        lsi.save(filepath)
        print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
    else:
        print('Loading LSA model (n_topics={})...'.format(n_topics))
        lsi = LsiModel.load(filepath)

    return lsi
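A sketch of how get_lsa_model() might be called; builder stands in for an instance of the surrounding class, which is not shown in this excerpt.

# Hypothetical calls; "builder" is an assumed instance of the owning class.
lsi = builder.get_lsa_model(n_topics=100)                    # load cached model, or build it
lsi = builder.get_lsa_model(n_topics=100, recalculate=True)  # force a rebuild even if cached
# With from_scratch=False the call raises ValueError instead of training,
# which is useful where training must never happen (e.g. a serving environment).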
def main():
    start_time = time.time()
    rootdir = os.getcwd()
    foldername = 'lsi_output'
    folderpath = os.path.join(rootdir, foldername)

    if not os.path.exists(folderpath) or args.force:
        topics, lsi = createLsiModelforCorpus(args.corpus, args.dict, args.ntopics)
    else:
        os.chdir(folderpath)
        lsimodelfile = str(args.corpus).replace('.mm', '') + '_lsi.model'
        topicsfile = str(args.corpus).replace('.mm', '') + '_lsi_topics.pkl'
        modelpath = os.path.join(folderpath, lsimodelfile)
        topicspath = os.path.join(folderpath, topicsfile)
        lsi = LsiModel.load(modelpath)
        topics = pickle.load(open(topicspath, 'r'))
        f = open('lsi_corpus_topics.txt', 'w')
        f.seek(0)
        f.write(str(topics))
        f.close()
        os.chdir(rootdir)

    pp.pprint(lsi.show_topics(num_topics=args.ntopics, num_words=10, log=False, formatted=True))

    corpus = corpora.MmCorpus(args.corpus)
    if args.query != -1:
        queryresult = lsi[corpus[args.query]]
        sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
        print "\nSimilarity of document number {0} in corpus with corpus topics:".format(args.query)
        pp.pprint(sortedqueryresult)

    # Generate topic probability-document matrix, along with vector containing
    # most probable topic (assumed to be the label) for each document
    #os.chdir(folderpath)
    outlabel_name = 'lsi_document_labels_{0}.txt'.format((args.corpus).replace('.mm', ''))
    outtopic_name = 'lsi_topic_vectors_{0}.txt'.format((args.corpus).replace('.mm', ''))
    outlabelpath = os.path.join(folderpath, outlabel_name)
    outtopicpath = os.path.join(folderpath, outtopic_name)

    if not os.path.exists(outlabelpath) or not os.path.exists(outtopicpath):
        outtopic = open(outtopic_name, 'w')
        outlabel = open(outlabel_name, 'w')
        for idx, doc in enumerate(corpus):
            tops = lsi[doc]
            doc_tops = []
            for j in range(args.ntopics):
                search = [v[1] for v in tops if v[0] == j]
                if len(search) > 0:
                    doc_tops.append(search[0])
                else:
                    doc_tops.append(0.)
            most_important = doc_tops.index(max(doc_tops))
            outlabel.write('{0}\n'.format(most_important))
            outtopic.write('\t'.join([str(d) for d in doc_tops]) + '\n')
        outlabel.close()
        outtopic.close()
        shutil.move(outlabel_name, folderpath)
        shutil.move(outtopic_name, folderpath)
    #os.chdir(rootdir)

    end_time = time.time()
    runtime = end_time - start_time
    print "\nRuntime: {0} seconds\n".format(runtime)
from datetime import datetime
from date_extractor import month_to_number
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n")
                          if word and not word.startswith("#")])
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print e


def run(text):
    try:
        words = (text.lower().replace("#", " ").replace("_", " ").replace("(", " ")
                 .replace(")", " ").replace("/", " ").replace(":", " ").replace(".", " ").split())
        words = [word for word in words if len(word) > 3 and word not in stopwords]
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
def get_matrix_similarity(tweet, data):
    '''
    in:
        tweet: list of lemmatized strings from tweet body. Output of preprocessing
        data: dict containing filenames of associated files
            dataset: filename of dataset csv file
            dict: filename of dictionary file
            model: filename of lsa model
            output_name: name to be used for all output files generated
            directory: directory name where outputs will be saved. If null, output_name is used
    '''
    df = pd.read_csv('./data/dataset/%s' % data['dataset'])
    lemmas_list = []
    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary.load('./data/dicts/%s' % data['dict'])
    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]

    lsi = LsiModel.load('./data/models/%s' % data['model'])
    index = similarities.MatrixSimilarity(lsi[clean_doc])

    make_dir('./data/similarities/')
    if 'directory' in data:
        make_dir('./data/similarities/%s' % data['directory'])
        directory = data['directory']
    else:
        make_dir('./data/similarities/%s' % data['output_name'])
        directory = data['output_name']
    data['directory'] = directory

    data['filename'] = []
    counter = 0
    for tw in tweet:
        corpus = lsi[dictionary.doc2bow(tw)]
        with open('./data/similarities/%s/similarities_%i.txt' % (directory, counter), 'w+') as file:
            for doc in sorted(enumerate(index[corpus]), key=lambda item: -item[1]):
                file.write(str(doc) + '\n')
        data['filename'].append('similarities_%i.txt' % counter)
        counter += 1

    return data
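An example of the data dictionary the docstring above describes, followed by a call to get_matrix_similarity(); every filename below is a placeholder for artifacts produced by earlier preprocessing steps.

# All filenames are placeholders; they must exist under ./data/... as the function expects.
data = {
    'dataset': 'tweets.csv',       # CSV with a 'lemmas' column
    'dict': 'tweets.dict',         # saved gensim Dictionary
    'model': 'tweets_lsi.model',   # saved LsiModel
    'output_name': 'tweets_run',   # names the output folder if 'directory' is absent
}
tweets = [['economy', 'inflation', 'rate'], ['storm', 'flood', 'warning']]
result = get_matrix_similarity(tweets, data)
print(result['directory'], result['filename'])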
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path, p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logging.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics
    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):
        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()

    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path, p['result_path'], p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []
    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]

        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary
        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []

    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search('data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)

    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)

    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0, 1]
    print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
from datetime import datetime
from date_extractor import month_to_number
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n")
                          if word and not word.startswith("#")])
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print("Exception trying to load LSI index. You can most likely ignore this:", e)


def run(text):
    try:
        words = (text.lower().replace("#", " ").replace("_", " ").replace("(", " ")
                 .replace(")", " ").replace("/", " ").replace(":", " ").replace(".", " ").split())
        words = [word for word in words if len(word) > 3 and word not in stopwords]
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'r') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.iterkeys():
    logging.info('current term: %s' % key)
    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False, returnMissingWords=False)
                 for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    for i, par1 in enumerate(text_list):