def create_similarity_matrix(name):
    # Load the trained LDA model, the serialized corpus and the dictionary that
    # were previously saved under the common prefix `name`.
    lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
    corpus = gensim.corpora.MmCorpus(name + '.mm')
    lda_corpus = lda[corpus]
    dictionary = gensim.corpora.Dictionary.load(name + '.dict')
    num_tokens = len(dictionary)
    # Build a dense cosine-similarity index over the LDA-transformed corpus and
    # persist it next to the other artifacts.
    index = MatrixSimilarity(lda_corpus, num_features=num_tokens)
    index.save(name + '.sim')
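
# Usage sketch (an assumption, not part of the original snippet): the prefix
# 'mymodel' is hypothetical, and the corresponding .lda/.mm/.dict files are
# assumed to have been created beforehand.
import gensim
from gensim.similarities.docsim import MatrixSimilarity

create_similarity_matrix('mymodel')
index = MatrixSimilarity.load('mymodel.sim')  # reload the persisted index later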

def __init__(self, links, stopwords=True, num_topics=40, num_clusters=40, **kwargs):
    from gensim.models import TfidfModel
    from gensim.similarities.docsim import MatrixSimilarity

    # Fall back to sensible defaults for dictionary filtering and LDA training.
    kwargs.setdefault('n_below', 5)
    kwargs.setdefault('n_above', 0.7)
    kwargs.setdefault('iterations', 200)

    self.meta = _compute_meta_dataframe(links)
    self.lexicon, self.bow = _compute_lex_bow(self.meta,
                                              stopwords=stopwords,
                                              no_below=kwargs['n_below'],
                                              no_above=kwargs['n_above'])
    self.tfidf = TfidfModel(self.bow)
    self.matsim = MatrixSimilarity(self.bow, num_features=len(self.lexicon))
    self.lda = _compute_lda(self.bow, self.lexicon,
                            num_topics=num_topics,
                            iterations=kwargs['iterations'])
    self.clust = _compute_spectral_clust(self.similarity_matrix(),
                                         num_clusters=num_clusters)

def compute_documents_similarity(target, name):
    # `target` must be a vector in the same (LDA) space the saved index was built on.
    dictionary = gensim.corpora.Dictionary.load(name + '.dict')
    index = MatrixSimilarity.load(name + '.sim')
    print(index)
    sims = index[target]
    # Rank document ids by descending cosine similarity and keep the 200 best matches.
    sorted_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    top_documents = sorted_sims[:200]
    return [doc_id for doc_id, _ in top_documents]
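
# Usage sketch (an assumption): project a new document into the LDA space the
# saved index expects before querying; 'mymodel' and `raw_text` are hypothetical.
lda = gensim.models.ldamodel.LdaModel.load('mymodel.lda')
dictionary = gensim.corpora.Dictionary.load('mymodel.dict')
bow = dictionary.doc2bow(raw_text.lower().split())
top_ids = compute_documents_similarity(lda[bow], 'mymodel')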

def predict_movies(input_list, corpus_path='data/corpus.txt', dic_path='data/dic.dict'):
    # Load the pickled bag-of-words corpus and the dictionary, then append the
    # query document so it gets projected into the same LSI space as the rest.
    with open(corpus_path, "rb") as f:
        corpus = pickle.load(f)
    dic = Dictionary.load(dic_path)
    dic.add_documents([input_list])
    corpus.append(dic.doc2bow(input_list))

    lsi = LsiModel(corpus, num_topics=200, id2word=dic)
    vectorized_corpus = lsi[corpus]
    doc_index = MatrixSimilarity(vectorized_corpus)
    sims = doc_index[vectorized_corpus]
    # The query is the last row of the pairwise similarity matrix; drop its
    # trailing self-similarity entry.
    return sims[-1][:-1]
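
# Usage sketch (an assumption): `predict_movies` returns an array of cosine
# similarities between the query tokens and every movie in the corpus; the
# token list here is purely illustrative.
scores = predict_movies(['space', 'adventure', 'alien'])
top_matches = scores.argsort()[::-1][:10]  # indices of the ten most similar movies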

def get_proposals_auto_grouped(topics_count=100, threshold=.5):
    ids, words = _get_raw_docs()
    dictionary = corpora.Dictionary(words)
    corpus = [dictionary.doc2bow(x) for x in words]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics_count)
    lsi_corpus = lsi[corpus_tfidf]
    ms = MatrixSimilarity(lsi_corpus)

    # For every proposal, collect the other proposals whose LSI similarity
    # exceeds the threshold.
    neighbors = {}
    for frm, row in zip(ids, lsi_corpus):
        neighbors[frm] = [
            ids[n] for n, match in enumerate(ms[row])
            if match > threshold and ids[n] != frm
        ]

    # Merge overlapping neighbourhoods into shared groups.
    results = []
    groups = {}
    for root, children in neighbors.items():
        target = groups.get(root)
        if not target:
            target = set()
            results.append(target)
        target.add(root)
        target.update(children)
        for c in children:
            groups[c] = target

    # Map each proposal id to the index of the group it ended up in.
    rv = {}
    for n, row in enumerate(results):
        for x in row:
            rv[x] = n
    return rv

def get_proposals_auto_grouped(topics_count=20, cutoff=0.75):
    doc_words, ids, titles = _get_raw_docs()
    dictionary = corpora.Dictionary(doc_words)
    corpus = [dictionary.doc2bow(x) for x in doc_words]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics_count)
    lsi_corpus = lsi[corpus_tfidf]
    ms = MatrixSimilarity(lsi_corpus)

    # Greedily assign each document to the first neighbourhood that claims it.
    neighborhoods = []
    seen = set()
    for n in range(len(lsi_corpus)):
        if n in seen:
            continue
        near = neighbors(n, ms, lsi_corpus, cutoff)
        neighborhoods.append({
            'talks': [{'id': ids[x], 'title': titles[x], 'row': x} for x in near]
        })
        seen.update(near)

    # Attach a rough topic label to each group.
    for group in neighborhoods:
        rows = [x['row'] for x in group['talks']]
        # Horrible way to get closest topic, but just looking for a hint.
        closest_topic = sorted(lsi[lsi_corpus[rows[0]]], key=lambda x: x[-1])[0][0]
        topic = sorted(lsi.show_topic(closest_topic), key=lambda x: -x[-1])
        group['topic'] = ', '.join('{} ({:.2f})'.format(x, score) for x, score in topic)
    return neighborhoods
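
# The `neighbors` helper used above is not shown; a minimal sketch of what it
# might look like (an assumption, not the original implementation): return the
# row indices whose LSI similarity to document `n` is at least `cutoff`.
def neighbors(n, ms, lsi_corpus, cutoff):
    sims = ms[lsi_corpus[n]]
    return [i for i, score in enumerate(sims) if score >= cutoff]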

texts = [[
    word for word in document.lower().split()
    if word not in WORD_BLACKLIST
] for document in lines]
# print("{}".format(texts))

# EXERCISE 2
# Build the lexicon (gensim.corpora.Dictionary), discarding the words with frequency 1.
lessico = corpora.Dictionary(texts)
# Drop every token that appears in fewer than two documents.
lessico.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
print(lessico)

# EXERCISE 3
# Represent the documents as vectors (gensim.corpora.Dictionary.doc2bow).
documents = [document.lower().split() for document in lines]
vector_documents = [lessico.doc2bow(document) for document in documents]
print(vector_documents)

# EXERCISE 4
# Compute the similarity between (the vector of) any one document and all the
# others (gensim.similarities.MatrixSimilarity).
S = MatrixSimilarity(vector_documents, num_features=len(lessico))
print(S[lessico.doc2bow(documents[3])])

# EXERCISE 5
# Write a function that, given (the vector of) any document, returns the n=5
# documents of the collection most similar to it, sorted by similarity score
# in descending order.
def mostSimilar(S, lessico, document, n=5):
    similars = S[lessico.doc2bow(document)]
    # Rank (doc_id, score) pairs by descending similarity and skip the first
    # hit, which is the query document itself.
    ranked = sorted(enumerate(similars), key=lambda item: -item[1])
    return ranked[1:n + 1]

print(mostSimilar(S, lessico, documents[1]))
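
# Follow-up sketch (an assumption, not part of the original exercise): map the
# (index, score) pairs returned by mostSimilar back to the raw documents.
for doc_id, score in mostSimilar(S, lessico, documents[1]):
    print('{:.3f}  {}'.format(score, lines[doc_id]))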

        if word not in ENGLISH_STOP_WORDS
    ] for document in pieces]
    # remove words that appear only once
    frequency = Counter([token for text in texts for token in text])
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    logger.info('dictionary size: {}'.format(len(dictionary)))
    corpus_ = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus_, id2word=dictionary, num_topics=lsi_topic_count)
    lsi.show_topics(num_topics=lsi_topic_count, num_words=100, log=True)
    matrix_similarity = MatrixSimilarity(
        lsi[corpus_], num_features=similarity_feature_count)
elif mode in {modes[2], modes[3]}:
    texts = [[
        word for word in tokenize_by_word(document.lower())
        if word not in ENGLISH_STOP_WORDS
    ] for document in pieces]
    # remove words that appear only once
    frequency = Counter([token for text in texts for token in text])
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(texts)
    ]
    doc2vec_model = Doc2Vec(
        documents,
        epochs=doc2vec_epochs,

def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('loading models and dictionary')
    dictionary = Dictionary.load(path.join(result_path, p['model_label'], 'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model'), 'rb'))
    lsi.num_topics = p['num_topics']

    logger.info('load wikipedia articles')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle'), 'rb'))

    times = np.zeros(len(wiki))
    count = 0
    for query_key, query in wiki.items():
        logger.info("working on: %s" % query_key)

        n = len(query)
        human = [val['rating'] for val in query.values()]

        t0 = time.time()
        corpus = [lsi[pre[dictionary.doc2bow(val['text'])]] for val in query.values()]
        sim_res = MatrixSimilarity(corpus)[corpus]
        np.save(path.join(output_dir, 'sim_' + query_key), sim_res)
        avg = np.mean(sim_res, axis=0)
        idx = np.argsort(avg)
        times[count] = time.time() - t0
        count += 1

        # compute correlation with human rating
        res = np.zeros((n, 1))
        for i in range(n):
            human_r = [human[j] for j in idx[i:]]
            res[i, 0] = np.mean(human_r)

        # plot correlation
        fig = plt.figure()
        ax = fig.add_subplot(3, 1, 1)
        ax.plot(res)
        ax = fig.add_subplot(3, 1, 2)
        ratings = [val['rating'] for val in query.values()]
        ax.scatter(avg[idx], [ratings[i] for i in idx])

        # plot similarity distribution
        ax = fig.add_subplot(3, 1, 3)
        ax.bar(range(n), avg[idx])

        # set the x tick labels to the article keys and their ratings, then rotate
        ax.set_xticks(range(n))
        k = [key + ' ' + str(query[key]['rating']) for key in query.keys()]
        ax.set_xticklabels([k[i] for i in idx])
        fig.autofmt_xdate()
        plt.savefig(path.join(output_dir, query_key + '.' + p['format']))
        plt.close()

    logger.info('average similarity calculation time: %f' % np.mean(times))

centroid_index = []
group_centroids = []
for cluster_no, group in top_topic_words_u_df.groupby('cluster_number'):
    # Average the 200 LSI topic weights of the words in this cluster to get its centroid.
    gsum = group.loc[:, 'topic_0':'topic_199'].to_numpy().sum(axis=0)
    gsize = len(group)
    c = gsum / gsize
    centroid_index.append(cluster_no)
    group_centroids.append(c)

group_centroids = np.array(group_centroids)
centroid_df = pd.DataFrame(group_centroids, index=centroid_index)
centroid_df.to_csv('persistence/lsi_topic-agglom_centroids.csv')
cluster_centroid_matrix = centroid_df.to_numpy()

logger.info('building similarity matrix')
word_mat_sim = MatrixSimilarity(cluster_centroid_matrix, num_features=200)
# Dense index of the documents in 200-dimensional LSI space, saved earlier.
tfidf_corpus_lsi = np.load('persistence/tfidf_corpus_lsi-200_matrix_similarity.index.npy')
word_mat_sim.num_best = 1
word_mat_sim.save('persistence/lsi_word-agglom_word-similarity-matrix')

with open('persistence/tfidf-lsi_sim_word-topic-hier.csv', 'w') as fout:
    with open('stats/tfidf-lsi_sim_problems.txt', 'w') as errout:
        csvw = csv.writer(fout)
        # With num_best=1 each query returns a list like [(best_cluster_id, score)].
        for doc_id, sim in enumerate(word_mat_sim[tfidf_corpus_lsi]):
            try:
                csvw.writerow((doc_id, sim[0][0], sim[0][1]))
            except Exception as e:
                errout.write(str(fnames[doc_id]) + '\n')
                logger.error(e)
                continue
logger.info("deserializing tfidf_corpus_lsi") tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, "tfidf_corpus_lsi-200")) logger.info("loading lsi model") lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, "lsi_model-200")) logger.info("globbing filenames") fnames = iglob(os.path.join(settings.PROC_DIR, "*.json")) from gensim.similarities.docsim import MatrixSimilarity, SparseMatrixSimilarity, Similarity logger.info("building matrix similarity") sim_matrix = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms) logger.info("persisting matrix similarity index") sim_matrix.save(os.path.join(settings.PERSIST_DIR, "tfidf_corpus_lsi-200_matrix_similarity")) logger.info("survey of neighbor groupings") with open(os.path.join(settings.STATS_DIR, "num_neighbors.csv", "w")) as fout: csv_writer = csv.writer(fout) for i, doc in fnames: try: result = sim_matrix[matutils.unitvec(tfidf_corpus_lsi.docbyoffset(tfidf_corpus_lsi.sim_matrix[i]))] n_similar = np.argwhere(result > 0.5).flatten().size csv_writer.writerow((doc, n_similar)) except Exception as e: logger.error(e)

    series = pd.Series(similarities_to_group)
    series.index.names = ['docx', 'docy']
    return series


def get_tfidf_similarities(doc_index_series):
    # Look up the TF-IDF vector for this document and query the similarity index with it.
    tfidfs_from = reviews_tfidf_docs[doc_index_series.name]
    similarities_to = similarity_indices[tfidfs_from]
    return _filter_similarities(doc_index_series, similarities_to)


# In[47]:

# similarity indices for each doc
similarity_indices = MatrixSimilarity(reviews_tfidf_docs)


# In[48]:

# get index:name mappings
doc_mapping = (
    review_df
    .groupby('business_id')['name']
    .apply(lambda x: x.unique()[0])
    .to_frame()
    .assign(doc_index=range(701))
)

logger.info('deserializing tfidf_corpus_lsi')
tfidf_corpus_lsi = corpora.MmCorpus(
    os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200'))

logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(
    os.path.join(settings.PERSIST_DIR, 'lsi_model-200'))

logger.info('globbing filenames')
fnames = iglob(os.path.join(settings.PROC_DIR, '*.json'))

from gensim.similarities.docsim import MatrixSimilarity, SparseMatrixSimilarity, Similarity

logger.info('building matrix similarity')
sim_matrix = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

logger.info('persisting matrix similarity index')
sim_matrix.save(
    os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200_matrix_similarity'))

logger.info('survey of neighbor groupings')
with open(os.path.join(settings.STATS_DIR, 'num_neighbors.csv'), 'w') as fout:
    csv_writer = csv.writer(fout)
    for i, doc in enumerate(fnames):
        try:
            result = sim_matrix[matutils.unitvec(
                tfidf_corpus_lsi.docbyoffset(tfidf_corpus_lsi.index[i]))]
            n_similar = np.argwhere(result > 0.5).flatten().size
            csv_writer.writerow((doc, n_similar))
        except Exception as e:
            logger.error(e)

centroid_index = []
group_centroids = []
for cluster_no, group in top_topic_words_u_df.groupby("cluster_number"):
    # Average the 200 LSI topic weights of the words in this cluster to get its centroid.
    gsum = group.loc[:, "topic_0":"topic_199"].to_numpy().sum(axis=0)
    gsize = len(group)
    c = gsum / gsize
    centroid_index.append(cluster_no)
    group_centroids.append(c)

group_centroids = np.array(group_centroids)
centroid_df = pd.DataFrame(group_centroids, index=centroid_index)
centroid_df.to_csv("persistence/lsi_topic-agglom_centroids.csv")
cluster_centroid_matrix = centroid_df.to_numpy()

logger.info("building similarity matrix")
word_mat_sim = MatrixSimilarity(cluster_centroid_matrix, num_features=200)
# Dense index of the documents in 200-dimensional LSI space, saved earlier.
tfidf_corpus_lsi = np.load("persistence/tfidf_corpus_lsi-200_matrix_similarity.index.npy")
word_mat_sim.num_best = 1
word_mat_sim.save("persistence/lsi_word-agglom_word-similarity-matrix")

with open("persistence/tfidf-lsi_sim_word-topic-hier.csv", "w") as fout:
    with open("stats/tfidf-lsi_sim_problems.txt", "w") as errout:
        csvw = csv.writer(fout)
        # With num_best=1 each query returns a list like [(best_cluster_id, score)].
        for doc_id, sim in enumerate(word_mat_sim[tfidf_corpus_lsi]):
            try:
                csvw.writerow((doc_id, sim[0][0], sim[0][1]))
            except Exception as e:
                errout.write(str(fnames[doc_id]) + "\n")
                logger.error(e)
                continue

logger.info('deserializing tfidf_corpus_lsi')
tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200'))

logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, 'lsi_model-200'))

logger.info('globbing filenames')
fnames = iglob(os.path.join(settings.PROC_DIR, '*.json'))

from gensim.similarities.docsim import MatrixSimilarity, SparseMatrixSimilarity, Similarity

logger.info('building matrix similarity')
sim_matrix = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

logger.info('persisting matrix similarity index')
sim_matrix.save(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi_matrix_similarity'))

logger.info('survey of neighbor groupings')
with open(os.path.join(settings.STATS_DIR, 'num_neighbors.csv'), 'w') as fout:
    csv_writer = csv.writer(fout)
    for i, doc in enumerate(fnames):
        try:
            result = sim_matrix[matutils.unitvec(tfidf_corpus_lsi.docbyoffset(tfidf_corpus_lsi.index[i]))]
            n_similar = np.argwhere(result > 0.5).flatten().size
            csv_writer.writerow((doc, n_similar))
        except Exception as e:
            logger.error(e)