def loadmodel(self, nameprefix):
    """ Load the topic model with the given prefix of the file paths.

    Given the prefix of the file paths, load the corresponding topic model. The files
    include a JSON (.json) file that specifies various parameters, a gensim dictionary
    (.gensimdict), a topic model (.gensimmodel), and a similarity index (.gensimmat).
    If weighing is applied, load also the tf-idf model (.gensimtfidf).

    :param nameprefix: prefix of the file paths
    :return: None
    :type nameprefix: str
    """
    # load the JSON file (parameters)
    parameters = json.load(open(nameprefix + '.json', 'rb'))
    self.nb_topics = parameters['nb_topics']
    self.toweigh = parameters['toweigh']
    self.algorithm = parameters['algorithm']
    self.classlabels = parameters['classlabels']
    # load the dictionary
    self.dictionary = Dictionary.load(nameprefix + '.gensimdict')
    # load the topic model
    self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel')
    # load the similarity matrix
    self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')
    # load the tf-idf model
    if self.toweigh:
        self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')
    # flag
    self.trained = True
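# Hypothetical usage sketch (not from the source; `TopicModeler` and the prefix are
# placeholders): everything saved under one file prefix is restored by `loadmodel`.
modeler = TopicModeler()
modeler.loadmodel('/models/news_topics')  # reads .json, .gensimdict, .gensimmodel, .gensimmat (.gensimtfidf if weighted)
print(modeler.nb_topics, modeler.classlabels)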
def __init__(self, model_prefix=None, num_best=None):
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    logger.info("ESA: Loading word dictionary...")
    self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

    logger.info("ESA: Loading document name map...")
    self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("ESA: Loading TF-IDF model...")
    self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("ESA: Loading similarity index...")
    sim_fname = "%s.cluster.%d.centroids" % (model_prefix, 2000)
    self.similarity_index = MatrixSimilarity.load(sim_fname, mmap='r')

    # logger.info("ESA: Preloading reverse indexes...")
    # self.similarity_index.preload_reverse_index()

    logger.info("ESA: Finished loading model files.")
def get_recommendation(self, movie_title: str):
    """
    Accepts a movie name and fetches the list of recommended movie names
    using matrix (cosine) similarity.

    :param movie_title: title of the movie to base recommendations on
    :return: array of movie names
    """
    print("movie : ", movie_title)
    dictionary = gensim.corpora.Dictionary.load(self.corpus_dictionary)
    tfidf_model = gensim.models.TfidfModel.load(self.tfidf_model)
    similarity = MatrixSimilarity.load(self.matrix_similarity)

    data = pd.read_csv(self.processed_data)
    del data['Unnamed: 0']
    data["original_title"] = data["original_title"].str.lower()

    movie = data.loc[data.original_title == movie_title]
    print(movie)
    if movie.shape[0] == 0:
        status = ["Failed to Recommend Movies with existing movie data."]
        return status
    else:
        # use positional indexing: the matched row's label is generally not 0
        movie_doc_bow = dictionary.doc2bow(movie['doc'].map(break_to_tokens).iloc[0])
        movie_tfidf = tfidf_model[movie_doc_bow]
        movie_recommendations = pd.DataFrame({
            'Cosine_sim_values': similarity[movie_tfidf],
            'title': data.original_title.values
        }).sort_values(by="Cosine_sim_values", ascending=False)
        top_recommendations = movie_recommendations['title'].head(11)
        return top_recommendations.to_numpy()
def load(lsi_path=None, id2word_path=None, index_path=None):
    """
    If specified, attempts to load a gensim LsiModel from `lsi_path`,
    a gensim Dictionary from `id2word_path`, or a gensim MatrixSimilarity
    index from `index_path`.

    Parameters
    ----------
    lsi_path: str
        File-path from which the LsiModel should be loaded.
    id2word_path: str
        File-path from which the Dictionary should be loaded.
    index_path: str
        File-path from which the MatrixSimilarity index should be loaded.
    """
    if lsi_path is not None:
        from gensim.models import LsiModel
        if not os.path.exists(lsi_path):
            raise IOError(
                'The provided file path to the LsiModel was not found. '
                'Please ensure that the argument is the correct path.')
        return LsiModel.load(lsi_path)
    if id2word_path is not None:
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(id2word_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        return Dictionary.load(id2word_path)
    if index_path is not None:
        from gensim.similarities import MatrixSimilarity
        if not os.path.exists(index_path):
            raise IOError(
                'The provided file path to the MatrixSimilarity index was not found. '
                'Please ensure that the argument is the correct path.')
        return MatrixSimilarity.load(index_path)
def load(cls, fname):
    """
    Load a previously saved object from file (also see `save`).
    """
    logger.info("loading %s object from %s and %s" % (cls.__name__, fname, fname + ".index"))
    result = utils.unpickle(fname)
    result.similarity_index = MatrixSimilarity.load(fname + ".index")
    return result
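# A minimal sketch of the `save` counterpart implied by the docstring above (hypothetical,
# not taken from the source): the MatrixSimilarity index is written to a separate ".index"
# file, which is exactly what `load` reconstructs.
def save(self, fname):
    self.similarity_index.save(fname + ".index")
    index, self.similarity_index = self.similarity_index, None  # keep the main pickle small
    try:
        utils.pickle(self, fname)
    finally:
        self.similarity_index = index  # restore the in-memory index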
def getMatrixSimilarity(tfidfModel, lsiModel=None) -> SparseMatrixSimilarity:
    similarityPath = os.path.join('.cache', 'sim_mat.gensim_sim')
    try:
        sim = MatrixSimilarity.load(similarityPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        if lsiModel is None:
            lsiModel = getLsiModel(tfidfModel)
        sim = SparseMatrixSimilarity(lsiModel[corpus],
                                     num_best=21,
                                     num_features=tfidfModel.vectors.shape[0])
        sim.save(similarityPath)
    return sim
def main(self):
    print("Recommendation using TF_IDF")

    # Loading preprocessed data
    vagas_ti = pd.read_csv(self.dataPrepFile)
    vagas_ids = pickle.load(open(self.out + "preprocessing/vagas_ids.array", "rb"))
    vagas_words = pickle.load(open(self.out + "preprocessing/vagas_words.list", "rb"))
    cvs_words = pickle.load(open(self.out + "preprocessing/cvs_words.series", "rb"))

    cvs = pd.read_csv(self.dataCvsFile)
    cvs = cvs.fillna("")
    cvs.isnull().any()
    # print("Loading cvs done!")

    # Creating a dictionary
    dictionary = gcorp.Dictionary(vagas_words)
    # store the dictionary, for future reference
    dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict')

    # compile corpus (vectors of the number of times each element appears)
    raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
    gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm', raw_corpus)  # store to disk

    print("Dictionary size: " + str(len(dictionary)))

    # STEP 2: similarity between corpora
    dictionary = gcorp.Dictionary.load(self.out + 'preprocessing/tf_idf/vagas.dict')
    corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')

    # Transform text with TF-IDF
    tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model

    # corpus tf-idf
    corpus_tfidf = tfidf[corpus]

    # STEP 3: create similarity matrix of all files
    index = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary), num_best=10)
    index.save(self.out + 'preprocessing/tf_idf/vagas.index')
    index = MatrixSimilarity.load(self.out + 'preprocessing/tf_idf/vagas.index')

    self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words, dictionary, tfidf, index)
    print("Recommendation using TF_IDF done!")
def load_model(documents, model_name, matrix_name, dic_name, queue=None):
    try:
        tfidfmodel = TfidfModel.load(model_name)
        index = MatrixSimilarity.load(matrix_name)
        dictionary = Dictionary.load(dic_name)
    except Exception:
        tfidfmodel, index, dictionary = create_model_tfidf_model(
            documents=documents,
            model_name=model_name,
            matrix_name=matrix_name,
            dic_name=dic_name)
    if queue is not None:
        queue.put([tfidfmodel, index, dictionary])
    return tfidfmodel, index, dictionary
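# The `queue` argument suggests this loader can run in a worker process; a hypothetical
# usage sketch (file names and `documents` are placeholders, not from the source):
from multiprocessing import Process, Queue

q = Queue()
p = Process(target=load_model,
            args=(documents, "model.tfidf", "matrix.index", "corpus.dict", q))
p.start()
tfidfmodel, index, dictionary = q.get()  # blocks until the worker puts the loaded objects
p.join()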
def __init__(self, series, dictionary, lsi, index, sim_opt, rank_opt):
    super().__init__()
    self.norm = LookupNormalization()
    self.dictionary: Dictionary = Dictionary.load(dictionary)
    self.lsi: LsiModel = LsiModel.load(lsi)
    self.index: MatrixSimilarity = MatrixSimilarity.load(index)

    sr = SerializationReader(series)
    self.documents, self.doc2idx, self.idx2doc = sr.read()

    sim_class = globals()[self.SIM_OPTS[sim_opt]["cls"]]
    self.sim_strategy: SimilarityStrategy = sim_class(self.SIM_OPTS[sim_opt]["constant"])

    rank_class = globals()[self.RANK_OPTS[rank_opt]]
    self.rank_strategy: RankingStrategy = rank_class()
def get_similarity_index(self, bow_corpus, lsa: LsiModel, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_index(lsa.num_topics)
    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No similarity index file exists but from_scratch is False')
        print('Building index...')
        index = MatrixSimilarity(lsa[bow_corpus])
        index.save(filepath)
    else:
        print('Loading index...')
        index = MatrixSimilarity.load(filepath)
    return index
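# A hypothetical query against the returned index (assumes `dictionary` is the one used to
# build `bow_corpus`; all names here are illustrative, not from the source):
index = pipeline.get_similarity_index(bow_corpus, lsa)
query_bow = dictionary.doc2bow("dense retrieval with lsa".split())
scores = index[lsa[query_bow]]                       # cosine similarity to every document
top10 = sorted(enumerate(scores), key=lambda t: -t[1])[:10]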
def __init__(self, filename):
    self.docs = loads(open(filename, "r").read())
    self.docmap = hoist_dict(self.docs, "id")

    if isfile("data.dict"):
        self.dictionary = Dictionary.load("data.dict")
    else:
        self.dictionary = Dictionary(iterate_summaries(self.docs))
        self.dictionary.save("data.dict")

    if isfile("data.mm"):
        self.corpus = MmCorpus("data.mm")
    else:
        corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
        MmCorpus.serialize("data.mm", corpus)
        self.corpus = MmCorpus("data.mm")

    self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

    if isfile("data.sim"):
        self.sim = MatrixSimilarity.load("data.sim")
    else:
        self.sim = MatrixSimilarity(self.lsi[self.corpus])
        self.sim.save("data.sim")

    # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100,
    #                     update_every=1, chunksize=10000, passes=1)

    self.sentiment_model = Doc2Vec.load("imdb.d2v")
    self.sentiment = LogisticRegression()
    self.sentiment.fit(
        [self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
        [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
        asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

    if isfile("arxiv.d2v"):
        self.doc_model = Doc2Vec.load("arxiv.d2v")
    else:
        tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
        doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
        doc_model.build_vocab(tagged)
        shuffle(tagged)
        # Replace with functional stuff
        for epoch in range(10):
            doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
        doc_model.save("arxiv.d2v")
import json
import codecs
import re

import utils
from gensim import corpora
from gensim.similarities import MatrixSimilarity

huffPostDataFilePath = '../../5w1h-result-data/huffPostDataIncludingKeywords.json'
gensimDictionaryInTimeBaseFilePath = '../../5w1h-result-data/gensim-in-time/gensimDictionary'
gensimSimilarityIndexInTimeBaseFilePath = '../../5w1h-result-data/gensim-in-time/similarityIndex'

huffPostData = json.load(codecs.open(huffPostDataFilePath, 'r', 'utf-8-sig'))

# corpus = corpora.MmCorpus('../../lda-ner-result-data/gensimCorpus.mm')
dictionary = corpora.Dictionary.load('../../5w1h-result-data/gensimDictionary.dict')

# load similarity_index
similarityIndex = MatrixSimilarity.load('../../5w1h-result-data/similarityIndex')


def get_top_k_documents_total_time(queryKeywords):
    bow = dictionary.doc2bow(queryKeywords)
    sims = similarityIndex[bow]
    print('sims', sims)

    # make [(documentIndex, point)]
    # convertedSims = []
    # for sim in sims:
    #     convertedSims.append((int(sim[0]), sim[1]))
    # return convertedSims

    # return huffPostDatum
    topKHuffPostData = []
import os
import time

import pandas as pd
from flask import Flask
from gensim import corpora
from gensim.models import LdaModel
from gensim.similarities import MatrixSimilarity
from sklearn.linear_model import LogisticRegression, Ridge

from src.text import preprocess
from src.transformers import PreprocessTokensTransformer

# constants
API_KEY = os.getenv('API_KEY', '_')
MIN_DF = 3
MAX_DF = 0.6

app = Flask(__name__)  # pylint: disable=C0103

# load data
dictionary = corpora.Dictionary.load('data/lda.dictionary')  # pylint: disable=C0103
lda = LdaModel.load('data/lda.model')  # pylint: disable=C0103
similarities = MatrixSimilarity.load('data/lda.similarity')  # pylint: disable=C0103
data = pd.read_csv(
    'data/data.csv',
    index_col='index',  # pylint: disable=C0103
    usecols=[
        'index', 'project_number', 'title', 'abstract', 'rcr',
        'preprocessed_text'
    ])
topic_counts = pd.read_csv('data/lda.topic_counts.csv', index_col='year')  # pylint: disable=C0103
topic_counts.columns = topic_counts.columns.astype(int)


def get_most_similar_documents(query, k=10):
    """Return indices for similar documents"""
    stime = time.time()
    sims = -similarities[query]
tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200'))

logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, 'lsi_model-200'))

fnames = [line.strip() for line in open(os.path.join(settings.PERSIST_DIR, 'document_index'))]
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames), dtype=object)

# logger.info('building matrix similarity')
# doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)
# logger.info('persisting matrix similarity index')
# doc_topic.save(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200_matrix_similarity'))
doc_topic = MatrixSimilarity.load(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200_matrix_similarity'))


def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id']])
    return mbk


def index_freq_above(na, minval):
    l = pd.Series(na)
import numpy as np
import pickle
import requests
import re
import gensim
from textwrap import dedent as d
from collections import defaultdict
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity, SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.corpora import Dictionary
from gensim.models import LsiModel
import dash  # assumed: the Dash imports were dropped from this excerpt
import dash_html_components as html  # assumed

td = corpora.Dictionary.load('scratch_work/NLPScratch/brand_new_strains.dict')
m = LsiModel.load('scratch_work/NLPScratch/_fit_LSI_Model.model')
s_index = MatrixSimilarity.load('scratch_work/NLPScratch/strain_sim.index')

descripts = open("models/descriptions.pkl", "rb")
strain_lookup = open("models/strain_lookup.pkl", "rb")
descripion_dict = pickle.load(descripts)
lookup = pickle.load(strain_lookup)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
    html.H1("Green-Rex: We Recommend It, You Smoke It!",
            style={
                'color': '#008000',
                "textAlign": "center"
            }),
# load models
print "\n Loading models, etc..\n"
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')

indexfile = './data/ta_index.txt'
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity
print "\n Load similarity indices.\n"
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):

    def __init__(self, indexfile, author_title, hit_percent):
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames), dtype=object)

matrix_sim_loc = os.path.join(settings.PERSIST_DIR,
                              'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))
if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)
    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)


def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id'], :TOPIC_LIMIT])
    return mbk


def index_freq_above(na, minval):
    l = pd.Series(na)
corpus_tfidf = tfidf[corpus]
corpora.MmCorpus.serialize('corpus.mm', corpus_tfidf)
tfidf.save("my_model.tfidf")
tfidf = models.TfidfModel.load("my_model.tfidf")

print('Building LsiModel...')
corpus_tfidf = corpora.MmCorpus('corpus.mm')
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)

print('Building MatrixSimilarity...')
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(lsi[corpus_tfidf])
index.save('deerwester.index')
index = MatrixSimilarity.load('deerwester.index')

print('Testing...')
result = np.zeros((20, 300)).astype('str')
j = 0
for doc in query_test['Query']:
    doc = jieba.cut(doc)
    tokens = []
    for word in doc:
        tokens.append(word)
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for i in range(300):
# load dictionary
from gensim import corpora
dictionary_name = "maqa.dict"
dictionary = corpora.Dictionary.load(dictionary_name)

# load model
from gensim import models
tfidf_name = "tfidf.model"
tfidf = models.TfidfModel.load(tfidf_name)

# load index
from gensim.similarities import MatrixSimilarity
index_name = "index_tfidf.index"
index = MatrixSimilarity.load(index_name)

# load original Q&A
import pandas
df_qa = pandas.read_csv('AllQA.csv')

from TFIDF import TFIDF_sims_argmax


@handler.add(MessageEvent, message=TextMessage)
def handle_message(event):
    # response = LSI_prediction(event.message.text, dictionary, tfidf, lsi_model, index, text_corpus)
    sims_argmax = TFIDF_sims_argmax(event.message.text, dictionary, tfidf, index)
    response = df_qa.iloc[sims_argmax, 1]
    conn = psycopg2.connect(DATABASE_URL, sslmode='require')
from gensim.similarities import MatrixSimilarity
from gensim.models import LsiModel
from gensim.corpora import Dictionary
import pandas as pd
import time

bugs = pd.read_csv('./data/query_result_4189_cleared.csv')
documents = bugs.tokenized
real_documents = bugs.subject

lsi = LsiModel.load('gensim_lsi_model.lsi')
dictionary = Dictionary.load('./gen_sim.dict')
index = MatrixSimilarity.load('./gensim_lsi_matrix_similarity.index')

start = time.time()
query = documents[0]
query_vec = dictionary.doc2bow(query.lower().split())
# convert the query to LSI space
vec_lsi = lsi[query_vec]
sims = index[vec_lsi]
sims_s = sorted(list(enumerate(sims)), key=lambda tup: tup[1], reverse=True)
end = time.time()
print('Counting one query took: {}'.format(end - start))

c = 0
for item in sims_s:
    i = item[0]
    v = item[1]
def createSearchObjs():
    """
    Creates the SimSearch and KeySearch objects using the data structures
    created in `make_wikicorpus.py`.
    Returns (simsearch, keysearch, titles_to_id)
    """
    # Load the article titles. These have the format (pageid, article title)
    fprint('Loading Wikipedia article titles...')
    t0 = time.time()
    id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')
    titles_to_id = utils.unpickle('./data/titles_to_id.pickle')

    # id_to_titles is actually a map of indices to (pageid, article title).
    # The 'pageid' property is unused.
    # Convert id_to_titles into a simple list of titles.
    titles = [item[1][1] for item in id_to_titles.items()]
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Load the dictionary (830ms on my machine)
    fprint('\nLoading dictionary...')
    t0 = time.time()
    dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Load tf-idf model (60ms on my machine).
    fprint('\nLoading tf-idf model...')
    t0 = time.time()
    tfidf_model = TfidfModel.load('./data/tfidf.tfidf_model')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # We must not use `load`--that would attempt to load the corpus into
    # memory, and it's 16.7 GB!!
    # corpus_tfidf = MmCorpus.load('./data/corpus_tfidf.mm')
    fprint('\nCreating tf-idf corpus object (leaves the vectors on disk)...')
    t0 = time.time()
    corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Create the KeySearch and SimSearch objects.
    ksearch = KeySearch(dictionary, tfidf_model, corpus_tfidf, titles)
    simsearch = SimSearch(ksearch)

    # TODO - SimSearch doesn't currently have a clean way to provide the index
    # and model.
    fprint('\nLoading LSI model...')
    t0 = time.time()
    simsearch.lsi = LsiModel.load('./data/lsi.lsi_model')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Load the Wikipedia LSI vectors into memory.
    # The matrix is 4.69GB for me, and takes ~15 seconds on my machine to load.
    fprint('\nLoading Wikipedia LSI index...')
    t0 = time.time()
    simsearch.index = MatrixSimilarity.load('./data/lsi_index.mm')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # TODO - It would be interesting to try the 'Similarity' class which
    # shards the dataset on disk for you...

    return (simsearch, ksearch, titles_to_id)
def explain_lda_prediction(inputs, prediction, index, num_topics):
    book_meta_data = pd.read_csv(os.path.join(datapath, "meta_dataset.csv"))
    vector = prediction[-num_topics:]
    sims = index[vector]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[:10]
    print("books that are similar by LDA: ")
    for sim in sims:
        book_id = sim[0]
        book_title = book_meta_data.title[book_id]
        print("\t", book_title)


" OPEN RBM MODEL PARAMETERS "
rbm_parameters = open_pickle(modelpath, "lda_rbm_parameters")
W = rbm_parameters['W']
bv = rbm_parameters['bv']
bh = rbm_parameters['bh']
num_topics = 50

from gensim.test.utils import datapath as dp
from gensim.similarities import MatrixSimilarity

index_file = dp(os.path.join(modelpath, "lda_similarity.index"))
index = MatrixSimilarity.load(index_file)

user_vecs = open_pickle(picklepath, "lda_rbm_inputs")