def testOnlineTransform(self):
    corpus = list(self.corpus)
    doc = corpus[0]  # use the corpus' first document for testing

    # create the transformation model
    model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)  # compute everything at once
    model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)  # start with no documents, we will add them later

    # train model on a single document
    model.add_documents([corpus[0]])

    # transform the testing document with this partial transformation
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
    expected = np.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
    self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

    # train on another 4 documents
    model.add_documents(corpus[1:5], chunksize=2)  # train on 4 extra docs, in chunks of 2 documents

    # transform the same document with this partial transformation
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
    expected = np.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
    self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

    # train on the rest of the documents
    model.add_documents(corpus[5:])

    # make sure the final transformation is the same as if we had decomposed the whole corpus at once
    vec1 = matutils.sparse2full(model[doc], model.num_topics)
    vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
    self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5))  # the two LSI representations must be equal up to sign
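# The test above exercises gensim's online (incremental) training API. Below is a
# minimal standalone sketch of the same pattern -- my addition, with an illustrative
# toy corpus; only LsiModel, add_documents and the [] projection come from gensim.
from gensim import corpora, models

docs = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]

lsi = models.LsiModel(num_topics=2, id2word=dictionary)  # start with no documents
lsi.add_documents(bow[:1])  # train incrementally...
lsi.add_documents(bow[1:])  # ...as more documents arrive
print(lsi[bow[0]])  # project a document into the current LSI space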
def testPersistence(self):
    model = lsimodel.LsiModel(self.corpus, numTopics=2)
    model.save(testfile())
    model2 = lsimodel.LsiModel.load(testfile())
    self.assertEqual(model.numTopics, model2.numTopics)
    self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
    self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print('dictionary loaded')

    # load the corpus
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print('mm corpus loaded')

    t31 = time.time()
    # tf-idf transformation
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print("tfidf_corpus time = ", t32 - t31)

    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC, power_iters=2, chunksize=50000, onepass=True, distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=20000)
    lsi = None
    try:
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,
                                chunksize=60000, power_iters=2, onepass=True)  # all other parameters left at their defaults
        lsi.save(lsipath + 'viva.lsi')
        print('lsi model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception:
        logging.error('Failed to train lsi model', exc_info=True)
def testLargeMmap(self):
    model = lsimodel.LsiModel(self.corpus, num_topics=2)

    # test storing the internal arrays into separate files
    model.save(testfile(), sep_limit=0)

    model2 = lsimodel.LsiModel.load(testfile())
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
    self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    # now load the external arrays via mmap
    model2 = lsimodel.LsiModel.load(testfile(), mmap='r')
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
    self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def generateTopic(self, wordsLists, method=TopicMethod.LSI, numTopics=25):
    """Step 4: topic-vector transformation.

    Note: after transforming with LDA, text-similarity comparisons showed
    very poor results, so LSI is used instead, which works well.
    Created by flx on 2018-4-7
    """
    bowCorpus = self.generateBow(wordsLists)
    tfidfCorpus = self.generateTfidf(bowCorpus)
    if method == TopicMethod.LDA:
        instance = ldamodel.LdaModel(tfidfCorpus, id2word=self.dictionary, num_topics=numTopics)
        CacheUtil.dumpTopicModel(instance)
    elif method == TopicMethod.LSI:
        instance = lsimodel.LsiModel(tfidfCorpus, id2word=self.dictionary, num_topics=numTopics)
        CacheUtil.dumpTopicModel(instance)
    dstCorpus = instance[tfidfCorpus]

    # gensim returns each transformed document as a list of (topic_id, weight) tuples, e.g.
    #   vec = [(0, 0.12345), (2, 0.458124), (4, 0.485263), (7, 0.589542), ...]
    # where only the non-zero entries are stored;
    # convert each one to an ordinary dense vector
    features = []
    for doc in dstCorpus:
        vector = [0] * numTopics
        for pair in doc:
            vector[pair[0]] = pair[1]
        features.append(vector)
    return features
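# Side note (my addition): the manual dense-conversion loop in generateTopic
# replicates gensim's built-in helper; with the same variables it reduces to:
from gensim import matutils

features = [matutils.sparse2full(doc, numTopics).tolist() for doc in dstCorpus]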
def initialize(self, myid, dispatcher, **model_params):
    self.lock_update = threading.Lock()
    self.jobsdone = 0  # how many jobs has this worker completed?
    self.myid = myid  # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
    self.dispatcher = dispatcher
    logger.info("initializing worker #%s" % myid)
    self.model = lsimodel.LsiModel(**model_params)
def finalize(self):
    if self.model_exist:
        return
    if self.num_of_scans == 1:
        print("Loaded the model from file.")
    else:
        print("Performing SVD...")
        # svd = SVD(n_components=self.num_of_features, random_state=42)
        # x = svd.fit_transform(self.vectors)
        # self.vectors = x
        x = Sparse2Corpus(self.vectors)
        lsi = lsimodel.LsiModel(corpus=x, id2word=None, num_topics=self.num_of_features)
        lsi.save(self.model_file_name)
        self.vectors = lsi.projection.u
        print("done.")
    if self.n <= 1:
        self.n = 2.0
    self.mean = self.sum / self.n
    self.var = (self.sum_sq - (self.sum * self.sum) / self.n) / (self.n - 1)
    self.var = math.sqrt(self.var)  # note: despite the name, this now holds the standard deviation
    f = open(self.stat_filename, 'a')
    lang_pair = self.src_language + self.trg_language
    f.write("\n" + lang_pair + "\n")
    f.write("stats\t" + str(self.mean) + "\t" + str(self.var) + "\n")
    f.close()
def testPersistence(self):
    model = lsimodel.LsiModel(self.corpus, num_topics=2)
    model.save(testfile())
    model2 = lsimodel.LsiModel.load(testfile())
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
    self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def testPersistenceCompressed(self):
    fname = testfile() + '.gz'
    model = lsimodel.LsiModel(self.corpus, num_topics=2)
    model.save(fname)
    model2 = lsimodel.LsiModel.load(fname, mmap=None)
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
    self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def testTransform(self):
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, numTopics=2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.doc2vec(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = [0.1973928, 0.05591352]
    self.assertTrue(numpy.allclose(abs(vec), expected))  # transformed entries must be equal up to sign
def testLargeMmapCompressed(self):
    fname = testfile() + '.gz'
    model = lsimodel.LsiModel(self.corpus, num_topics=2)

    # test storing the internal arrays into separate files
    model.save(fname, sep_limit=0)

    # now load the external arrays via mmap
    return  # turns out this test doesn't exercise this, because there are no arrays to be mmaped!
    self.assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r')
def testTransform(self):
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, numTopics=2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign
def testCorpusTransform(self):
    """Test lsi[corpus] transformation."""
    model = lsimodel.LsiModel(self.corpus, num_topics=2)
    got = numpy.vstack([matutils.sparse2full(doc, 2) for doc in model[self.corpus]])
    expected = numpy.array([
        [0.65946639, 0.14211544],
        [2.02454305, -0.42088759],
        [1.54655361, 0.32358921],
        [1.81114125, 0.5890525],
        [0.9336738, -0.27138939],
        [0.01274618, -0.49016181],
        [0.04888203, -1.11294699],
        [0.08063836, -1.56345594],
        [0.27381003, -1.34694159]])
    self.assertTrue(numpy.allclose(abs(got), abs(expected)))  # must be equal up to sign
def testTransform(self):
    """Test lsi[vector] transformation."""
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, num_topics=2)

    # make sure the decomposition is accurate enough
    u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
    self.assertTrue(numpy.allclose(s[:2], model.projection.s))  # singular values must match

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign
def lsi(corpus, num_topics, tfidf=False):
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    if tfidf:
        corpus = tfidf_bow(corpus)
    lsi_model = lsimodel.LsiModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    result = lsi_model[corpus]
    topics = {topic: [] for topic in range(num_topics)}
    for i in range(len(corpus)):
        t = dict(result[i])
        if len(t) > 0:
            # assign the document to its highest-weighted topic
            max_topic = list(t.keys())[0]
            for topic_no, value in t.items():
                if t[max_topic] < value:
                    max_topic = topic_no
            topics[max_topic].append(i)
    return topics
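# Side note (my addition): the hand-rolled argmax inside lsi() can be written with
# Python's built-in max; same variables as above, behavior unchanged:
for i in range(len(corpus)):
    t = dict(result[i])
    if t:
        topics[max(t, key=t.get)].append(i)  # topic with the largest weight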
def create_model(self):
    if not os.path.isfile(self.model_file):
        if self.model_name == 'LSI':
            self.model = lsimodel.LsiModel(corpus=self.corpus,
                                           id2word=self.dictionary, num_topics=self.num_topics)
        else:
            self.model = ldamodel.LdaModel(corpus=self.corpus,
                                           num_topics=self.num_topics, id2word=self.dictionary)
        self.model.save(self.model_file)
        self.corpora = self.model[self.corpus]
        corpora.MmCorpus.serialize(self.corpora_file, self.corpora)
    else:
        self.corpora = gensim.corpora.MmCorpus(self.corpora_file)
        if self.model_name == 'LSI':
            self.model = gensim.models.LsiModel.load(self.model_file)
        else:
            self.model = gensim.models.LdaModel.load(self.model_file)
def lda_keyWords(cat):
    df = pd.read_csv('train_set.csv', encoding="utf_8_sig")
    text = []
    for i in range(len(df)):
        if df['label'][i] == cat:
            text += splitWords(df['content'][i])
    text = [text]
    dictionary = corpora.Dictionary(text)
    corpus = [dictionary.doc2bow(t) for t in text]
    # print(cat + ':')
    lsi = lsimodel.LsiModel(corpus, id2word=dictionary)
    # print("LSI: ", lsi.print_topics(5))
    lda = ldamodel.LdaModel(corpus, id2word=dictionary)
    # print("LDA: ", lda.print_topics(5))
    wc_lsi(cat, lsi, 0)
    wc_lsi(cat, lda, 1)
def initialize(self, myid, dispatcher, **model_params):
    """Fully initialize the worker.

    Parameters
    ----------
    myid : int
        An ID number used to identify this worker in the dispatcher object.
    dispatcher : :class:`~gensim.models.lsi_dispatcher.Dispatcher`
        The dispatcher responsible for scheduling this worker.
    **model_params
        Keyword parameters to initialize the inner LSI model, see :class:`~gensim.models.lsimodel.LsiModel`.

    """
    self.lock_update = threading.Lock()
    self.jobsdone = 0  # how many jobs has this worker completed?
    # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
    self.myid = myid
    self.dispatcher = dispatcher
    self.finished = False
    logger.info("initializing worker #%s", myid)
    self.model = lsimodel.LsiModel(**model_params)
def do_after_a_full_scan(self, num_of_finished_scans):
    # First iteration of a normal run (collecting the vocabulary)
    if num_of_finished_scans == 1 and self.num_of_scans == 3:
        self.vocab = Counter(self.all_words)
        self.all_words = {}
        for word in self.vocab:
            if self.vocab[word] >= self.min_count:
                self.all_words[word] = len(self.all_words)
        self.vectors = lil_matrix((len(self.all_words), self.number_of_tus), dtype=np.int8)
        print("-#-#-#-#-#-#-#-#-#-#-#-")
        print("size of vocab:", len(self.vocab))
        print("size of common words:", len(self.all_words))
        print("number of TUs:", self.number_of_tus)
        self.number_of_tus = 0
        f = open(self.dict_file_name, "a+")
        for w in self.all_words:
            f.write(w)
            f.write("\t" + str(self.all_words[w]) + "\n")
        f.close()
    # Second iteration of a normal run (making the tu-word matrix)
    elif num_of_finished_scans == 2:
        print("Performing SVD...")
        x = Sparse2Corpus(self.vectors)
        lsi = lsimodel.LsiModel(corpus=x, id2word=None, num_topics=self.num_of_features)
        lsi.save(self.model_file_name)
        self.vectors = lsi.projection.u
        print("done.")
    else:
        print("-#-#-#-#-#-#-#-#-#-#-#-")
def train(self, filepath, dict_path, docs, num_topics=5, chunksize=2000):
    if path.exists(filepath):
        LOGGER.info('Model already exists...load model')
        self._inner_model = lsimodel.LsiModel.load(filepath)
    else:
        start = time.time()
        clean_docs = [d for d in docs]
        if path.exists(dict_path):
            LOGGER.info('Dictionary already exists...loading dictionary')
            self._dict = corpora.Dictionary.load(dict_path)
        else:
            self._dict = corpora.Dictionary(clean_docs)
            self._dict.save(dict_path)
        self.dict_time = time.time() - start
        corpus_dict = self._dict
        corpus = [self._dict.doc2bow(x) for x in clean_docs]
        # tfidf = tfidfmodel.TfidfModel(corpus)
        # corpus_tfidf = tfidf[corpus]
        self._inner_model = lsimodel.LsiModel(corpus, num_topics=num_topics,
                                              id2word=corpus_dict, chunksize=chunksize)
        self._inner_model.save(filepath)
        self.model_time = time.time() - start
    return self
def NLP_process(df, dictionary=None, post_lsi=None, title_lsi=None, num_lsi_topics=None, use_timer=True):
    """
    Function for NLP pre-processing.
    If dictionary isn't specified, create it from the posts and titles.
    If post_lsi and title_lsi are not specified, create them as well.
    """
    from gensim.models import lsimodel

    if use_timer:
        my_timer = SimpleTimer()
    posts_tokenized = ProcessText(df.selftext)
    # posts_tokenized = []
    if use_timer:
        my_timer.elapsed('Processed Posts')
    titles_tokenized = ProcessText(df.title)
    if use_timer:
        my_timer.elapsed('Processed Titles')

    if not dictionary:
        dictionary = CreateCorpusDictionary(posts_tokenized + titles_tokenized)
    if use_timer:
        my_timer.elapsed('Created Dictionary')

    posts_vec = Vectorize_text(posts_tokenized, dictionary)
    titles_vec = Vectorize_text(titles_tokenized, dictionary)
    print(len(titles_vec), df.shape)

    df_new = df.copy()
    df_new = df_new.assign(post_word_len2=[len(post) for post in posts_vec])
    df_new = df_new.assign(title_word_len2=[len(post) for post in titles_vec])
    df_new = df_new[sorted(df_new.columns)]
    if use_timer:
        my_timer.elapsed('Vectorized')

    if not post_lsi:
        post_lsi = lsimodel.LsiModel(posts_vec, num_topics=num_lsi_topics, id2word=dictionary)
    if not title_lsi:
        title_lsi = lsimodel.LsiModel(titles_vec, num_topics=num_lsi_topics, id2word=dictionary)
    if use_timer:  # guard added: my_timer only exists when use_timer is True
        my_timer.elapsed('Trained LSI')

    post_lsi_features = ComputeDocumentLSIs(posts_vec, post_lsi, num_lsi_topics, label_base='post_lsi')
    if use_timer:
        my_timer.elapsed('Computed Post LSIs')
    title_lsi_features = ComputeDocumentLSIs(titles_vec, title_lsi, num_lsi_topics, label_base='title_lsi')
    if use_timer:
        my_timer.elapsed('Computed Title LSIs')

    post_lsi_features = post_lsi_features.set_index(df_new.index)
    title_lsi_features = title_lsi_features.set_index(df_new.index)
    df_new = df_new.join(post_lsi_features)
    df_new = df_new.join(title_lsi_features)
    df_new = df_new.drop(['selftext', 'title'], axis=1)
    if use_timer:
        my_timer.elapsed('Completed {} records'.format(len(df_new)))

    return (df_new, dictionary, post_lsi, title_lsi)
def setUp(self):
    self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids" % len(id2word))

corpus = MmCorpus(config.resultFile('bow.mm'))

if method == 'tfidf':
    model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    model.save(config.resultFile('model_tfidf.pkl'))
elif method == 'lda':
    model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
    model.save(config.resultFile('model_lda.pkl'))
elif method == 'lsi':
    # first, transform word counts to tf-idf weights
    tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    # then find the transformation from tf-idf to latent space
    model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI)
    model.save(config.resultFile('model_lsi.pkl'))
elif method == 'rp':
    # first, transform word counts to tf-idf weights
    tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
    # then find the transformation from tf-idf to latent space
    model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP)
    model.save(config.resultFile('model_rp.pkl'))
else:
    raise ValueError('unknown topic extraction method: %s' % repr(method))

MmCorpus.saveCorpus(config.resultFile('corpus_%s.mm' % method), model[corpus])

logging.info("finished running %s" % program)
ratings_df.loc[:, 'rating'] = sk.minmax_scale(ratings_df.loc[:, 'rating'])
print(ratings_df.loc[:, 'rating'])
print(ratings_df)
print(ratings_df.head())

R_df = ratings_df.pivot(index='user_id', columns='book_id', values='rating').fillna(0).to_sparse(fill_value=0)
print(R_df.head())
R = R_df.as_matrix()
if not np.isinf(R).all():
    print("tr")
    # print(np.isinf(R), np.isnan(R))

Z = gensim.matutils.Dense2Corpus(R, documents_columns=True)
print(Z)
# user_ratings_mean = np.mean(R, axis=1)
# print(R.size)

lsi = ls.LsiModel(Z, num_topics=3)
print("Sigma")
print(lsi.projection.s)
print("U")
print(lsi.projection.u)
print("VT")
V = gensim.matutils.corpus2dense(lsi[Z], len(lsi.projection.s)).T / lsi.projection.s
print(V)
logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) id2word = corpora.DmlCorpus.loadDictionary( config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) if method == 'tfidf': corpus = corpora.MmCorpus(config.resultFile('bow.mm')) model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) model.save(config.resultFile('tfidfmodel.pkl')) elif method == 'lda': corpus = corpora.MmCorpus(config.resultFile('bow.mm')) model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA) model.save(config.resultFile('ldamodel%i.pkl' % DIM_LDA)) elif method == 'lsi' or method == 'lsa': # first, transform word counts to tf-idf weights corpus = corpora.MmCorpus(config.resultFile('bow.mm')) tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) # then find the transformation from tf-idf to latent space model = lsimodel.LsiModel(tfidf.apply(corpus), id2word=id2word, numTopics=DIM_LSI) model.save(config.resultFile('lsimodel%i.pkl' % DIM_LSI)) elif method == 'rp': raise NotImplementedError( "Random Projections not converted to the new interface yet") else: raise ValueError('unknown topic extraction method: %s' % repr(method)) logging.info("finished running %s" % program)
if use_pickle:
    results = useThreads()
    dictionary = corpora.Dictionary(results)
    print(dictionary)
    dictionary.filter_extremes()
    print(dictionary)
    corpus = [dictionary.doc2bow(text) for text in results]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    with open('models//tfidf_corpus.pickle', 'wb') as output:
        pickle.dump(corpus_tfidf, output, pickle.HIGHEST_PROTOCOL)
    with open('models//dictionary.pickle', 'wb') as output:
        pickle.dump(dictionary, output, pickle.HIGHEST_PROTOCOL)
else:
    with open('models//tfidf_corpus.pickle', 'rb') as input:
        corpus_tfidf = pickle.load(input)
    with open('models//dictionary.pickle', 'rb') as input:
        dictionary = pickle.load(input)

# renamed from `lsimodel` so the model instance doesn't shadow the lsimodel module
lsi_model = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
corpus_lsi = lsi_model[corpus_tfidf]

# lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=300, update_every=1, chunksize=10000, passes=1)
# lda.save("models//lda.pickle")
# hdp = models.hdpmodel.HdpModel(corpus_tfidf, id2word=dictionary)
# hdp.save("models//hdp.pickle")
# hdp.update_expectations()
# hdpformatter = models.hdpmodel.HdpTopicFormatter(hdp.id2word, hdp.m_lambda + hdp.m_eta)
# pprint(hdpformatter.show_topics(topics=-1, topn=20))

print(time.time() - start_time, "seconds")
# lemmatize the words in fileIN (in Python 3, str is already unicode, so the
# original Python 2 unicode(word, 'utf-8') wrapper is dropped)
txt = [[lemm.lemmatize(word) for word in d.lower().split()
        if word not in stop and len(word) > 3]
       for d in fileIN]

# compute the word frequencies over fileIN
all_tokens = sum(txt, [])
# print(type(all_tokens))

# build a set of all tokens in fileIN that occur fewer than 2 times
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) < 2)

# keep a word only if its frequency is more than one, to avoid one-off tokens
texts = [[word for word in text if word not in tokens_once] for text in txt]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# num_topics here is the number of groups we want to extract
lsi = lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=20)

if len(fileIN) > 1:
    tfidf = models.TfidfModel(corpus)
    doctfidf = tfidf[corpus]
    # num_topics here is the number of terms we want to output for each group
    lsit = lsimodel.LsiModel(doctfidf, id2word=dictionary, num_topics=10)

dd = dict()
for i in range(0, lsi.num_topics):
    fileOut.write(lsi.print_topic(i) + '\n')
    dd[i] = lsi.print_topic(i)
def build_model(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    # Use the tf-idf corpus here, not the original one.
    mm = MmCorpus(mm_corpus_path)
    lsi = lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=400)
    lsi.save('/home/andre/Develop/corpora/lsamodel_lsi.model')
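# Usage sketch for the model saved by build_model (my addition; the query text is
# illustrative, while load, doc2bow and the [] projection are standard gensim calls):
from gensim.models import lsimodel

lsi = lsimodel.LsiModel.load('/home/andre/Develop/corpora/lsamodel_lsi.model')
bow = lsi.id2word.doc2bow("example query text".lower().split())  # id2word was stored with the model
print(lsi[bow])  # sparse list of (topic_id, weight) pairs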
# Normalize the ratings to the range 0..1 with min-max scaling, new_x = (x - min) / (max - min):
# 1 becomes 0, 2 becomes 0.25, 3 becomes 0.5, 4 becomes 0.75, 5 becomes 1
ratings_dataset.loc[:, 'rating'] = sk.minmax_scale(ratings_dataset.loc[:, 'rating'])

# Print the first five rows after normalization
print(ratings_dataset.head())

# Reshape the data (produce a "pivot" table) based on column values.
# Uses unique values from index/columns to form the axes of the resulting DataFrame.
R = ratings_dataset.pivot(index='user_id', columns='song_id', values='rating').fillna(0).to_sparse(fill_value=0)
print(R.head())

# Interpret the input as a matrix.
R_matrix = R.as_matrix()

# Treat the dense numpy array as a streamed gensim corpus in BoW format.
R_corpus = gensim.matutils.Dense2Corpus(R_matrix, documents_columns=True)
print(R_corpus)

# Fast truncated SVD
lsi = ls.LsiModel(R_corpus, num_topics=3)
print("Sigma Matrix (Singular Values):\n")
print(lsi.projection.s)
print("U Matrix:\n")
print(lsi.projection.u)
print("V Transpose Matrix:\n")
VT = gensim.matutils.corpus2dense(lsi[R_corpus], len(lsi.projection.s)).T / lsi.projection.s
print(VT)
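# Why dividing by the singular values recovers V (my addition): with documents as
# columns, the truncated SVD is R ≈ U · diag(s) · V^T, and lsi[doc] returns U^T · doc,
# i.e. that document's row of V scaled by s. So the VT variable above actually holds
# V (documents x topics). A reconstruction sanity check under that reading:
import numpy as np

U, s = lsi.projection.u, lsi.projection.s
R_approx = U @ np.diag(s) @ VT.T  # back to users x songs
print(np.abs(R_matrix - R_approx).max())  # error of the rank-3 approximation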
def getTfidfLsiSims(corpus, confId, confIdtoIndex, dictionary, outputDir):
    print("Using gensim to get TFIDF vector and LSI vector for conferences in corpus")

    # tfidf
    tfidf = tfidfmodel.TfidfModel(corpus)  # initialize a tfidf transformation for corpus
    corpus_tfidf = tfidf[corpus]  # get tfidf vectors

    # lsi
    lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=4)  # initialize an LSI transformation for corpus, with number of topics = 4
    corpus_lsi = lsi[corpus_tfidf]

    ####### not important, just printing
    print("Printing TF-IDF vectors in " + outputDir + '/conffTFIDF.txt')
    fTFIDFFile = open(outputDir + '/conffTFIDF.txt', 'w')
    j = 0
    for doc in corpus_tfidf:
        print(confId[j], doc, file=fTFIDFFile)
        j = j + 1
        if j % 100 == 0:
            print(j)
    tfidf.save(outputDir + '/conftfidf.mod')
    # print("length of corpus is", len(corpus))

    printvectors = False
    if printvectors:
        i = 0
        for doc in corpus_tfidf:
            print("tfidf doc", confId[i], doc)
            i += 1
        i = 0
        for doc in corpus_lsi:
            print("lsi doc", confId[i], doc)
            i += 1
    ####### not important

    # compute similarity of corpus against itself
    listofMethods = ['corpus_lsi', 'corpus_tfidf']
    for method in listofMethods:
        if method == 'corpus_lsi':
            cor = corpus_lsi
        elif method == 'corpus_tfidf':
            cor = corpus_tfidf
        index = similarities.MatrixSimilarity(cor)
        confSims = dict()
        confSimsDict = dict()  # dictionary of [confId1][confId2]
        j = 0
        sims = []
        for vec_tfidf in cor:
            sims = index[vec_tfidf]
            sims = sorted(enumerate(sims), key=lambda item: -item[1])
            confSims[confId[j]] = sims  # this line is not actually needed
            confSimsDict[j] = dict(sims)
            # print("index:", confIdtoIndex[confId[j]], "confId:", confId[j], confSims[confId[j]])
            j += 1
        if method == 'corpus_lsi':
            cslsi = dict()
            for c1index in confSimsDict.keys():
                cslsi[confId[c1index]] = dict()
                for c2index in confSimsDict.keys():
                    cslsi[confId[c1index]][confId[c2index]] = confSimsDict[c1index][c2index]
        elif method == 'corpus_tfidf':
            cstfidf = dict()
            for c1index in confSimsDict.keys():
                cstfidf[confId[c1index]] = dict()
                for c2index in confSimsDict.keys():
                    cstfidf[confId[c1index]][confId[c2index]] = confSimsDict[c1index][c2index]
    return cstfidf, cslsi
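# Scaling note (my addition): MatrixSimilarity above keeps the whole cosine-similarity
# index in RAM. For corpora that don't fit, gensim's Similarity class shards the index
# to disk; a minimal sketch (the prefix path is illustrative):
from gensim import similarities

index = similarities.Similarity('/tmp/conf_index', corpus_lsi, num_features=lsi.num_topics)
sims = index[next(iter(corpus_lsi))]  # similarities of the first document to all others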