def get_jensen_shannon(components, ntopics):
    """Summarise the pairwise ``jensen_shannon`` values between topic rows.

    Each unordered pair of distinct rows among the first ``ntopics`` rows of
    ``components`` is scored exactly once; the higher-indexed row is passed
    as the first argument.

    Args:
        components: Object supporting ``components[i, :]`` row indexing;
            presumably one topic distribution per row -- confirm with caller.
        ntopics: Number of leading rows to compare with each other.

    Returns:
        A ``(minimum, mean)`` tuple of the pairwise scores, computed with
        ``np.min`` and ``np.mean``.

    Note:
        With fewer than two rows there are no pairs, so the score list is
        empty and ``np.min`` raises ``ValueError``.
    """
    pair_scores = [
        jensen_shannon(components[hi, :], components[lo, :])
        for hi in range(ntopics)
        for lo in range(hi)
    ]
    return np.min(pair_scores), np.mean(pair_scores)
def testNewAuthorTopics(self):
    """Check ``get_new_author_topics`` output and its rollback on failure.

    The test trains a model, adds an author named "test" via ``update``,
    then infers topics for the same two documents with
    ``get_new_author_topics``. It verifies that the result is a sequence of
    ``(int, float)`` pairs, that it is close to the topics stored for
    "test", and that a failing call leaves the sizes of the model's
    author/document bookkeeping unchanged.
    """
    # NOTE(review): np.random.seed(0) returns None, so this seeds NumPy's
    # global RNG and passes random_state=None. Presumably the model then
    # falls back to the (now seeded) global RNG -- confirm before changing.
    model = self.class_(
        corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
        passes=100, random_state=np.random.seed(0)
    )
    author2doc_newauthor = {"test": [0, 1]}
    model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor)

    # temp save model state vars before get_new_author_topics is called
    state_gamma_len = len(model.state.gamma)
    author2doc_len = len(model.author2doc)
    author2id_len = len(model.author2id)
    id2author_len = len(model.id2author)
    doc2author_len = len(model.doc2author)

    new_author_topics = model.get_new_author_topics(corpus=corpus[0:2])

    # sanity check: dedicated assertions report the offending value on
    # failure instead of a bare "False is not true"
    for k, v in new_author_topics:
        self.assertIsInstance(k, int)
        self.assertIsInstance(v, float)

    # make sure topics are similar enough
    similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics))
    self.assertGreaterEqual(similarity, 0.9)

    # produce an error to test if rollback occurs: a single document is
    # passed where a slice of the corpus was used above
    with self.assertRaises(TypeError):
        model.get_new_author_topics(corpus=corpus[0])

    # assure rollback was successful and the model state is as before
    self.assertEqual(state_gamma_len, len(model.state.gamma))
    self.assertEqual(author2doc_len, len(model.author2doc))
    self.assertEqual(author2id_len, len(model.author2id))
    self.assertEqual(id2author_len, len(model.id2author))
    self.assertEqual(doc2author_len, len(model.doc2author))
def testNewAuthorTopics(self):
    """Verify new-author topic inference and state rollback after an error.

    A model is trained and extended with an author named "test" through
    ``update``. ``get_new_author_topics`` is then run on the same two
    documents; the result must consist of ``(int, float)`` pairs and be
    close to the topics stored for "test". Finally a call expected to raise
    ``TypeError`` is made, and the sizes of the model's author/document
    bookkeeping must match the values captured beforehand.
    """
    # NOTE(review): np.random.seed(0) returns None, so this seeds NumPy's
    # global RNG and passes random_state=None. Presumably the model then
    # falls back to the (now seeded) global RNG -- confirm before changing.
    model = self.class_(
        corpus,
        author2doc=author2doc,
        id2word=dictionary,
        num_topics=2,
        passes=100,
        random_state=np.random.seed(0)
    )
    author2doc_newauthor = {"test": [0, 1]}
    model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor)

    # temp save model state vars before get_new_author_topics is called
    state_gamma_len = len(model.state.gamma)
    author2doc_len = len(model.author2doc)
    author2id_len = len(model.author2id)
    id2author_len = len(model.id2author)
    doc2author_len = len(model.doc2author)

    new_author_topics = model.get_new_author_topics(corpus=corpus[0:2])

    # sanity check: assertIsInstance names the offending value and type on
    # failure, unlike assertTrue(isinstance(...))
    for k, v in new_author_topics:
        self.assertIsInstance(k, int)
        self.assertIsInstance(v, float)

    # make sure topics are similar enough
    similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics))
    self.assertGreaterEqual(similarity, 0.9)

    # produce an error to test if rollback occurs: a single document is
    # passed where a slice of the corpus was used above
    with self.assertRaises(TypeError):
        model.get_new_author_topics(corpus=corpus[0])

    # assure rollback was successful and the model state is as before
    self.assertEqual(state_gamma_len, len(model.state.gamma))
    self.assertEqual(author2doc_len, len(model.author2doc))
    self.assertEqual(author2id_len, len(model.author2id))
    self.assertEqual(id2author_len, len(model.id2author))
    self.assertEqual(doc2author_len, len(model.doc2author))
with open("movie_ldabow.txt", "rb") as fp: lda_bow = pickle.load(fp) df_res = pd.DataFrame({"title": [], "rec": []}) df_title = pd.DataFrame({"title": [], "dist": []}) for i in range(34, len(titles)): df_title.empty df_curr = pd.DataFrame({"title": [titles[i]]}) start = time.time() for j in range(len(lda_bow)): if j != i: dst = jensen_shannon(lda_bow[i], lda_bow[j], 130) df_individual = pd.DataFrame({"title": [titles[j]], "dist": [dst]}) df_title = df_title.append(df_individual, ignore_index=True) df_title = df_title.sort_values(by=['dist']) df_rec = pd.DataFrame({"rec": [df_title['title'][0:50]]}) rec = df_rec["rec"][0].tolist() df_rec = pd.DataFrame({"rec": [rec]}) df_ind_res = pd.merge(df_curr, df_rec, left_index=True, right_index=True, how='inner') df_res = df_res.append(df_ind_res, ignore_index=True) if i == 0: df_ind_res.to_csv('movie_recommendation.csv', index=False) else: df_ind_res.to_csv('movie_recommendation.csv', mode='a', index=False, header=False) if i == 99: # remove this or change this as requirement | Here i is number of movies in CSV
def js_dist(X):
    """Return ``pdist`` of ``X`` using ``jensen_shannon`` as the pair metric.

    Args:
        X: Collection of observations handed straight to ``pdist``;
            presumably one distribution per row -- confirm with caller.

    Returns:
        Whatever ``pdist`` returns for the custom metric (for SciPy's
        ``pdist`` this would be the condensed distance vector -- assumes
        ``pdist`` is ``scipy.spatial.distance.pdist``, TODO confirm).
    """

    def _pair_metric(u, v):
        # Thin adapter so only the two vectors are forwarded to the metric.
        return jensen_shannon(u, v)

    return pdist(X, _pair_metric)
"-->'1' to get a random movie and it's recommendation\n" + "-->'2' to see some of the titles in the dataset\n" + "-->'3' or 'STOP' to end sequence\n" + "-->Title copied from dataset\n" + "Input: ") df_res = pd.DataFrame({"title": [], "rec": []}) df_title = pd.DataFrame({"title": [], "dist": []}) if check == '1': num = random.randint(0, i - 25) print("Getting recommendations. Please wait\n") df_curr = pd.DataFrame({"title": [titles[num]]}) for j in range(len(lda_bow)): if j != num: dst = jensen_shannon(lda_bow[num], lda_bow[j], 130) df_individual = pd.DataFrame({ "title": [titles[j]], "dist": [dst] }) df_title = df_title.append(df_individual, ignore_index=True) df_title = df_title.sort_values(by=['dist']) df_rec = pd.DataFrame({"rec": [df_title['title'][0:50]]}) rec = df_rec["rec"][0].tolist() title = titles[num] rec = rec[0:10] print("Movie: %s\nTop 10 Recommendations: %s" % (title, rec)) elif check == '2': start = random.randint(0, i - 20)