import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Project-local helpers (load_topic_model, read_slack_msgs, read_articles,
# ascii_text, get_rank) are assumed to be defined elsewhere in this package.


def print_topic_results_html(self, n_top_terms=10, n_top_articles=5):
    '''
    Write the results of topic modeling to an HTML file: for each topic,
    the n_top_terms highest-weighted terms and the n_top_articles
    highest-weighted articles.
    - INPUT: self.model_name, self.topic_terms, self.W, self.df
    - OUTPUT: file named model_name + '.html'
    '''
    model_name = self.model_name
    W_t = self.W.T  # each row holds the per-article weights for one topic
    topic_terms = self.topic_terms
    df2 = self.df
    print 'print_topic_results_html'
    with open(model_name + '.html', 'w') as out_fh:
        out_fh.write('<html>\n<body>')
        for topic_idx, article_w in enumerate(W_t):
            out_fh.write("<h1>Topic #%d: </h1>\n" % topic_idx)
            # terms for this topic, sorted by descending weight
            terms = topic_terms[topic_idx]
            sorted_terms = sorted(terms.items(), key=lambda x: x[1],
                                  reverse=True)
            txt_list = ['%s (%.4f)' % (term, weight)
                        for term, weight in sorted_terms[:n_top_terms]]
            out_fh.write('<p><strong>top terms: ' + ' '.join(txt_list)
                         + '</strong></p>\n')
            out_fh.write('-----------------------------------<br>\n')
            # indexes of the articles with the highest weight on this topic
            idx_article_topn = article_w.argsort()[:-n_top_articles - 1:-1]
            for i, idx in enumerate(idx_article_topn):
                url = df2.iloc[idx]['url']
                title_this = df2.iloc[idx]['title']
                title_this_cleaned = ascii_text(title_this)
                if title_this_cleaned == '':
                    title_this_cleaned = url
                out_fh.write('<p> %d. (%.2f)</p>\n' % (i, article_w[idx]))
                out_fh.write('<a href="' + url
                             + '" target="_blank"> %s </a> <br>\n'
                             % title_this_cleaned)
                body_text_str = df2.iloc[idx]['body_text'][:400]
                body_cleaned = ascii_text(body_text_str)
                out_fh.write(body_cleaned + ' \n<br>\n')
            out_fh.write('\n')
        out_fh.write('</body>\n</html>\n')
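
# The methods in this section rely on an ascii_text helper that is used but
# never defined here. A minimal sketch of one plausible implementation,
# assuming its job is to drop characters that cannot be encoded as ASCII so
# the HTML output is safe to write; the project's actual helper may differ.
def ascii_text(text):
    # Hypothetical stand-in for the project's real helper.
    if text is None:
        return ''
    if isinstance(text, str):
        # assume utf-8 for byte strings; ignore undecodable bytes
        text = text.decode('utf8', 'ignore')
    return text.encode('ascii', 'ignore')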
def get_recommendation_dataframe(self, cosine_similarities,
                                 cosine_sim_latent_topics,
                                 cosine_similarities_rank,
                                 cosine_sim_latent_topics_rank):
    '''
    - OUTPUT: df_recom, a data frame in which each row r has:
        r['score']: dict of cosine_similarities[i],
            cosine_similarities_rank[i], cosine_sim_latent_topics[i],
            cosine_sim_latent_topics_rank[i]
        r['topics']: list of (itopic, topic_name, weight) tuples
        r['title'], r['body_text'], r['url']
    '''
    desc_sim_indexes = np.argsort(cosine_similarities)[::-1]
    recommended_articles = []
    i_print = 0
    # fall back to a default normalization ceiling if none is configured
    max_cosine_sim_tfidf = self.max_cosine_sim_tfidf or 0.5
    for i in desc_sim_indexes[:self.max_rank]:
        if (cosine_sim_latent_topics_rank[i] < self.max_rank
                and i_print < self.top_k_recommend):
            r = {}
            url = self.df_articles.iloc[i].url
            r['score'] = {
                'cosine_sim_tfidf': cosine_similarities[i],
                'cosine_sim_tfidf_rank': cosine_similarities_rank[i],
                'cosine_sim_latent_topics': cosine_sim_latent_topics[i],
                'cosine_sim_latent_topics_rank':
                    cosine_sim_latent_topics_rank[i]}
            r['title'] = self.df_articles.iloc[i].title
            body_cleaned = ascii_text(
                self.df_articles.iloc[i].body_text[:self.max_len_body_text])
            r['body_text'] = body_cleaned
            r['url'] = url
            # (itopic, topic_name, weight) tuples for the top topics
            r['topics'] = self.sorted_topics_articles[i][:self.top_k_topics]
            # scale the tf-idf similarity to a 0-100 relevance score
            r['relevance'] = min(
                100,
                int(cosine_similarities[i] / max_cosine_sim_tfidf * 100))
            recommended_articles.append(r)
            print cosine_similarities[i], cosine_similarities_rank[i], \
                '***' + self.df_articles.iloc[i].title + '***'
            i_print += 1
    df_recom = pd.DataFrame(recommended_articles)
    return df_recom
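
# The rank arguments passed to get_recommendation_dataframe (and computed in
# make_recommendation below) come from a get_rank helper that this section
# uses but does not define. A minimal sketch, assuming it maps each score to
# its rank under a descending sort (rank 0 = most similar); the real helper
# may differ.
def get_rank(scores):
    # argsort of the descending argsort gives each element's rank
    order = np.argsort(scores)[::-1]
    ranks = np.empty(len(scores), dtype=int)
    ranks[order] = np.arange(len(scores))
    return ranks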
def make_recommendation(fname, model_name='v2_2'):
    '''
    Test-run content-based recommendation on the content in fname
    (command-line entry point for making recommendations).
    - INPUT: fname str, input file name (in folder data/)
    - OUTPUT: diagnostic plots saved next to fname; recommendations
      printed to stdout
    '''
    # load model
    t0 = time.time()
    func_tokenizer = TfidfVectorizer(stop_words='english').build_tokenizer()
    topic_model = load_topic_model(model_name, func_tokenizer)
    t1 = time.time()  # time it
    print "finished in %4.4f min %s" % ((t1 - t0) / 60, 'loading model\n')
    t0 = t1
    print 'fname: %s' % fname

    # read in input
    cleaned_slack = read_slack_msgs(func_tokenizer, fname=fname)
    W, tokenized_slacks2, test_X2 = topic_model.transform_bodytext2topics(
        [cleaned_slack], 1)
    print 'topics for slack messages'
    print topic_model.sorted_topics_for_articles(W)
    t1 = time.time()  # time it
    print "finished in %4.4f min %s" % ((t1 - t0) / 60,
                                        'topics of slack message\n')

    # load articles
    t0 = t1
    df_articles = read_articles()
    W_articles, tokenized_articles, X_articles = \
        topic_model.transform_bodytext2topics(df_articles.body_text, 1)
    sorted_topics_articles = topic_model.sorted_topics_for_articles(
        W_articles)
    t1 = time.time()  # time it
    print '%i articles processed' % df_articles.shape[0]
    print "finished in %4.4f min for %s" % ((t1 - t0) / 60,
                                            'topics of articles\n')

    # summary of input: the most heavily weighted tf-idf features
    top_n = 50
    print "top %i most frequent features in input %s" % (top_n, fname)
    sorted_feature_indexes = np.argsort(test_X2, axis=1)
    features = topic_model.vectorizer.get_feature_names()
    i_article = 0
    desc_feature_indexes = sorted_feature_indexes[
        i_article, :].getA().flatten()[::-1]
    txt_list = []
    for i in desc_feature_indexes[:top_n]:
        txt_list.append('%s (%.2f)' % (features[i], test_X2[i_article, i]))
    print ', '.join(txt_list)

    # calculate similarity to all articles
    t0 = time.time()
    cosine_similarities = linear_kernel(
        X_articles, test_X2[i_article, :]).flatten()
    cosine_sim_latent_topics = linear_kernel(
        W_articles, W[i_article, :]).flatten()
    cosine_similarities_rank = get_rank(cosine_similarities)
    cosine_sim_latent_topics_rank = get_rank(cosine_sim_latent_topics)
    t1 = time.time()  # time it
    print "finished in %4.4f min for %s" % ((t1 - t0) / 60,
                                            'calculate cosine similarity\n')

    # diagnostic similarity plots
    fig, ax = plt.subplots(1, 2, figsize=(10, 6))
    ax[0].scatter(cosine_similarities, cosine_sim_latent_topics, alpha=0.2)
    ax[0].set_title('cosine similarity: tfidf vs. latent topic')
    ax[1].scatter(cosine_similarities_rank, cosine_sim_latent_topics_rank,
                  alpha=0.2)
    ax[1].set_title('rank cosine similarity: tfidf vs. latent topic')
    fig.savefig(fname.replace('.txt', '') + '_similarities.png')
    plt.close(fig)

    fig = plt.figure()
    plt.hist(cosine_similarities, bins=30, alpha=0.2, label='tfidf')
    plt.hist(cosine_sim_latent_topics, bins=30, alpha=0.2, label='topics')
    plt.title('cosine similarity to all articles')
    plt.legend()
    fig.savefig(fname.replace('.txt', '') + '_similarity_hist.png')

    # recommendations: articles ranking highly on both similarity measures
    print '--------------- recommendations --------------'
    desc_sim_indexes = np.argsort(cosine_similarities)[::-1]
    i_print = 0
    top_k = 5
    for i in desc_sim_indexes[:50]:
        if cosine_sim_latent_topics_rank[i] < 50 and i_print < top_k:
            url = df_articles.iloc[i].url
            print sorted_topics_articles[i][0:2]
            print cosine_similarities[i], cosine_similarities_rank[i], \
                '***' + df_articles.iloc[i].title + '***'
            print cosine_sim_latent_topics[i], \
                cosine_sim_latent_topics_rank[i]
            print url
            body_cleaned = ascii_text(df_articles.iloc[i].body_text[:300])
            print body_cleaned
            print
            i_print += 1
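
# A sketch of the command-line entry point implied by the make_recommendation
# docstring above. The script name in the usage string and the positional
# argument layout are assumptions, not taken from the original source.
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        print 'usage: python recommend.py <fname> [model_name]'
        sys.exit(1)
    fname_arg = sys.argv[1]
    model_name_arg = sys.argv[2] if len(sys.argv) > 2 else 'v2_2'
    make_recommendation(fname_arg, model_name=model_name_arg)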