def __init__(self, min_publications_per_author, topics_per_author):
    """Initialise the instance and derive the topic lookup dictionaries.

    Calls ``self.load_data()`` and ``self.get_computed_TOM()`` first, then
    stores the two thresholds and the three results of ``nmf_clustering``.

    Args:
        min_publications_per_author: stored as
            ``self.min_publications_per_author``.
        topics_per_author: stored as ``self.n_topics_per_author``.
    """
    self.load_data()
    # NOTE(review): presumably this is what populates ``self.topic_model``,
    # which is read below -- confirm against get_computed_TOM().
    self.get_computed_TOM()
    self.min_publications_per_author = min_publications_per_author
    self.n_topics_per_author = topics_per_author

    # Vocabulary values ordered by their key.  The former
    # ``zip(*sorted(...))[1]`` only works where ``zip`` returns a list
    # (Python 2); on Python 3 it raises TypeError, and on an empty
    # vocabulary it raises IndexError.  This form yields the same tuple on
    # Python 2 and an empty tuple for an empty vocabulary.
    vocabulary_items = sorted(
        self.topic_model.corpus.vocabulary.items(),
        key=lambda item: item[0],
    )
    feature_names = tuple(value for _, value in vocabulary_items)

    # data=None: the precomputed matrices of the topic model are passed
    # instead of raw documents.
    (self.dict_topic_top_words,
     self.dict_doc_top_topics,
     self.dict_topic_top_docs) = nmf_clustering(
        data=None,
        doc_topic_mat=self.topic_model.document_topic_matrix,
        topic_token_mat=self.topic_model.topic_word_matrix,
        feature_names=feature_names,
    )
def create_nmf_graph():
    """Cluster the one-page texts and pretty-print the clustering result.

    Work in progress: no graph is built yet.  An empty ``defaultdict(list)``
    is created but not filled, and the function returns ``None``.
    """
    from pprint import pprint as pretty_print

    page_texts = load_text_data("../../input/pdfs/1page/", "txt")
    clustering = nmf_clustering(page_texts)
    pretty_print(clustering)

    # Placeholder for the structure to be exported; currently left empty.
    d3js_dict = defaultdict(list)

    # Draft of the graph construction, kept commented out as in the original:
    # import snap as sn
    # G = sn.TUNGraph.New()
    # for topic_idx, topic in dict_topics.iteritems():
    #     pass
def __init__(self, theta, min_n_papers_per_author, n_topics_per_author,
             topic_topic=False):
    """Initialise the instance from the precomputed topic model.

    Calls ``self.load_computed_topics()``, derives the three topic lookup
    dictionaries through ``nmf_clustering``, calls ``self.load_data()`` and
    finally stores the configuration arguments on the instance.

    Args:
        theta: stored as ``self.theta``.
        min_n_papers_per_author: stored as
            ``self.min_publications_per_author``.
        n_topics_per_author: stored as ``self.n_topics_per_author``.
        topic_topic: stored as ``self.topic_topic_recomms``
            (default ``False``).
    """
    # NOTE(review): presumably this is what populates ``self.topic_model``,
    # which is read below -- confirm against load_computed_topics().
    self.load_computed_topics()

    # Vocabulary values ordered by their key.  The former
    # ``zip(*sorted(...))[1]`` only works where ``zip`` returns a list
    # (Python 2); on Python 3 it raises TypeError, and on an empty
    # vocabulary it raises IndexError.  This form yields the same tuple on
    # Python 2 and an empty tuple for an empty vocabulary.
    vocabulary_items = sorted(
        self.topic_model.corpus.vocabulary.items(),
        key=lambda item: item[0],
    )
    feature_names = tuple(value for _, value in vocabulary_items)

    # data=None: the precomputed matrices of the topic model are passed
    # instead of raw documents.
    (self.dict_topic_top_words,
     self.dict_doc_top_topics,
     self.dict_topic_top_docs) = nmf_clustering(
        data=None,
        doc_topic_mat=self.topic_model.document_topic_matrix,
        topic_token_mat=self.topic_model.topic_word_matrix,
        feature_names=feature_names,
    )

    self.load_data()
    self.theta = theta
    self.min_publications_per_author = min_n_papers_per_author
    self.n_topics_per_author = n_topics_per_author
    self.topic_topic_recomms = topic_topic
def get_1page_topics():
    """Run ``nmf_clustering`` on the one-page text files.

    Returns:
        The result of ``nmf_clustering`` applied to whatever
        ``load_text_data("../../input/pdfs/1page/", "txt")`` yields.
    """
    source_dir = "../../input/pdfs/1page/"
    page_texts = load_text_data(source_dir, "txt")
    topics = nmf_clustering(page_texts)
    return topics
def get_abstract_and_title_topics():
    """Cluster the EGC articles on their concatenated title and abstract.

    A ``"title+abstract"`` column is added to the article table: the title,
    a single space, then the abstract (missing abstracts are replaced by an
    empty string).  The texts are keyed by the ``"id"`` column and passed to
    ``nmf_clustering`` with ``k=15``.

    Returns:
        The result of ``nmf_clustering`` for that id -> text mapping.
    """
    raw_export = load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt")
    articles = get_EGC_articles(raw_export)

    titles = articles["title"]
    abstracts = articles["abstract"].fillna("")
    articles["title+abstract"] = titles + " " + abstracts

    article_ids = articles["id"].tolist()
    article_texts = articles["title+abstract"].tolist()
    documents = {doc_id: text for doc_id, text in zip(article_ids, article_texts)}

    return nmf_clustering(documents, k=15)