def __init__(self, min_publications_per_author, topics_per_author):
    self.load_data()
    self.get_computed_TOM()
    self.min_publications_per_author = min_publications_per_author
    self.n_topics_per_author = topics_per_author
    # Order the vocabulary tokens by id so feature_names lines up with the
    # columns of the topic-word matrix. The original zip(*...)[1] only works
    # in Python 2, where zip() returns a subscriptable list.
    feature_names = [token for _, token in
                     sorted(self.topic_model.corpus.vocabulary.items(),
                            key=lambda a: a[0])]
    self.dict_topic_top_words, \
    self.dict_doc_top_topics, \
    self.dict_topic_top_docs = nmf_clustering(
        data=None,
        doc_topic_mat=self.topic_model.document_topic_matrix,
        topic_token_mat=self.topic_model.topic_word_matrix,
        feature_names=feature_names)
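
None of these snippets include nmf_clustering itself. As a point of reference, here is a minimal sketch of the matrix-input path it is called with above (data=None plus precomputed factor matrices); the keyword names come from the call, while the top-N defaults, the dense-array assumption, and the internals are guesses:

import numpy as np


def nmf_clustering(data=None, doc_topic_mat=None, topic_token_mat=None,
                   feature_names=None, n_top_words=10, n_top_docs=10,
                   n_top_topics=3):
    # Hypothetical reconstruction: derive the three lookup dicts the callers
    # unpack from already-factorized W (docs x topics) and H (topics x tokens)
    # matrices. Assumes dense NumPy arrays, not scipy sparse matrices.
    W = np.asarray(doc_topic_mat)
    H = np.asarray(topic_token_mat)
    dict_topic_top_words = {
        t: [feature_names[i] for i in H[t].argsort()[::-1][:n_top_words]]
        for t in range(H.shape[0])}
    dict_doc_top_topics = {
        d: W[d].argsort()[::-1][:n_top_topics].tolist()
        for d in range(W.shape[0])}
    dict_topic_top_docs = {
        t: W[:, t].argsort()[::-1][:n_top_docs].tolist()
        for t in range(W.shape[1])}
    return dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs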
import pprint
from collections import defaultdict


def create_nmf_graph():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_topics = nmf_clustering(one_pages)
    pprint.pprint(dict_topics)
    # Graph construction is unfinished: the d3.js-style adjacency dict is
    # created but never filled, and the snap.py sketch below was abandoned.
    d3js_dict = defaultdict(list)
    # import snap as sn
    # G = sn.TUNGraph.New()
    # for topic_idx, topic in dict_topics.items():
    return d3js_dict
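
The commented-out snap.py loop above never materialized. One hedged way the graph step could be finished is a plain nodes/links dict in the shape d3.js force layouts consume; topics_to_d3js is a hypothetical helper, and it assumes dict_topics maps a topic index to its list of top words:

def topics_to_d3js(dict_topics):
    # Hypothetical helper: build a d3.js-style node/link structure where
    # each topic node is connected to its top-word nodes.
    d3js_dict = {"nodes": [], "links": []}
    index = {}

    def node_id(name, group):
        if name not in index:
            index[name] = len(d3js_dict["nodes"])
            d3js_dict["nodes"].append({"name": name, "group": group})
        return index[name]

    for topic_idx, top_words in dict_topics.items():
        t = node_id("topic_%d" % topic_idx, 0)
        for word in top_words:
            d3js_dict["links"].append({"source": t,
                                       "target": node_id(word, 1)})
    return d3js_dict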
def __init__(self, theta, min_n_papers_per_author, n_topics_per_author, topic_topic=False):
    self.load_computed_topics()
    # As above: order the vocabulary tokens by id so feature_names matches
    # the columns of the topic-word matrix (Python 3 safe, unlike the
    # original zip(*...)[1]).
    feature_names = [token for _, token in
                     sorted(self.topic_model.corpus.vocabulary.items(),
                            key=lambda a: a[0])]
    self.dict_topic_top_words, \
    self.dict_doc_top_topics, \
    self.dict_topic_top_docs = nmf_clustering(
        data=None,
        doc_topic_mat=self.topic_model.document_topic_matrix,
        topic_token_mat=self.topic_model.topic_word_matrix,
        feature_names=feature_names)
    self.load_data()
    self.theta = theta
    self.min_publications_per_author = min_n_papers_per_author
    self.n_topics_per_author = n_topics_per_author
    self.topic_topic_recomms = topic_topic
def get_1page_topics():
    # Cluster the one-page PDF text extracts into NMF topics (default k).
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    return nmf_clustering(one_pages)
def get_abstract_and_title_topics():
    # Cluster EGC articles on concatenated title + abstract text, keyed by
    # article id; missing abstracts become empty strings.
    egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    egc_df["title+abstract"] = egc_df["title"] + " " + egc_df["abstract"].fillna("")
    data = dict(zip(egc_df["id"].tolist(), egc_df["title+abstract"].tolist()))
    return nmf_clustering(data, k=15)
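
Both loaders above hand raw text straight to nmf_clustering. A minimal sketch of what that text path could look like on top of scikit-learn's TfidfVectorizer and NMF; nmf_clustering_from_text is a hypothetical stand-in, k mirrors the call above, and the single topic-to-top-words dict matches how create_nmf_graph consumes the result:

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


def nmf_clustering_from_text(data, k=15, n_top_words=10):
    # Hypothetical reconstruction of the text path: data maps an id to a
    # document string; fit TF-IDF + NMF and report each topic's top words.
    # Preprocessing (min_df, stop words for the French EGC corpus) would
    # need tuning in practice.
    ids = list(data.keys())
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
    tfidf = vectorizer.fit_transform(data[i] for i in ids)
    nmf = NMF(n_components=k, random_state=1).fit(tfidf)
    feature_names = vectorizer.get_feature_names_out()
    return {t: [feature_names[i] for i in comp.argsort()[::-1][:n_top_words]]
            for t, comp in enumerate(nmf.components_)}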