def get_article_distance(article1_name, article2_name):
    """Return the Hellinger distance between the LDA topics of two articles.

    Both articles are fetched by name from ``wiki_index``; each text is
    turned into a topic vector with ``lda.get_topics_for_article_text`` and
    the two vectors are compared with ``lda.get_hellinger``.

    Earlier experiments with other features (count of shared non-stop words,
    Jaccard similarity, tf-idf cosine similarity) are disabled; the Hellinger
    value is the only feature computed and returned.
    """
    # Passed both to the topic lookup and to the distance function;
    # presumably the number of LDA topics -- TODO confirm against ``lda``.
    topic_count = 10

    first_text = wiki_index.get_article(article1_name)
    second_text = wiki_index.get_article(article2_name)

    first_topics = lda.get_topics_for_article_text(first_text, topic_count)
    second_topics = lda.get_topics_for_article_text(second_text, topic_count)

    return lda.get_hellinger(first_topics, second_topics, topic_count)
def extract_nlp_features(article1_name, article2_name, num_lda_topics,
                         name_to_type, type_to_depth, type_to_node):
    """Fetch both article texts and delegate to ``get_features``.

    The two article names are resolved to their texts through
    ``wiki_index.get_article``; names, texts and all remaining arguments are
    then forwarded unchanged to ``get_features``, whose result is returned.
    """
    # The returned mapping is not used here. The call is kept because it may
    # have side effects (e.g. loading the index) -- TODO confirm and drop.
    wiki_index.get_article_name_to_linenum()

    # Article texts (lists of words, per the original author's note).
    first_words = wiki_index.get_article(article1_name)
    second_words = wiki_index.get_article(article2_name)

    return get_features(
        article1_name,
        article2_name,
        first_words,
        second_words,
        num_lda_topics,
        name_to_type,
        type_to_depth,
        type_to_node,
    )
def check_adjlist_articles(adj_list_arg):
    """Report how many articles referenced by an adjacency list are missing.

    For every key of ``adj_list_arg`` and every entry in that key's neighbour
    collection, the id is converted to a title through ``linenum_to_title``
    and looked up with ``wiki_index.get_article``. A ``KeyError`` from that
    lookup counts as one error. The error count and the total number of
    lookups are printed at the end; nothing is returned.

    A ``KeyError`` raised by ``linenum_to_title`` itself is not caught and
    propagates to the caller.
    """

    def _lookup_fails(title):
        # True when the article index has no entry for ``title``.
        try:
            wiki_index.get_article(title)
        except KeyError:
            return True
        return False

    print("Checking adj list...")

    missing = 0
    checked = 0
    for node in adj_list_arg:
        node_title = linenum_to_title[str(node)]
        checked += 1
        if _lookup_fails(node_title):
            missing += 1

        for neighbour in adj_list_arg[node]:
            neighbour_title = linenum_to_title[str(neighbour)]
            checked += 1
            if _lookup_fails(neighbour_title):
                missing += 1

    print("Number of errors = %d; total = %d" % (missing, checked))
def get_topics_for_article_name(article_name, num_topics):
    """Return the LDA model's output for the named article.

    ``num_topics`` selects one of the module-level models ``lda_10``,
    ``lda_30``, ``lda_60`` or ``lda_120``. The article is fetched from
    ``wiki_index``, converted with ``dictionary.doc2bow`` and used to index
    the selected model.

    Raises:
        ValueError: if ``num_topics`` is not 10, 30, 60 or 120. This is
            checked before the article is looked up.
    """
    if num_topics not in (10, 30, 60, 120):
        raise ValueError("bad number of topics")

    # Only the selected model's global name is ever looked up.
    if num_topics == 10:
        topic_model = lda_10
    elif num_topics == 30:
        topic_model = lda_30
    elif num_topics == 60:
        topic_model = lda_60
    else:
        topic_model = lda_120

    words = wiki_index.get_article(article_name)
    bag_of_words = dictionary.doc2bow(words)
    return topic_model[bag_of_words]
def get_tfidf_for_article_name(article_name):
    """Return the tf-idf model's output for the named article.

    The article is fetched from ``wiki_index``, converted with
    ``dictionary.doc2bow`` and the result is used to index the module-level
    ``tfidf`` object.
    """
    words = wiki_index.get_article(article_name)
    bag_of_words = dictionary.doc2bow(words)
    return tfidf[bag_of_words]