def get_features(article1_name, article2_name, article1_words, article2_words,
                 num_lda_topics, name_to_type, type_to_depth, type_to_node):
    """Build the feature vector describing a pair of articles.

    The returned list holds, in order:
      1. Jaccard similarity of the two articles' word sets
      2. Hellinger distance between the articles' LDA topic distributions
      3. cosine similarity between the articles' TF-IDF vectors
      4. number of words in article 1
      5. number of words in article 2

    Args:
        article1_name: not used by this function; kept so existing callers
            keep working.
        article2_name: not used by this function; kept so existing callers
            keep working.
        article1_words: sized collection of the words of article 1.
        article2_words: sized collection of the words of article 2.
        num_lda_topics: topic count forwarded to the ``lda`` helpers.
        name_to_type: not used by this function.
        type_to_depth: not used by this function.
        type_to_node: not used by this function.

    Returns:
        A list of five numeric features in the order given above.
    """
    features = []

    # Build each word set once and reuse it for both the intersection and
    # the union.
    words1 = set(article1_words)
    words2 = set(article2_words)

    # feature 1: word overlap (Jaccard similarity).
    # float() keeps this a true division under Python 2 as well.
    size_union = float(len(words1 | words2))
    if size_union:
        feat1 = len(words1 & words2) / size_union
    else:
        # Both articles are empty: there is no overlap evidence, so report
        # 0.0 instead of raising ZeroDivisionError.
        feat1 = 0.0
    features.append(feat1)

    # feature 2: LDA - Hellinger distance between topic distributions
    topics1 = lda.get_topics_for_article_text(article1_words, num_lda_topics)
    topics2 = lda.get_topics_for_article_text(article2_words, num_lda_topics)
    features.append(lda.get_hellinger(topics1, topics2, num_lda_topics))

    # feature 3: TF-IDF - cosine similarity
    tfidf1 = lda.get_tfidf_for_doc(article1_words)
    tfidf2 = lda.get_tfidf_for_doc(article2_words)
    features.append(lda.get_cosine_sim(tfidf1, tfidf2))

    # feature 4: number of words in article 1
    features.append(len(article1_words))

    # feature 5: number of words in article 2
    features.append(len(article2_words))

    return features
def get_article_distance(article1_name, article2_name, num_lda_topics=10):
    """Return the Hellinger distance between two articles' LDA topics.

    Each article is looked up by name through ``wiki_index.get_article``;
    whatever that returns is handed to the ``lda`` helpers to obtain a topic
    distribution per article, and the two distributions are compared with
    ``lda.get_hellinger``.

    Only the LDA-based distance is used here. Word-overlap and TF-IDF
    cosine features were tried in this function at one point and are
    computed by ``get_features`` instead.

    Args:
        article1_name: name of the first article, as understood by
            ``wiki_index.get_article``.
        article2_name: name of the second article.
        num_lda_topics: topic count forwarded to the ``lda`` helpers.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The value produced by ``lda.get_hellinger`` for the two articles.
    """
    # NOTE(review): get_features passes word collections to
    # get_topics_for_article_text; presumably get_article returns the same
    # shape of data -- confirm against wiki_index.
    article1_text = wiki_index.get_article(article1_name)
    article2_text = wiki_index.get_article(article2_name)

    topics1 = lda.get_topics_for_article_text(article1_text, num_lda_topics)
    topics2 = lda.get_topics_for_article_text(article2_text, num_lda_topics)
    return lda.get_hellinger(topics1, topics2, num_lda_topics)