示例#1
0
def get_features(article1_name, article2_name, article1_words, article2_words, num_lda_topics, name_to_type, type_to_depth, type_to_node):
    """Build the pairwise feature vector for two articles.

    The returned list contains, in this order:

    1. Jaccard similarity of the two articles' word sets
       (0.0 when both word collections are empty).
    2. The value of ``lda.get_hellinger`` applied to the two topic vectors
       returned by ``lda.get_topics_for_article_text``.
    3. The value of ``lda.get_cosine_sim`` applied to the two vectors
       returned by ``lda.get_tfidf_for_doc``.
    4. ``len(article1_words)``.
    5. ``len(article2_words)``.

    Args:
        article1_name: Unused here; kept so existing callers keep working.
        article2_name: Unused here; kept so existing callers keep working.
        article1_words: Sized collection of hashable word tokens.
        article2_words: Sized collection of hashable word tokens.
        num_lda_topics: Topic count forwarded to the ``lda`` helpers.
        name_to_type: Unused here; kept so existing callers keep working.
        type_to_depth: Unused here; kept so existing callers keep working.
        type_to_node: Unused here; kept so existing callers keep working.

    Returns:
        A list of five numeric features as described above.
    """
    # feature 1: word overlap (Jaccard similarity). Build each set once.
    words1 = set(article1_words)
    words2 = set(article2_words)
    union_size = len(words1 | words2)
    if union_size:
        jaccard = float(len(words1 & words2)) / union_size
    else:
        # Both articles are empty: the ratio is undefined (0/0). Report
        # "no overlap" instead of raising ZeroDivisionError.
        jaccard = 0.0

    # feature 2: LDA - Hellinger distance between topic distributions
    topics1 = lda.get_topics_for_article_text(article1_words, num_lda_topics)
    topics2 = lda.get_topics_for_article_text(article2_words, num_lda_topics)
    hellinger = lda.get_hellinger(topics1, topics2, num_lda_topics)

    # feature 3: TF-IDF - cosine similarity
    tfidf1 = lda.get_tfidf_for_doc(article1_words)
    tfidf2 = lda.get_tfidf_for_doc(article2_words)
    cosine = lda.get_cosine_sim(tfidf1, tfidf2)

    # features 4 and 5: word counts of article 1 and article 2
    return [
        jaccard,
        hellinger,
        cosine,
        len(article1_words),
        len(article2_words),
    ]
示例#2
0
def get_article_distance(article1_name, article2_name, num_lda_topics=10):
    """Return the LDA-based distance between two named articles.

    Both article texts are fetched with ``wiki_index.get_article``, turned
    into topic vectors with ``lda.get_topics_for_article_text``, and compared
    with ``lda.get_hellinger``.

    Args:
        article1_name: Name passed to ``wiki_index.get_article``.
        article2_name: Name passed to ``wiki_index.get_article``.
        num_lda_topics: Topic count forwarded to the ``lda`` helpers.
            Defaults to 10, the value this function previously hard-coded.

    Returns:
        Whatever ``lda.get_hellinger`` returns for the two topic vectors.
    """
    # NOTE(review): what get_article returns for an unknown name is not
    # visible here - confirm the lda helpers tolerate that value.
    article1_text = wiki_index.get_article(article1_name)
    article2_text = wiki_index.get_article(article2_name)

    topics1 = lda.get_topics_for_article_text(article1_text, num_lda_topics)
    topics2 = lda.get_topics_for_article_text(article2_text, num_lda_topics)
    return lda.get_hellinger(topics1, topics2, num_lda_topics)