def insert_new_relation(post1):
    """
    Insert similarity relations between a newly added post and every
    existing non-pdf post.

    Arguments:
    - `post1`: newly added post

    The combined score is content cosine + title cosine + category
    point; pairs scoring above 0.1 get a Relation saved in both
    directions. PDF posts are skipped entirely.
    """
    if post1.post_type == "pdf":
        return None
    # Hoist post1's vectors out of the loop -- they are loop-invariant,
    # so vectorizing once here avoids redoing the work per existing post.
    vector1 = text_to_vector(post1.content.lower())
    tvector1 = text_to_vector(post1.title.lower())
    for post2 in Post.objects():
        # Skip pdf posts and the post itself (identified by url).
        if post2.post_type == "pdf" or post2.url == post1.url:
            continue
        content_cosine = get_cosine(vector1, text_to_vector(post2.content.lower()))
        title_cosine = get_cosine(tvector1, text_to_vector(post2.title.lower()))
        cosine = content_cosine + title_cosine + get_category_point(post1, post2)
        if cosine > 0.1:
            # Store the relation symmetrically so lookups from either
            # post find it.
            Relation(post1, post2, cosine).save()
            Relation(post2, post1, cosine).save()
def build_relation_db(): """ Build a relation collection that includes every similarity between posts. Only includes relation when similarity > 0.2 This takes a lot of time, run this periodically. Eg. once a week or everynight. Use insert_new_relation() for new posts """ posts = Post.objects() posts2 = Post.objects() Relation.drop_collection() counter = 0 print counter for p1 in posts: for p2 in posts2: if p1.url != p2.url: if p1.post_type != "pdf" and p2.post_type != "pdf": counter = counter + 1 # text similarity text1 = p1.content.lower() text2 = p2.content.lower() vector1 = text_to_vector(text1) vector2 = text_to_vector(text2) content_cosine = get_cosine(vector1, vector2) # title similarity title1 = p1.title.lower() title2 = p2.title.lower() tvector1 = text_to_vector(title1) tvector2 = text_to_vector(title2) title_cosine = get_cosine(tvector1, tvector2) category_point = get_category_point(p1, p2) cosine = content_cosine + title_cosine + category_point if cosine > 0.1: relation = Relation(p1, p2, cosine) relation.save() print counter