def test_construct_ptree():
    """Check ``construct_ptree`` against a hand-built six-element fixture.

    In this fixture, entry ``i`` of the expected result lists the indices
    of the earlier label sets that share at least one label with set ``i``.
    """
    A, B, C, D = 'A', 'B', 'C', 'D'
    # Single-label entries are written as real 1-tuples: a bare ``(B)`` is
    # just the string ``B``, and ``set(B)`` would split a longer label
    # into its characters.
    label_groups = [(A, B, C), (B,), (A, C), (D,), (B, C), (A,)]
    # Build a concrete list rather than ``map(set, ...)``, which is a
    # one-shot iterator on Python 3 (no len(), indexing or re-iteration).
    labelset_list = [set(group) for group in label_groups]
    actual_tree = construct_ptree(labelset_list)
    expected_tree = [[], [0], [0], [], [0, 1, 2], [0, 2]]
    # Edge list kept from the original author's note (presumably a reduced
    # form of the same relation -- confirm against construct_ptree):
    # [(0, 1), (0, 2),
    #  (1, 4), (2, 4), (2, 5)]
    assert_equal(actual_tree, expected_tree)
def main():
    """Build a ptree from the tag sets of the stored Bloomberg articles.

    Steps, as performed below:
      1. count tags over every document in ``conn['bloomberg'].articles``;
      2. keep the keys returned by ``most_common`` for the top 50 tags;
      3. compact the articles into edges restricted to those tags;
      4. print a sample of the edges, the number of edges carrying more
         than one tag, and the total number of edges;
      5. order the edges by ``publish_time`` and hand their tag sets to
         ``construct_ptree``.
    """
    from db import conn

    N_TOP_TAGS = 50
    articles = conn['bloomberg'].articles

    # The collection is queried twice: once for counting, once for compaction.
    tag_counts = count_tags(articles.find())
    top_tags = set(tag for tag, _ in tag_counts.most_common(N_TOP_TAGS))
    compact_edges = compactize_edges_by_tags(articles.find(), top_tags)

    # Diagnostics: first ten edges, multi-tag edge count, total edge count.
    pprint(compact_edges[:10])
    print(sum(1 for edge in compact_edges if len(edge['tags']) > 1))
    pprint(len(compact_edges))

    # Sort a copy by publication time so compact_edges itself is untouched.
    chronological_edges = list(compact_edges)
    chronological_edges.sort(key=lambda item: item['publish_time'])

    tagset_list = []
    for edge in chronological_edges:
        tagset_list.append(edge['tags'])
    tree = construct_ptree(tagset_list)