Example #1
def run_search():
    article_pairs = load_article_pairs()
    adj_list_30k = load_30k_adj_list()

    num_fail = 0
    num_successes = 0

    # each entry: (a1_name, a2_name, # successes, # failures, success_dec_path_lengths)
    results = []

    for c, (article1_name, article2_name) in enumerate(article_pairs[:100]):
        print(c)

        src_id = int(title_to_linenum[article1_name])
        dst_id = int(title_to_linenum[article2_name])

        success_dec_path_lengths = []
        suc = 0   # per-pair success count (0 or 1: a single trial per pair)
        fail = 0  # per-pair failure count

        (success_or_fail, dec_search_path_length) = util.run_decentralized_search(
            src_id, dst_id, adj_list_30k, linenum_to_title, util.get_article_distance)

        if success_or_fail == "FAILURE":
            fail += 1
            num_fail += 1
        else:
            suc += 1
            num_successes += 1
            success_dec_path_lengths.append(dec_search_path_length)

        results.append((article1_name, article2_name, suc, fail,
                        success_dec_path_lengths))

    print("%d successes, %d failures" % (num_successes, num_fail))

    # save results to file
    load_data.save_object(results, "bin/results/feat3.pk1")
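
util.run_decentralized_search itself is not shown in these examples. As a rough sketch of what such a greedy router could look like, under the same conventions used above (adjacency list as a dict of id -> neighbor-id array, titles looked up via str(id), a distance function over two titles) and with hypothetical stand-in names rather than the project's actual implementation:

def run_decentralized_search(src_id, dst_id, adj_list, id_to_title,
                             distance_fn, max_steps=50):
    """Greedy decentralized search: at each step, hop to the neighbor
    whose title is closest to the destination title under distance_fn."""
    dst_title = id_to_title[str(dst_id)]
    curr = src_id
    for step in range(max_steps):
        if curr == dst_id:
            return ("SUCCESS", step)
        neighbors = adj_list.get(curr)
        if neighbors is None or len(neighbors) == 0:
            return ("FAILURE", step)  # dead end: no outgoing links
        # move to the neighbor minimizing distance to the destination
        curr = min(neighbors,
                   key=lambda n: distance_fn(id_to_title[str(n)], dst_title))
    return ("FAILURE", max_steps)  # step budget exhausted

The ("SUCCESS"/"FAILURE", path_length) return shape matches how the callers here branch on the result.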
Example #2
def save_article_pairs():
    NUM_PAIRS = 50000

    articles_30k = load_30k_articles()
    article_pairs = []  # list of (name1, name2)

    for _ in range(NUM_PAIRS):
        article1_name = random.choice(articles_30k)
        article2_name = random.choice(articles_30k)
        # resample until the two articles are distinct
        while article1_name == article2_name:
            article2_name = random.choice(articles_30k)
        article_pairs.append((article1_name, article2_name))

    load_data.save_object(article_pairs, ARTICLE_PAIRS_FILE)
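
The inner rejection loop can also be collapsed with random.sample, which draws two distinct positions per call. A minimal alternative sketch, assuming articles_30k contains no duplicate titles (which holds here, since it originates from a set):

import random

def sample_distinct_pairs(articles, num_pairs):
    # random.sample picks two distinct list positions, so each pair
    # holds two different articles as long as titles are unique
    return [tuple(random.sample(articles, 2)) for _ in range(num_pairs)]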
Example #3
def process_in_snappy():
    global articles, adj_list

    print("Starting graph processing...")

    G1 = create_snap_graph_from_adjlist(adj_list)

    print("Finding largest SCC...")

    G = snap.GetMxScc(G1)

    print("Size of max SCC (nodes): %d" % G.GetNodes())
    print("Size of max SCC (edges): %d" % G.GetEdges())

    # keep only the articles whose nodes survive in the SCC
    print("Updating articles...")
    new_articles = []
    for node in G.Nodes():
        node_id = node.GetId()
        article_name = linenum_to_title[str(node_id)]
        new_articles.append(article_name)

    print("Length of new articles = %d" % len(new_articles))

    # rebuild adj_list from the SCC's edges
    print("Updating adj_list...")
    new_adj_list = {}
    for Edge in G.Edges():
        num_src = np.uint32(Edge.GetSrcNId())
        num_dst = np.uint32(Edge.GetDstNId())

        if num_src not in new_adj_list:
            new_adj_list[num_src] = np.array([], dtype=np.uint32)
        new_adj_list[num_src] = np.append(new_adj_list[num_src], num_dst)

    # save adj_list and articles
    print("Saving to binary...")
    articles = new_articles
    adj_list = new_adj_list
    load_data.save_object(new_adj_list, "bin/adj_list.pk1")
    load_data.save_object(new_articles, "bin/article_names.pk1")
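
A performance caveat on the edge loop above: np.append copies the entire array on every call, so building a node's neighbor array this way costs O(d^2) in its out-degree d. An equivalent construction (a hypothetical helper producing the same dict-of-uint32-arrays format) buffers edges in Python lists and converts once per node:

import numpy as np

def build_adj_list(edges):
    # edges: iterable of (src_id, dst_id) pairs
    buckets = {}
    for src, dst in edges:
        buckets.setdefault(np.uint32(src), []).append(dst)
    # one array conversion per node instead of one copy per edge
    return {src: np.array(dsts, dtype=np.uint32)
            for src, dsts in buckets.items()}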
Example #4
def run_random_search():
    article_pairs = load_article_pairs()
    adj_list_30k = load_30k_adj_list()

    # each entry: (a1_name, a2_name, # successes, # failures, success_dec_path_lengths)
    results = []

    for (article1_name, article2_name) in article_pairs[:100]:
        src_id = int(title_to_linenum[article1_name])
        dst_id = int(title_to_linenum[article2_name])

        success_dec_path_lengths = []
        num_successes = 0
        num_fail = 0

        try:
            # 1000 independent search trials for this pair
            for i in range(1000):
                (success_or_fail, dec_search_path_length) = util.run_decentralized_search(
                    src_id, dst_id, adj_list_30k, linenum_to_title, util.get_article_distance)

                if success_or_fail == "FAILURE":
                    num_fail += 1
                else:
                    num_successes += 1
                    success_dec_path_lengths.append(dec_search_path_length)

        except KeyError:
            # skip pairs whose titles are missing from the lookup tables
            continue

        results.append((article1_name, article2_name, num_successes, num_fail,
                        success_dec_path_lengths))

        print("%d successes, %d failures" % (num_successes, num_fail))

    print("Number of pairs actually completed: %d" % len(results))

    # save results to file
    load_data.save_object(results, "bin/results/random_dec_search_1ktrials.pk1")
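
If load_data.save_object is a thin pickle wrapper (an assumption; its definition is not shown here), the saved tuples can be aggregated afterwards along these lines:

import pickle

def summarize_results(path="bin/results/random_dec_search_1ktrials.pk1"):
    with open(path, "rb") as f:
        results = pickle.load(f)  # (a1, a2, n_suc, n_fail, lengths) tuples
    total_suc = sum(r[2] for r in results)
    total_fail = sum(r[3] for r in results)
    lengths = [l for r in results for l in r[4]]
    rate = float(total_suc) / max(total_suc + total_fail, 1)
    mean_len = sum(lengths) / max(len(lengths), 1)
    print("success rate %.3f, mean successful path length %.2f"
          % (rate, mean_len))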
Example #5
def save_30k_articles(G):
    NUM = 30000
    articles_30k = set()
    ids_30k = set()
    curr_hop = 1

    # grow a ball around a random seed article, one BFS hop at a time
    first_article = random.choice(articles)
    first_id = int(title_to_linenum[first_article])
    articles_30k.add(first_article)
    ids_30k.add(first_id)

    while len(articles_30k) < NUM:
        NodeVec = snap.TIntV()
        snap.GetNodesAtHop(G, first_id, curr_hop, NodeVec, True)
        for next_id in NodeVec:
            title = linenum_to_title[str(next_id)]
            articles_30k.add(title)
            ids_30k.add(next_id)
        curr_hop += 1

    print("It took %d hops to get to %d nodes!" % (curr_hop, NUM))

    load_data.save_object(list(articles_30k), ARTICLE_NAMES_30K_FILE)

    # restrict adj_list to edges whose endpoints both fall in the sample
    new_adj_list = {}
    for key in adj_list:
        if key in ids_30k:
            if key not in new_adj_list:
                new_adj_list[key] = np.array([], dtype=np.uint32)
            for node_id in adj_list[key]:
                if node_id in ids_30k:
                    new_adj_list[key] = np.append(new_adj_list[key], node_id)

    load_data.save_object(new_adj_list, ADJ_LIST_30K_FILE)
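
The same hop-by-hop ball growing can be done without SNAP, directly on the dict-of-arrays adjacency list: GetNodesAtHop with IsDir=True follows directed edges, which a level-by-level BFS reproduces. A sketch under those assumptions:

def grow_ball(adj_list, seed_id, target_size):
    # expand the BFS frontier one hop at a time until target_size ids
    # are collected (or the frontier dies out)
    visited = {seed_id}
    frontier = {seed_id}
    hops = 0
    while len(visited) < target_size and frontier:
        nxt = set()
        for u in frontier:
            for v in adj_list.get(u, ()):
                if v not in visited:
                    visited.add(v)
                    nxt.add(v)
        frontier = nxt
        hops += 1
    return visited, hops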
Example #6
def generate_gold():
    G = load_30k_graph_object()
    article_pairs = load_article_pairs()
    results = []
    for (article1_name, article2_name) in article_pairs[:100]:
        src_id = int(title_to_linenum[article1_name])
        dst_id = int(title_to_linenum[article2_name])

        # gold labels: true shortest-path length plus ontology-based distances
        shortest_path_length = get_graph_shortest_path(G, src_id, dst_id)
        (ont_dist, lca_height) = get_ontology_distance(article1_name, article2_name)

        results.append((article1_name, article2_name, shortest_path_length,
                        ont_dist, lca_height))

    load_data.save_object(results, "bin/results/goldset.pk1")
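
get_graph_shortest_path is defined elsewhere; on this unweighted graph it amounts to a BFS distance, which could be computed directly over the adjacency list as follows (a stand-in sketch, not the project's implementation):

from collections import deque

def bfs_shortest_path_length(adj_list, src_id, dst_id):
    # unweighted shortest-path length via BFS; None if unreachable
    if src_id == dst_id:
        return 0
    dist = {src_id: 0}
    queue = deque([src_id])
    while queue:
        u = queue.popleft()
        for v in adj_list.get(u, ()):
            if v not in dist:
                dist[v] = dist[u] + 1
                if v == dst_id:
                    return dist[v]
                queue.append(v)
    return None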


Example #7
def save_pairwise_distances(actual_shortest_path):
    load_data.save_object(actual_shortest_path, "bin/pairwise_distances.pk1")