def run_search():
    #G = load_30k_graph_object()
    article_pairs = load_article_pairs()
    adj_list_30k = load_30k_adj_list()

    num_fail = 0
    num_successes = 0
    results = []
    c = 0
    for (article1_name, article2_name) in article_pairs[:100]:
        print c
        #print "Article 1: %s, article 2: %s" % (article1_name, article2_name)
        src_id = int(title_to_linenum[article1_name])
        dst_id = int(title_to_linenum[article2_name])

        success_dec_path_lengths = []
        suc = 0
        fail = 0
        #try:
        (success_or_fail, dec_search_path_length) = util.run_decentralized_search(src_id, dst_id, \
            adj_list_30k, linenum_to_title, util.get_article_distance)
        #shortest_path_length = get_graph_shortest_path(G, src_id, dst_id)
        #(ont_dist, lca_height) = get_ontology_distance(article1_name, article2_name)

        # failure
        if success_or_fail == "FAILURE":
            fail += 1
            num_fail += 1
        else:
            suc += 1
            num_successes += 1
            success_dec_path_lengths.append(dec_search_path_length)
        # except KeyError:
        #     continue

        x = (article1_name, article2_name, suc, fail, success_dec_path_lengths)
        results.append(x)
        c += 1
        #print success_or_fail

    print "%d successes, %d failures" % (num_successes, num_fail)

    # save object to file
    load_data.save_object(results, "bin/results/feat3.pk1")
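# Minimal sketch of reading back the results pickle that run_search() writes and
# summarizing it. It assumes load_data.save_object produces a standard pickle;
# summarize_results is only an illustrative helper name, not defined elsewhere here.
import pickle

def summarize_results(path="bin/results/feat3.pk1"):
    with open(path, "rb") as f:
        results = pickle.load(f)
    # each entry is (article1_name, article2_name, suc, fail, success_dec_path_lengths)
    total_suc = sum(suc for (_, _, suc, _, _) in results)
    total_fail = sum(fail for (_, _, _, fail, _) in results)
    all_lengths = [l for (_, _, _, _, lengths) in results for l in lengths]
    print "success rate: %.3f" % (float(total_suc) / max(total_suc + total_fail, 1))
    if all_lengths:
        print "mean successful path length: %.2f" % (sum(all_lengths) / float(len(all_lengths)))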
def save_article_pairs():
    NUM_PAIRS = 50000
    articles_30k = load_30k_articles()

    article_pairs = []  # list of (name1, name2)
    count = 0
    while count < NUM_PAIRS:
        article1_name = random.choice(articles_30k)
        article2_name = random.choice(articles_30k)
        while article1_name == article2_name:
            article2_name = random.choice(articles_30k)
        article_pairs.append((article1_name, article2_name))
        count += 1

    load_data.save_object(article_pairs, ARTICLE_PAIRS_FILE)
def process_in_snappy():
    global articles, adj_list

    print "Starting graph processing..."
    G1 = create_snap_graph_from_adjlist(adj_list)

    print "Finding largest SCC..."
    G = snap.GetMxScc(G1)
    print "Size of max SCC (nodes): %s" % str(G.GetNodes())
    print "Size of max SCC (edges): %s" % str(G.GetEdges())

    # update articles: keep only titles whose nodes survive in the max SCC
    print "Updating articles..."
    new_articles = []
    for node in G.Nodes():
        node_id = node.GetId()
        article_name = linenum_to_title[str(node_id)]
        new_articles.append(article_name)
    print "Length of new articles = %d" % len(new_articles)

    # update adj_list: rebuild the adjacency arrays from the SCC's edges
    print "Updating adj_list..."
    new_adj_list = {}
    for Edge in G.Edges():
        src_id = Edge.GetSrcNId()
        dst_id = Edge.GetDstNId()
        num_src = np.uint32(src_id)
        num_dst = np.uint32(dst_id)
        if num_src not in new_adj_list:
            new_adj_list[num_src] = np.array([], dtype=np.uint32)
        new_adj_list[num_src] = np.append(new_adj_list[num_src], num_dst)

    # save adj_list and articles
    print "Saving to binary..."
    articles = new_articles
    adj_list = new_adj_list
    load_data.save_object(new_adj_list, "bin/adj_list.pk1")
    load_data.save_object(new_articles, "bin/article_names.pk1")
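# Sketch of the create_snap_graph_from_adjlist helper that process_in_snappy()
# calls; its real definition lives elsewhere in this repo, so this is only an
# illustration. Assumes adj_list maps a node id to a numpy array of out-neighbor
# ids, matching how the function above rebuilds it.
def create_snap_graph_from_adjlist_sketch(adj_list):
    G = snap.TNGraph.New()  # directed graph, reusing the adjacency list's node ids
    for src in adj_list:
        if not G.IsNode(int(src)):
            G.AddNode(int(src))
        for dst in adj_list[src]:
            if not G.IsNode(int(dst)):
                G.AddNode(int(dst))
            G.AddEdge(int(src), int(dst))
    return G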
def run_random_search():
    article_pairs = load_article_pairs()
    adj_list_30k = load_30k_adj_list()

    # tuple of (a1_name, a2_name, # success, # fail, success_dec_path_lengths)
    results = []
    for (article1_name, article2_name) in article_pairs[:100]:
        src_id = int(title_to_linenum[article1_name])
        dst_id = int(title_to_linenum[article2_name])

        success_dec_path_lengths = []
        num_successes = 0
        num_fail = 0
        try:
            for i in range(1000):
                (success_or_fail, dec_search_path_length) = util.run_decentralized_search(src_id, dst_id, \
                    adj_list_30k, linenum_to_title, util.get_article_distance)
                # failure
                if success_or_fail == "FAILURE":
                    num_fail += 1
                else:
                    num_successes += 1
                    success_dec_path_lengths.append(dec_search_path_length)
        except KeyError:
            continue

        x = (article1_name, article2_name, num_successes, num_fail, success_dec_path_lengths)
        results.append(x)
        print "%d successes, %d failures" % (num_successes, num_fail)

    print "Number of pairs actually completed: %d" % len(results)

    # save object to file
    load_data.save_object(results, "bin/results/random_dec_search_1ktrials.pk1")
def save_30k_articles(G):
    NUM = 30000
    articles_30k = set()
    ids_30k = set()
    curr_hop = 1

    # expand hop-by-hop from a random seed article until ~30k nodes are collected
    first_article = random.choice(articles)
    first_id = int(title_to_linenum[first_article])
    articles_30k.add(first_article)
    ids_30k.add(first_id)
    while len(articles_30k) < NUM:
        NodeVec = snap.TIntV()
        snap.GetNodesAtHop(G, first_id, curr_hop, NodeVec, True)
        for next_id in NodeVec:
            title = linenum_to_title[str(next_id)]
            articles_30k.add(title)
            ids_30k.add(next_id)
        curr_hop += 1

    print "It took %d hops to get to %d nodes!" % (curr_hop, NUM)
    load_data.save_object(list(articles_30k), ARTICLE_NAMES_30K_FILE)

    # save adj_list_30k: keep only edges whose endpoints are both in the 30k set
    new_adj_list = {}
    for key in adj_list.keys():
        if key in ids_30k:
            if key not in new_adj_list:
                new_adj_list[key] = np.array([], dtype=np.uint32)
            for node_id in adj_list[key]:
                if node_id in ids_30k:
                    new_adj_list[key] = np.append(new_adj_list[key], node_id)
    load_data.save_object(new_adj_list, ADJ_LIST_30K_FILE)
def generate_gold():
    G = load_30k_graph_object()
    article_pairs = load_article_pairs()

    results = []
    for (article1_name, article2_name) in article_pairs[:100]:
        src_id = int(title_to_linenum[article1_name])
        dst_id = int(title_to_linenum[article2_name])

        shortest_path_length = get_graph_shortest_path(G, src_id, dst_id)
        (ont_dist, lca_height) = get_ontology_distance(article1_name, article2_name)

        x = (article1_name, article2_name, shortest_path_length, ont_dist, lca_height)
        results.append(x)

    load_data.save_object(results, "bin/results/goldset.pk1")

#generate_gold()

# print "Num LDA topics = %d" % num_lda_topics
# print (f1, score_test_or_dev, score_train)

# plt.figure(1)
# plt.xlabel('Actual height of LCA')
# plt.ylabel('Predicted height of LCA')
# plt.title('Predicted vs actual height of LCA in ontology tree (for dev set).')
# plt.plot(y_actual_dev, y_predicted_dev, 'bo')
# x1 = np.arange(min(y_actual_dev), max(y_actual_dev), 0.1)
# y1 = [v for v in x1]
# plt.plot(x1, y1, 'r-')
# plt.show()
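# Sketch of the get_graph_shortest_path call used in generate_gold() above. The
# real helper is defined elsewhere in this repo; one plausible version simply
# wraps snap.GetShortPath on the directed 30k graph, so this is only an illustration.
def get_graph_shortest_path_sketch(G, src_id, dst_id):
    # IsDir=True because article links are directed edges
    return snap.GetShortPath(G, src_id, dst_id, True)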
def save_pairwise_distances(actual_shortest_path):
    load_data.save_object(actual_shortest_path, "bin/pairwise_distances.pk1")