def SPFracAUC(network, nonexist_edges, iterations, steps):
    '''
    Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to
    sample (steps) and runs the Shortest Path scoring function over each sampled network for the specified number
    of iterations.

    Each iteration calls ``GraphMaker(network, steps)`` and uses element 0 of its result as the training graph and
    element 1 as the held-out testing edges. Both the testing edges and ``nonexist_edges`` are scored against the
    training graph with ``LinkPrediction.ShortestPath``; the two score sets feed ``EvaluationMetrics.AUC`` and,
    merged together, ``EvaluationMetrics.KPrecision``.

    :param network: undirected graph
    :param nonexist_edges: list of non-existent edges from the graph
    :param iterations: integer representing the number of iterations to run
    :param steps: list of percent of edges to sample (forwarded unchanged to GraphMaker)
    :return: tuple ``(auc, prec)`` of two lists holding one AUC value and one precision value per iteration;
             both lists are empty when ``iterations`` is not positive
    '''
    auc = []
    prec = []

    # range() instead of xrange() so the loop also runs on Python 3; the counter itself is not needed.
    for _ in range(iterations):
        iteration_data = GraphMaker(network, steps)
        training_graph = iteration_data[0]
        testing_edges = iteration_data[1]

        missing_scores = LinkPrediction.ShortestPath(training_graph, testing_edges)
        nonexist_scores = LinkPrediction.ShortestPath(training_graph, nonexist_edges)

        # get AUC
        auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores)
        auc.append(auc_round)

        # Merge both score sets without dict(a, **b): keyword expansion requires string keys on Python 3,
        # and these scores are presumably keyed by edges (tuples). Entries from nonexist_scores still win
        # on a key collision, exactly as before.
        combined_scores = dict(missing_scores)
        combined_scores.update(nonexist_scores)

        # precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5
        prec_round = EvaluationMetrics.KPrecision(auc_round, combined_scores, testing_edges)
        prec.append(prec_round)

    return auc, prec
def main():
    '''
    Map MESH terms of the medical case reports to ICD10 codes, then run item-based
    collaborative filtering on every connected component of the resulting weighted
    bipartite graph.

    :return: None (the per-component results are collected locally and not returned)
    '''
    ##### MESH vocabulary / ID mapping
    mv = MESH('MESH Terms.txt', '')
    #mv.create_MESH_vocab_and_IDmapping(); mv.save_MESH_IDmapping('MESH ID Mapping.txt')
    mesh_id_mapping = mv.read_MESH_IDmapping('MESH ID Mapping.txt')

    ##### Corpus class (disabled)
    #folder = "corpus/"
    #num = 100
    #corp = Corpus(folder,n=num)

    ##### Vectorizing (disabled)
    #tf_vectorizer, tf_matrix = corp.vectorize_corpus(corp.clean(),voc)
    #tv = q.transform_query(tf_vectorizer).flatten()

    ##### Input data
    tf = TextFile('Medical Case Reports.txt', '', encode='latin-1')
    #tf = TextFile('pubmed_result.txt','',encode='latin-1')
    mesh_terms = tf.get_MESH_terms()
    ui = tf.get_UI(mesh_id_mapping)
    keywords = tf.get_keywords()
    titles = tf.get_titles()

    ##### ICD10 assignment
    asg = Assigner('MESH_ICD10_Mapping.csv', mesh_id_mapping)
    mesh_codes = asg.assign_MESHterms_ICD10(ui)
    #print("Printing direct mapped codes", mesh_codes)
    #keywords_codes = asg.assign_keywords_ICD10(keywords)
    #print("Printing codes using keywords", keywords_codes)
    #titles_codes = asg.assign_titles_ICD10(titles)
    #print("Printing codes using titles", titles_codes)
    #partial_codes = asg.assign_context_aware_codes(stopword_percent_include=0.92)
    #print("Printing context-aware codes", partial_codes)
    #total_codes = asg.assign_all_ICD10(ui,keywords,titles,stopword_percent_include=0.92)
    #print("Printing all codes", total_codes)
    #asg.write_codes_to_csv(tot,'all_codes.csv') # Case Reports_ICD10_Mapping.csv

    ##### Link prediction
    full_graph = lp.create_weighted_bipartite_graph(nx.Graph(), mesh_codes)

    ratings = []
    for component in nx.connected_components(full_graph):
        component_graph = nx.Graph()
        print(component)
        for node in component:
            # Only all-digit node labels are used as keys into the code mapping.
            if not node.isdigit():
                continue
            for entry in mesh_codes[node]:
                code = entry[0]
                component_graph.add_node(code, bipartite='code')
                component_graph.add_edge(node, code, weight=entry[1])
        ratings.append(lp.item_based_CF(component_graph))

    return
def SFracAUC(network, nonexist_edges, iterations, steps):
    '''
    Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to
    sample (steps) and runs the SimRank scoring function over each sampled network for the specified number of
    iterations.

    Each iteration calls ``GraphMaker(network, steps)`` and uses element 0 of its result as the training graph and
    element 1 as the held-out testing edges. SimRank scores are computed once per iteration on the training graph
    (``c=0.8``, ``num_iterations=10``). AUC is then estimated from 1000 random comparisons between one testing edge
    and one non-existent edge: a higher score for the testing edge counts 1, a tie counts 0.5. An edge that has no
    entry in the score mapping is treated as scoring 0.0.

    :param network: undirected graph
    :param nonexist_edges: list of non-existent edges from the graph
    :param iterations: integer representing the number of iterations to run
    :param steps: list of percent of edges to sample (forwarded unchanged to GraphMaker)
    :return: tuple ``(auc, prec)`` of two lists holding one AUC estimate and one precision value per iteration;
             both lists are empty when ``iterations`` is not positive
    '''
    n_comparisons = 1000  # number of random (testing edge, non-existent edge) comparisons per iteration

    auc = []
    prec = []

    # range() instead of xrange() so the loops also run on Python 3.
    for _round in range(iterations):
        iteration_data = GraphMaker(network, steps)
        training_graph = iteration_data[0]
        testing_edges = iteration_data[1]

        scores = LinkPrediction.SimRank(training_graph, c=0.8, num_iterations=10)

        # get AUC
        count = 0.0
        for _draw in range(n_comparisons):
            # Sampling order (non-existent edge first) is kept so seeded runs draw the same edges.
            TN = random.sample(nonexist_edges, 1)[0]
            TP = random.sample(testing_edges, 1)[0]

            # One hashed lookup with a default instead of `key in scores.keys()`, which on Python 2
            # materialises the whole key list and scans it linearly on every comparison.
            TN_val = scores.get((TN[0], TN[1]), 0.0)
            TP_val = scores.get((TP[0], TP[1]), 0.0)

            if TP_val > TN_val:
                count += 1.0
            elif TP_val == TN_val:
                count += 0.5

        auc_round = count / n_comparisons
        auc.append(auc_round)

        # precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5
        prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges)
        prec.append(prec_round)

    return auc, prec
''' #####*********Statistic attributes of graphs************* print "*Statistic attributes of graphs:" print "N", nx.number_of_nodes(G) print "M", nx.number_of_edges(G) print "C", nx.average_clustering(G) print "Cw", nx.average_clustering(G, weight='weight') print "<d>", nx.average_shortest_path_length(G) print "r", nx.degree_assortativity_coefficient(G) #print nx.density(G) #print nx.transitivity(G) degree_list = list(G.degree_iter()) #print degree_list avg_degree_1 = 0.0 avg_degree_2 = 0.0 for node in degree_list: avg_degree_1 = avg_degree_1 + node[1] avg_degree_2 = avg_degree_2 + node[1]*node[1] avg_degree = avg_degree_1/len(degree_list) avg_degree_square = (avg_degree_2/len(degree_list)) / (avg_degree*avg_degree) print "<k>", avg_degree print "H", avg_degree_square ''' #print "============Drift_Prediction_Experiment===============================" #G = LinkPrediction.Position_Drift.Graph_Spatial_Temporal_Dynamics(G, 3) #Spatial_Temporal influence based node position drift. #Prediction_Experiment(G, Predictor, Probe_Set, Top_L, 3.0) #Top_K, Deleted_Ratio LinkPrediction.Drift_Prediction_Experiment(G, 'WCN', Probe_Set, Top_L, G.number_of_edges())
def main():
    '''
    Score the non-existent edges of the Trametinib query networks and write per-method
    link-prediction results for the DDI reactome NETS network to CSV.

    Steps:
      1. Read the OWL, NETS and PART query networks as undirected graphs.
      2. Score the NETS network with Katz and the OWL network with RPR, keeping only what
         ``EdgeChecker`` returns for the respective non-existent edges.
      3. Compute a few exploratory summaries of those predictions (values are not stored or printed).
      4. Score the DDI reactome NETS network with ten link-prediction methods and write one CSV
         per method to ``Results/DDI_reactome/NETS_DDI <index>.csv``, one ``key, value`` row per entry.

    :return: None
    '''
    # read in graphs
    owl_graph = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml').to_undirected()
    nets_graph = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected()
    # NOTE(review): unlike the other inputs this path has no '.gml' suffix, and mid_graph is
    # never used below -- confirm the file name and whether the read is still needed.
    mid_graph = nx.read_gml('Network_Data/Trametinib_query_PART_network').to_undirected()

    # run link predictions for each graph
    nets_scores = LinkPrediction.katz(nets_graph, beta=0.001, max_power=5, weight=None, dtype=None)
    nets_nonexist = list(nx.non_edges(nets_graph))
    nets_preds = EdgeChecker(nets_scores, nets_nonexist)

    owl_nonexist = list(nx.non_edges(owl_graph))
    owl_scores = LinkPrediction.RPR(owl_graph, alpha=0.15, beta=0)
    owl_preds = EdgeChecker(owl_scores, owl_nonexist)

    # explore predictions -- leftovers from interactive use; the results are discarded
    len(nets_preds)  # 1652
    np.min(nets_preds.values())
    np.mean(nets_preds.values())
    np.max(nets_preds.values())

    # get distribution of counts
    sorted(Counter(sorted(nets_preds.values())).items(), key=lambda i: i[0])

    # Two distinct names instead of assigning sorted_scores twice, so the OWL ranking is not
    # silently overwritten by the NETS ranking.
    owl_sorted_scores = sorted(owl_preds.items(), key=operator.itemgetter(1), reverse=True)  # biggest first
    nets_sorted_scores = sorted(nets_preds.items(), key=operator.itemgetter(1), reverse=False)  # smallest first

    # investigate the top n items
    edges = nets_sorted_scores[0:20]

    ## Write results for use with ranking methods
    # graphs (only one is active at a time; the OWL graph used to be re-read here and then
    # immediately overwritten, which only cost an extra file parse)
    # graph = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml').to_undirected()
    # graph = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected()
    graph = nx.read_gml('Network_Data/DDI_reactome_query_NETS_network.gml').to_undirected()

    # Enumerate the candidate (non-existent) edges once: the enumeration is invariant for the
    # rest of the function and was previously rebuilt for every scorer and every output file.
    # NOTE(review): the same list object is now shared by all callees -- assumes the scorers and
    # EdgeChecker only read it.
    candidate_edges = list(nx.non_edges(graph))

    methods = [
        LinkPrediction.DegreeProduct(graph, candidate_edges),
        LinkPrediction.ShortestPath(graph, candidate_edges),
        LinkPrediction.CommonNeighbors(graph, candidate_edges),
        LinkPrediction.AdamicAdvar(graph, candidate_edges),
        LinkPrediction.Jaccard(graph, candidate_edges),
        LinkPrediction.LHN(graph, candidate_edges),
        LinkPrediction.ResourceAllocation(graph, candidate_edges),
        LinkPrediction.Sorensen(graph, candidate_edges),
        LinkPrediction.katz(graph, beta=0.001, max_power=5, weight=None, dtype=None),
        LinkPrediction.RPR(graph, alpha=0.15, beta=0),
    ]

    # The position in `methods` labels the csv file (0-based, same numbering as before).
    for count, method in enumerate(methods):
        updated_res = EdgeChecker(method, candidate_edges)

        # 'wb' is the csv-writer mode for Python 2, which this script targets.
        with open('Results/DDI_reactome/NETS_DDI ' + str(count) + '.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            for key, values in updated_res.items():
                writer.writerow([key, values])
allones.append(res) pred_function = graph.train(lambda features: np.random.rand(features.shape[0])>.5) res = graph.test_and_evaluate(pred_function, Xa[test_set, 15:17], gold) randompred.append(res) pred_function = graph.train(perceptron, Xa[train_set, 15:17], ya[train_set]) res = graph.test_and_evaluate(pred_function, Xa[test_set, 15:17], gold, pp) ppton.append(res) # asym.append([.8, .9, .5, .3, 2, res[-1]]) # chiang.append([.8, .9, .5, .3, 2, res[-1]]) # continue esigns = {(u, v): graph.E.get((u,v)) if (u,v) in graph.E else graph.E.get((v,u)) for u, adj in graph.Gfull.items() for v in adj} mapping={i: i for i in range(graph.order)} sstart = lp.clock() sadj, test_edges = sp.get_training_matrix(666, mapping, slcc=set(range(graph.order)), tree_edges=graph.Esign.keys(), G=graph.Gfull, EDGE_SIGN=esigns) ngold, pred = sp.predict_edges(sadj, 15, mapping, test_edges, graph.Gfull, esigns, bk=9000) time_elapsed = lp.clock() - sstart C = lp.confusion_matrix(ngold, pred) fp, tn = C[0, 1], C[0, 0] acc, fpr, f1, mcc = [lp.accuracy_score(ngold, pred), fp/(fp+tn), lp.f1_score(ngold, pred, average='weighted', pos_label=None), lp.matthews_corrcoef(ngold, pred)] frac = 1 - len(test_edges)/len(graph.E) asym.append([acc, f1, mcc, fpr, time_elapsed, frac]) ngold, pred, time_elapsed, frac = getWH.run_chiang(graph)