def run_two_stage(args):
    DTI_network = utils.load_any_obj_pkl(args.dti_path)
    drug_similarity = utils.load_any_obj_pkl(args.drug_sim_path)
    target_similarity = utils.load_any_obj_pkl(args.target_sim_path)

    csn_network = network_construction.construct_signifcant_edge_network(
        drug_similarity, top_ratio=float(args.sparsity))
    tsn_network = network_construction.construct_signifcant_edge_network(
        target_similarity, top_ratio=float(args.sparsity))
    implicit_compounds = network_construction.create_implicit_networks(
        DTI_network, list(csn_network.nodes()))
    implicit_targets = network_construction.create_implicit_networks(
        DTI_network, list(tsn_network.nodes()))

    learner = seperate_learner.two_stage_learning(
        DTI_network=DTI_network,
        compound_list=list(csn_network.nodes()),
        target_list=list(tsn_network.nodes()),
        tsn_network=tsn_network,
        csn_network=csn_network,
        implicit_t_network=implicit_targets,
        implicit_c_network=implicit_compounds,
        wl=int(args.walk_length),
        nn=int(args.negative_number),
        wn=int(args.walk_num),
        worker=int(args.worker),
        load_emb=False)
    learner.learn_all_network_embedding()
    learner.build_node_representation()

    training_samples, training_labels = learner.construct_training_samples(
        negative_ratio=10)
    test_pairs = new_pairs_to_evaludate(list(csn_network.nodes()),
                                        list(tsn_network.nodes()), DTI_network)
    test_samples = learner.concatenate_pair_embeddings(test_pairs)

    training_samples = normalise_sample_representation.standardscaler_transform(
        training_samples)
    test_samples = normalise_sample_representation.standardscaler_transform(
        test_samples)

    clf = learner.train_DTI_prediction_svm(training_samples,
                                           training_labels,
                                           kernal=2)
    probs = clf.predict_proba(test_samples)
    new_probs = [row[1] for row in probs]  # probability of the positive (interaction) class

    all_evaluation = []
    for i in range(len(test_pairs)):
        current_one = [test_pairs[i][0], test_pairs[i][1], new_probs[i]]
        all_evaluation.append(current_one)

    output_name = 'output/' + args.output_name + '.pkl'
    utils.save_any_obj_pkl(all_evaluation, output_name)
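# Illustrative sketch only (not part of the original project): one way to build the
# `args` namespace that run_two_stage() expects, inferred from the attributes it reads.
# The option names follow those attributes; the default values are assumptions.
def parse_two_stage_args():
    import argparse
    parser = argparse.ArgumentParser(description='two-stage DTI prediction')
    parser.add_argument('--dti_path', required=True, help='pickled drug-target interaction network')
    parser.add_argument('--drug_sim_path', required=True, help='pickled drug-drug similarity records')
    parser.add_argument('--target_sim_path', required=True, help='pickled target-target similarity records')
    parser.add_argument('--sparsity', default='0.04', help='top ratio of edges kept in the similarity networks')
    parser.add_argument('--walk_length', default='80', help='random-walk length')
    parser.add_argument('--walk_num', default='10', help='walks per node')
    parser.add_argument('--negative_number', default='5', help='number of negative samples')
    parser.add_argument('--worker', default='4', help='number of parallel workers')
    parser.add_argument('--output_name', default='two_stage_predictions')
    return parser.parse_args()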
def evaluation():
    G_dynamic_ori = load_any_obj_pkl(
        "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
    G_dynamic = load_any_obj_pkl(
        "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")
    method = "DANRL"
    filepath = "parameter_sensitivity/collaborate_network(2G)/output"
    files = os.listdir(filepath)
    filepath0 = "parameter_sensitivity/collaborate_network(2G)/recommendation"
    files0 = os.listdir(filepath0)

    i = 0
    for file in files:
        print(i, len(files))
        i += 1
        file_1 = file[:-4] + "_G_2ori.txt"
        if file_1 not in files0:
            emb_dicts = load_any_obj_pkl(os.path.join(filepath, file))
            # print(len(emb_dicts))
            avg_score = dict()
            for top_k in range(1, 11, 1):
                score = []
                for t in range(len(emb_dicts) - 2):  # iterate over the embeddings of every time step
                    model = recommendation(
                        emb_dicts[t],
                        G0=G_dynamic[t],
                        G1=G_dynamic_ori[t + 2],
                        G2=G_dynamic_ori[t + 3],
                        # G3=G_dynamic_ori[t+4]
                    )
                    score.append(model.evaluate_precision_k(top_k))
                avg_score["top_" + str(top_k)] = np.mean(score)

            output_filepath = "parameter_sensitivity/collaborate_network(2G)/recommendation"
            if not os.path.exists(output_filepath):
                os.makedirs(output_filepath)
            output = open(
                os.path.join(output_filepath, file[:-4] + "_G_2ori.txt"), "w")
            output.write(json.dumps(avg_score) + "\n")
            output.close()
def loadData2PD(filepath):
    data = load_any_obj_pkl(filepath)[-1]  # use the last snapshot only
    X = None
    car_ids = []
    for key, value in data.items():
        car_ids.append(key)
        if X is None:
            X = np.array(value).reshape(1, -1)
        else:
            X = np.vstack((X, value.reshape(1, -1)))
    X = 1.0 * (X - X.mean()) / X.std()  # whole-matrix (not per-column) standardisation
    return pd.DataFrame(X, index=car_ids)
def load_DynWalks_Embedding(method):
    data = load_any_obj_pkl("DynWalks/output/hangzhou_20140301_MCC_" + method +
                            "_embs.pkl")[-1]  # use the last snapshot only
    X = None
    car_ids = []
    for key, value in data.items():
        car_ids.append(key)
        if X is None:
            X = np.array(value).reshape(1, -1)
        else:
            X = np.vstack((X, value.reshape(1, -1)))
    X = 1.0 * (X - X.mean()) / X.std()  # whole-matrix (not per-column) standardisation
    return pd.DataFrame(X, index=car_ids)
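# Illustrative only (not in the original code): both loaders above appear to take the
# last element of a pickled sequence of {node_id: embedding_vector} dicts, stack the
# vectors, and standardise over the whole matrix. A minimal sketch with made-up data:
def _toy_standardisation_example():
    import numpy as np
    import pandas as pd
    toy_snapshot = {"car_a": np.array([1.0, 2.0]), "car_b": np.array([3.0, 4.0])}
    X = np.vstack([v.reshape(1, -1) for v in toy_snapshot.values()])
    X = (X - X.mean()) / X.std()  # same whole-matrix standardisation as above
    return pd.DataFrame(X, index=list(toy_snapshot.keys()))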
def draw_graph():
    graphs = load_any_obj_pkl(
        "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")
    year = 2007
    G_s = []
    for g in graphs:
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        G = nx.Graph()
        G.add_nodes_from(g.nodes())
        G.add_edges_from(g.edges())
        # nx.write_gexf(G, "graph_data/collaborate_network(2G)/" + str(year) + ".gexf")
        # year += 1
        G_s.append(G)
    nx.draw(G_s[0], node_size=30, node_color="black", edge_color="gray")
    plt.show()
def construct_combined_graph():
    graphs = load_any_obj_pkl(
        "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
    for i in range(2, len(graphs)):
        g0 = graphs[i - 2]
        g1 = graphs[i - 1]
        g = graphs[i]
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        l = []
        for edge in g0.edges():
            if edge not in g.edges():
                n1, n2 = edge[0], edge[1]
                l.append((n1, n2, g0.get_edge_data(n1, n2)['weight']))
                if n1 not in g.nodes():
                    g.add_node(n1, attribute=g0.nodes[n1]["attribute"])
                if n2 not in g.nodes():
                    g.add_node(n2, attribute=g0.nodes[n2]["attribute"])
        for edge in g1.edges():
            if edge not in g.edges():
                n1, n2 = edge[0], edge[1]
                l.append((n1, n2, g1.get_edge_data(n1, n2)['weight']))
                if n1 not in g.nodes():
                    g.add_node(n1, attribute=g1.nodes[n1]["attribute"])
                if n2 not in g.nodes():
                    g.add_node(n2, attribute=g1.nodes[n2]["attribute"])
        g.add_weighted_edges_from(l)
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        # nx.draw(g, node_size=20)
        # plt.show()
        g = g.subgraph(max(nx.connected_components(g), key=len)).copy()
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        filename = "graph_data/collaborate_network_" + str(
            i) + "_edgelist_new.txt"
        nx.write_edgelist(g, filename, data=False)
        save_any_obj_pkl(
            g, "graph_data/collaborate_network(3G)" + str(i + 2006) + "_new.pkl")
        graphs.append(g)
    save_any_obj_pkl(graphs,
                     "graph_data/collaborate_network_2008_2016_new.pkl")
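# Illustrative only (simplified: node attributes and file output are omitted): a tiny,
# self-contained version of the merge performed by construct_combined_graph() above --
# fold the edges of the two previous snapshots into the current one, then keep the
# largest connected component.
def _toy_combined_graph_example():
    import networkx as nx
    g0 = nx.Graph()
    g0.add_weighted_edges_from([("a", "b", 1.0)])
    g1 = nx.Graph()
    g1.add_weighted_edges_from([("b", "c", 2.0)])
    g = nx.Graph()
    g.add_weighted_edges_from([("c", "d", 1.0), ("x", "y", 1.0)])

    for prev in (g0, g1):
        for u, v in prev.edges():
            if not g.has_edge(u, v):
                g.add_edge(u, v, weight=prev[u][v]["weight"])

    # keep only the largest connected component; the isolated pair "x"-"y" is dropped,
    # leaving the weighted edges a-b, b-c and c-d
    g = g.subgraph(max(nx.connected_components(g), key=len)).copy()
    return sorted(g.edges(data="weight"))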
def sw_similarity():
    csd = utils.load_any_obj_pkl('data/chem_seq_dict.pkl')
    chem_id = list(csd.keys())
    # for item in chem_id:
    #     if len(csd[item]) < 5:
    #         print('get it')
    print(chem_id[0])
    print(chem_id[1])

    match = 2
    mismatch = -1
    scoring = swalign.NucleotideScoringMatrix(match, mismatch)
    sw = swalign.LocalAlignment(scoring)  # gap penalties etc. can also be chosen here
    alignment = sw.align(csd[chem_id[0]], csd[chem_id[1]])
    # score = alignment.dump()
    print(alignment.score)
def create_compound_similarity_network_mp(compounds_smiles_path,
                                          species_name='_DB',
                                          worker=4,
                                          top_ratio=0.04):
    compounds_smiles = utils.load_any_obj_pkl(compounds_smiles_path)
    all_compounds = list(compounds_smiles.keys())
    # print(Chem.SanitizeMol('CN(CCO[P@](O)(=O)O[P@@](O)(=O)O[Be-](F)(F)F)C1=CC=CC=C1[N+]([O-])=O'))
    # for item in all_compounds:
    #     m2 = Chem.MolFromSmiles(compounds_smiles[item])
    #     if m2 == None:
    #         print(item)
    #         print(compounds_smiles[item])
    # raise Exception('stop')

    ccd = calculate_molecular_similarity(compounds_smiles, worker=worker)
    all_corr = ccd.parallel_calculate_all_correlation()
    # all_corr = [[str(j) for j in i] for i in all_corr]

    final_corr = []
    for item in all_corr:
        for a_corr in item:
            # print(a_corr)
            final_corr.append(a_corr)

    save_name = 'data/' + 'compound_similarity' + species_name + '.pkl'
    utils.save_any_obj_pkl(final_corr, save_name)
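# Illustrative only (not part of the original pipeline): judging from how the
# similarity records are assembled in calculate_molecular_similarity, the pickle
# written above holds a flat list of [compound_id_a, compound_id_b, similarity]
# entries. A sketch of reading it back and keeping the strongest pairs; the default
# path and cut-off below are assumptions.
def _top_similar_compound_pairs(pkl_path='data/compound_similarity_DB.pkl', top_n=10):
    records = utils.load_any_obj_pkl(pkl_path)
    return sorted(records, key=lambda r: r[2], reverse=True)[:top_n]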
            elif scholars[0] in G2.nodes() and scholars[1] in G2.nodes():
                if scholars in G2.edges():
                    tp += 1
                else:
                    fp += 1
            # elif scholars[0] in G3.nodes() and scholars[1] in G3.nodes():
            #     if scholars in G2.edges():
            #         tp += 1
            #     else:
            #         fp += 1
        # print(tp, fp)
        # print("recommend_precision_score=", "{:.9f}".format(tp/(tp+fp)))
        return tp / (tp + fp)


G_dynamic0 = load_any_obj_pkl(
    "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
G_dynamic = load_any_obj_pkl(
    "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")

print("Computing common neighbors")
cn_list = []
index2node_list = []
for g in G_dynamic:
    nodes = list(g.nodes())
    cn_matrix = np.zeros([len(nodes), len(nodes)])
    index2node = dict()
    for i in range(len(nodes)):
        index2node[i] = nodes[i]
        for j in range(i, len(nodes)):
            cn_matrix[i, j] = len(
                list(nx.common_neighbors(g, nodes[i], nodes[j])))
def load_OpenNE_Embedding(method, year):
    sid_emb = dict()
    with open(
            r"output/collaborate_network(2G)/" + method +
            "/collaborate_network_" + str(year) + "_embs.txt",
            "r") as embeddings:
        embeddings.readline()  # skip the header line
        for embedding in embeddings:
            l = embedding.split()
            sid_emb[l[0]] = [float(n) for n in l[1:]]
        embeddings.close()  # redundant inside the with block, but harmless
    return sid_emb


G_dynamic_ori = load_any_obj_pkl(
    "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
G_dynamic = load_any_obj_pkl(
    "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")

method = "DeepWalk"
if method == "DANRL":
    emb_dicts = load_any_obj_pkl("output/collaborate_network(2G)/" + method +
                                 "/collaborate_network_2007_2016_embs.pkl")
else:
    emb_dicts = []
    for year in range(2007, 2017):
        emb_dicts.append(load_OpenNE_Embedding(method, year))
print(len(emb_dicts))

avg_score = dict()
for top_k in range(1, 11, 1):
                fps1, fps2)
            similarity_info.append([
                self.compounds[compound_index], self.compounds[i],
                simialrity_coefficient
            ])
        return similarity_info


if __name__ == "__main__":
    # create_compound_similarity_network_mp('data/drugbank_drugs.pkl', species_name='_DB')  # 60
    # sw_similarity()  # cp sa mm
    # Chem.SanitizeMol
    # all_keys = list(target_seqs.keys())
    # a = target_seqs[all_keys[0]]
    # print(type(a))
    # target_IDs = list(target_seqs.keys())
    # print(target_seqs[target_IDs[0]])
    # t1 = target_IDs[0]
    # seqs1 = target_seqs[t1].split('\n')
    # for item in target_IDs:
    #     seqs1 = target_seqs[item]
    #     print(len(seqs1.split('\n')[-3]))
    # create_compound_similarity_network_mp('data/drugbank_drugs.pkl', species_name='_DB')
    target_seqs = utils.load_any_obj_pkl('data/drugbank_targets.pkl')
    create_target_similarity_network(target_seqs, 'DB_N_')
    # alignment, score, start_end_positions = skbio.alignment.local_pairwise_align_protein(Protein(p1_s), Protein(p2_s))
    # print(score)
def main(args):
    # -------- Step 1: prepare data --------
    print(f'Summary of all settings: {args}')
    print('\nStep 1: start loading data ...')
    t1 = time.time()
    G_dynamic = load_any_obj_pkl(args.graph)
    emb_dicts = load_any_obj_pkl(args.emb_file)
    t2 = time.time()
    print(f'Step 1: end loading data; time cost: {(t2-t1):.2f}s')

    # -------- Step 3: downstream tasks --------
    print('\n\nStep 3: start evaluating embeddings ...')
    t1 = time.time()
    print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))

    if args.task == 'lp' or args.task == 'all':
        from downstream import lpClassifier, gen_test_edge_wrt_changes
        # the size of the LP testing data depends on the changes between two consecutive snapshots
        test_edges = []
        test_labels = []
        for t in range(len(G_dynamic) - 1):
            # changed edges from t to t+1 serve as testing edges
            pos_edges_with_label, neg_edges_with_label = gen_test_edge_wrt_changes(
                G_dynamic[t], G_dynamic[t + 1], seed=args.seed)
            test_edges.append([e[:2] for e in pos_edges_with_label] +
                              [e[:2] for e in neg_edges_with_label])
            test_labels.append([e[2] for e in pos_edges_with_label] +
                               [e[2] for e in neg_edges_with_label])

        # ====== Changed Link Prediction task (via cosine similarity) by AUC score ======
        print('--- Start changed link prediction task --> use current emb @t to predict **future** changed links @t+1:')
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print('Changed Link Prediction task (via cosine similarity) by AUC score')
            ds_task = lpClassifier(emb_dict=emb_dicts[t])  # emb at t; does not use **future** changed edges
            ds_task.evaluate_auc(test_edges[t], test_labels[t])  # evaluate prediction of changed edges from t to t+1

        # ====== Changed Link Prediction task (Weighted-L1 edge_feat --> LR clf) by AUC score ======
        print('--- Start changed link prediction task 1 --> use current emb @t to predict **future** changed links @t+1:')
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print('Changed Link Prediction task (Weighted-L1 edge_feat --> LR clf) by AUC score')
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init1(G_dynamic[t])
            # incremental learning for the Changed LP task;
            # LogisticRegression(random_state=2021, penalty='l2', max_iter=1000)
            LR_prev = ds_task.update_LR_auc1(test_edges[t],
                                             test_labels[t],
                                             LR_prev=LR_prev)

        # ====== Changed Link Prediction task (Weighted-L2 edge_feat --> LR clf) by AUC score ======
        print('--- Start changed link prediction task 2 --> use current emb @t to predict **future** changed links @t+1:')
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print('Changed Link Prediction task (Weighted-L2 edge_feat --> LR clf) by AUC score')
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init2(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc2(test_edges[t],
                                             test_labels[t],
                                             LR_prev=LR_prev)

        # ====== Changed Link Prediction task (Hadamard edge_feat --> LR clf) by AUC score ======
        print('--- Start changed link prediction task 3 --> use current emb @t to predict **future** changed links @t+1:')
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print('Changed Link Prediction task (Hadamard edge_feat --> LR clf) by AUC score')
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init3(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc3(test_edges[t],
                                             test_labels[t],
                                             LR_prev=LR_prev)

        # ====== Changed Link Prediction task (Average edge_feat --> LR clf) by AUC score ======
        print('--- Start changed link prediction task 4 --> use current emb @t to predict **future** changed links @t+1:')
        LR_prev = None
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            print('Changed Link Prediction task (Average edge_feat --> LR clf) by AUC score')
            ds_task = lpClassifier(emb_dict=emb_dicts[t])
            if t == 0:
                LR_prev = ds_task.lr_clf_init4(G_dynamic[t])
            LR_prev = ds_task.update_LR_auc4(test_edges[t],
                                             test_labels[t],
                                             LR_prev=LR_prev)
        print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))

    if args.task == 'nr' or args.task == 'all':
        print('--- Start changed node recommendation task --> use current emb @t to recommend nodes for **future** changed nodes in graph @t+1:')
        from downstream import nrClassifier, gen_test_node_wrt_changes, align_nodes
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            # generate the testing nodes that are affected by changes and present in both graphs
            node_list = gen_test_node_wrt_changes(G_dynamic[t], G_dynamic[t + 1])
            print('# of testing nodes affected by changes and present in both graphs: ',
                  len(node_list))
            # remove newly added nodes from G_dynamic[t+1] and restore newly removed ones,
            # so that both snapshots share the same node set
            rc_next_graph_aligned = align_nodes(G_dynamic[t], G_dynamic[t + 1])
            ds_task = nrClassifier(emb_dict=emb_dicts[t],
                                   rc_graph=rc_next_graph_aligned)
            top_k_list = [5, 10, 50, 100]
            ds_task.evaluate_pk_and_apk(top_k_list, node_list)
            # If OOM, try grClassifier_batch (see downstream.py), which is slow but needs much less memory
        print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))

    if args.task == 'gr' or args.task == 'all':
        print('--- Start graph/link reconstruction task --> use current emb @t to reconstruct **current** graph @t:')
        from downstream import grClassifier
        # ignore the last snapshot so that the length is consistent with Changed LP
        for t in range(len(G_dynamic) - 1):
            print(f'Current time step @t: {t}')
            all_nodes = list(G_dynamic[t].nodes())
            if len(all_nodes) <= 10000:
                node_list = None  # test all nodes
                print('# testing for all nodes in current graph')
            else:
                node_list = list(np.random.choice(all_nodes, 10000, replace=False))
                print('# current graph is too large -> randomly sample 10000 testing nodes: ',
                      len(node_list))
            ds_task = grClassifier(emb_dict=emb_dicts[t], rc_graph=G_dynamic[t])
            top_k_list = [5, 10, 50, 100]
            ds_task.evaluate_pk_and_apk(top_k_list, node_list)
            # If OOM, try grClassifier_batch (see downstream.py), which is slow but needs much less memory

    t2 = time.time()
    print(f'STEP3: end evaluating; time cost: {(t2-t1):.2f}s')
    print(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))
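# Illustrative sketch only (not the project's actual CLI): the argparse wiring that
# main() expects, inferred from the attributes it reads (args.graph, args.emb_file,
# args.task, args.seed). Default values are assumptions.
def parse_eval_args():
    import argparse
    parser = argparse.ArgumentParser(description='evaluate dynamic network embeddings')
    parser.add_argument('--graph', required=True, help='pickled list of graph snapshots')
    parser.add_argument('--emb_file', required=True,
                        help='pickled list of {node: embedding} dicts, one per snapshot')
    parser.add_argument('--task', default='all', choices=['lp', 'nr', 'gr', 'all'])
    parser.add_argument('--seed', type=int, default=2021)
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_eval_args())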