def create_target_similarity_network_normalised(target_seq, name):
    """Pairwise local alignment (SSW) between all target sequences, with the score
    normalised by the geometric mean of the two sequence lengths."""
    import math

    target_names = list(target_seq.keys())
    seq_info = []
    for i in tqdm(range(len(target_names))):
        seq1 = target_seq[target_names[i]]
        for j in range(i + 1, len(target_names)):
            seq2 = target_seq[target_names[j]]
            try:
                alignment, score, start_end_positions = skbio.alignment.local_pairwise_align_ssw(
                    Protein(seq1), Protein(seq2), substitution_matrix=blosum50)
            except Exception:
                # SSW can fail on sequences it cannot align; treat the pair as dissimilar.
                score = 0
            # Normalise the raw alignment score so that long sequences do not dominate.
            new_score = float(score) / (math.sqrt(len(seq1)) * math.sqrt(len(seq2)))
            seq_info.append([target_names[i], target_names[j], new_score])
    name = 'data/' + name + 'target_similarity.pkl'
    utils.save_any_obj_pkl(seq_info, name)
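
# Hypothetical usage sketch: the target ids and toy sequences below are invented for
# illustration only; the real pipeline loads a {target_id: sequence} dict from a pickle
# before calling the function above.
def _demo_target_similarity():
    toy_targets = {
        'T1': 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKR',
        'T2': 'MKQLEDKVEELLSKNYHLENEVARLKKLVGER',
    }
    create_target_similarity_network_normalised(toy_targets, 'demo_')
    # -> writes data/demo_target_similarity.pkl containing one record:
    #    ['T1', 'T2', <alignment score / (sqrt(len(T1)) * sqrt(len(T2)))>]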
def run_two_stage(args):
    """Two-stage DTI prediction: build similarity networks, learn embeddings,
    train an SVM and score all unseen compound-target pairs."""
    DTI_network = utils.load_any_obj_pkl(args.dti_path)
    drug_similarity = utils.load_any_obj_pkl(args.drug_sim_path)
    target_similarity = utils.load_any_obj_pkl(args.target_sim_path)

    # Keep only the most significant similarity edges (top `sparsity` ratio).
    csn_network = network_construction.construct_signifcant_edge_network(
        drug_similarity, top_ratio=float(args.sparsity))
    tsn_network = network_construction.construct_signifcant_edge_network(
        target_similarity, top_ratio=float(args.sparsity))

    # Implicit networks link nodes that share interaction partners in the DTI network.
    implicit_compounds = network_construction.create_implicit_networks(
        DTI_network, list(csn_network.nodes()))
    implicit_targets = network_construction.create_implicit_networks(
        DTI_network, list(tsn_network.nodes()))

    learner = seperate_learner.two_stage_learning(
        DTI_network=DTI_network,
        compound_list=list(csn_network.nodes()),
        target_list=list(tsn_network.nodes()),
        tsn_network=tsn_network,
        csn_network=csn_network,
        implicit_t_network=implicit_targets,
        implicit_c_network=implicit_compounds,
        wl=int(args.walk_length),
        nn=int(args.negative_number),
        wn=int(args.walk_num),
        worker=int(args.worker),
        load_emb=False)
    learner.learn_all_network_embedding()
    learner.build_node_representation()

    training_samples, training_labels = learner.construct_training_samples(
        negative_ratio=10)
    test_pairs = new_pairs_to_evaludate(list(csn_network.nodes()),
                                        list(tsn_network.nodes()), DTI_network)
    test_samples = learner.concatenate_pair_embeddings(test_pairs)

    training_samples = normalise_sample_representation.standardscaler_transform(
        training_samples)
    test_samples = normalise_sample_representation.standardscaler_transform(
        test_samples)

    clf = learner.train_DTI_prediction_svm(training_samples, training_labels,
                                           kernal=2)
    probs = clf.predict_proba(test_samples)
    new_probs = [row[1] for row in probs]  # probability of the positive class

    all_evaluation = []
    for i in range(len(test_pairs)):
        all_evaluation.append([test_pairs[i][0], test_pairs[i][1], new_probs[i]])
    output_name = 'output/' + args.output_name + '.pkl'
    utils.save_any_obj_pkl(all_evaluation, output_name)
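
# A minimal command-line entry point, assuming run_two_stage() is driven by argparse.
# The attribute names match the fields read from `args` above; the flag spellings and
# default values here are illustrative assumptions, not the project's actual CLI.
def _parse_args_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='Two-stage DTI prediction')
    parser.add_argument('--dti_path', required=True)
    parser.add_argument('--drug_sim_path', required=True)
    parser.add_argument('--target_sim_path', required=True)
    parser.add_argument('--sparsity', default='0.04')
    parser.add_argument('--walk_length', default='80')
    parser.add_argument('--walk_num', default='20')
    parser.add_argument('--negative_number', default='5')
    parser.add_argument('--worker', default='4')
    parser.add_argument('--output_name', default='two_stage_predictions')
    return parser.parse_args()

# if __name__ == '__main__':
#     run_two_stage(_parse_args_sketch())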
def construct_original_graph():
    files = os.listdir("handle_data/data/collaboration_network")
    graphs = list()
    for file in files:
        path = os.path.join("handle_data/data/collaboration_network", file)
        g = nx.Graph()
        l = []
        node_attr = dict()
        # Each line is a JSON record: two author ids with their attribute vectors and an edge weight.
        with open(path, "r") as lines:
            for line in lines:
                line_json = json.loads(line)
                keys = list(line_json.keys())
                s1, s2, w = keys[0], keys[1], line_json[keys[2]]
                a1, a2 = line_json[keys[0]], line_json[keys[1]]
                l.append((s1, s2, w))
                if s1 not in node_attr:
                    node_attr[s1] = a1
                if s2 not in node_attr:
                    node_attr[s2] = a2
        g.add_weighted_edges_from(l)

        # Column-wise max normalisation of the node attribute vectors.
        attr_list = []
        for node, attr in node_attr.items():
            attr_list.append(attr)
        attr_array = np.array(attr_list)
        attr_normed = attr_array / attr_array.max(axis=0)
        index = 0
        for node, attr in node_attr.items():
            node_attr[node] = list(attr_normed[index])
            index += 1
        for node, attr in node_attr.items():
            g.nodes[node]["attribute"] = attr
        print(g.nodes(data=True))
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))

        # Keep only the largest connected component
        # (nx.connected_component_subgraphs was removed in NetworkX 2.4).
        g = g.subgraph(max(nx.connected_components(g), key=len)).copy()
        print(file[:-4] + ": #nodes: " + str(g.number_of_nodes()) +
              ", #edges: " + str(g.number_of_edges()))
        save_any_obj_pkl(
            g, "graph_data/collaborate_network_" + file[:-4] + ".pkl")
        filename = "graph_data/collaborate_network_" + file[:-4] + "_edgelist.txt"
        nx.write_edgelist(g, filename, data=False)
        nx.draw(g, node_size=20)
        plt.show()
        graphs.append(g)
    save_any_obj_pkl(graphs, "graph_data/collaborate_network_2006_2016.pkl")
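
# Assumed input format, inferred from the parsing loop above: each line is a JSON object
# whose first two keys are author ids mapped to attribute vectors and whose third key is
# the edge weight. The key names and values below are illustrative only.
def _parse_collaboration_line_sketch():
    line = '{"author_a": [3, 12, 0.5], "author_b": [1, 7, 0.9], "weight": 2}'
    record = json.loads(line)
    keys = list(record.keys())
    s1, s2, w = keys[0], keys[1], record[keys[2]]   # the two endpoints and the weight
    a1, a2 = record[keys[0]], record[keys[1]]       # their attribute vectors
    return (s1, s2, w), {s1: a1, s2: a2}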
def construct_combined_graph():
    """Merge each yearly snapshot with the edges of the two preceding years (a 3-year window)."""
    graphs = load_any_obj_pkl(
        "graph_data/collaborate_network(1G)/collaborate_network_2006_2016.pkl")
    for i in range(2, len(graphs)):
        g0 = graphs[i - 2]
        g1 = graphs[i - 1]
        g = graphs[i]
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        l = []
        # Carry over edges (and the attributes of their endpoints) from the two previous snapshots.
        for edge in g0.edges():
            if edge not in g.edges():
                n1, n2 = edge[0], edge[1]
                l.append((n1, n2, g0.get_edge_data(n1, n2)['weight']))
                if n1 not in g.nodes():
                    g.add_node(n1, attribute=g0.nodes[n1]["attribute"])
                if n2 not in g.nodes():
                    g.add_node(n2, attribute=g0.nodes[n2]["attribute"])
        for edge in g1.edges():
            if edge not in g.edges():
                n1, n2 = edge[0], edge[1]
                l.append((n1, n2, g1.get_edge_data(n1, n2)['weight']))
                if n1 not in g.nodes():
                    g.add_node(n1, attribute=g1.nodes[n1]["attribute"])
                if n2 not in g.nodes():
                    g.add_node(n2, attribute=g1.nodes[n2]["attribute"])
        g.add_weighted_edges_from(l)
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))

        # Keep only the largest connected component.
        g = g.subgraph(max(nx.connected_components(g), key=len)).copy()
        print("#nodes: " + str(g.number_of_nodes()) + ", #edges: " +
              str(g.number_of_edges()))
        filename = "graph_data/collaborate_network_" + str(i) + "_edgelist_new.txt"
        nx.write_edgelist(g, filename, data=False)
        save_any_obj_pkl(
            g, "graph_data/collaborate_network(3G)" + str(i + 2006) + "_new.pkl")
        graphs.append(g)
    save_any_obj_pkl(graphs, "graph_data/collaborate_network_2008_2016_new.pkl")
def create_compound_similarity_network(compounds_smiles, top_ratio=0.04):
    """Pairwise fingerprint (Tanimoto) similarity between all compounds."""
    compounds_name = list(compounds_smiles.keys())
    similarity_info = []
    for i in range(len(compounds_name)):
        m1 = Chem.MolFromSmiles(compounds_smiles[compounds_name[i]])
        fps1 = Chem.RDKFingerprint(m1)
        for j in range(i + 1, len(compounds_name)):
            # Compare against the j-th compound (the original code reused index i here).
            m2 = Chem.MolFromSmiles(compounds_smiles[compounds_name[j]])
            fps2 = Chem.RDKFingerprint(m2)
            similarity_coefficient = DataStructs.FingerprintSimilarity(fps1, fps2)
            similarity_info.append(
                [compounds_name[i], compounds_name[j], similarity_coefficient])
    utils.save_any_obj_pkl(similarity_info, 'data/all_interactions.pkl')
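
# Hypothetical usage sketch: two well-known SMILES strings (aspirin and caffeine) are
# hard-coded purely to illustrate the RDKit calls used above; the real input is a
# {compound_id: SMILES} dict loaded from a pickle.
def _demo_compound_similarity():
    m1 = Chem.MolFromSmiles('CC(=O)OC1=CC=CC=C1C(=O)O')      # aspirin
    m2 = Chem.MolFromSmiles('CN1C=NC2=C1C(=O)N(C)C(=O)N2C')  # caffeine
    fps1 = Chem.RDKFingerprint(m1)
    fps2 = Chem.RDKFingerprint(m2)
    # FingerprintSimilarity defaults to the Tanimoto coefficient, a value in [0, 1].
    return DataStructs.FingerprintSimilarity(fps1, fps2)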
def create_compound_similarity_network_mp(compounds_smiles_path,
                                          species_name='_DB',
                                          worker=4,
                                          top_ratio=0.04):
    """Multi-process variant: compute all pairwise compound similarities in parallel."""
    compounds_smiles = utils.load_any_obj_pkl(compounds_smiles_path)
    # Compounds whose SMILES fail RDKit parsing can be spotted beforehand by checking
    # whether Chem.MolFromSmiles(...) returns None.
    ccd = calculate_molecular_similarity(compounds_smiles, worker=worker)
    all_corr = ccd.parallel_calculate_all_correlation()
    # Each worker returns its own list of [compound_a, compound_b, similarity]; flatten them.
    final_corr = []
    for item in all_corr:
        for a_corr in item:
            final_corr.append(a_corr)
    save_name = 'data/' + 'compound_similarity' + species_name + '.pkl'
    utils.save_any_obj_pkl(final_corr, save_name)
G_dynamic = load_any_obj_pkl(
    "graph_data/collaborate_network(2G)/collaborate_network_2007_2016.pkl")
model = DANRL(G_dynamic=G_dynamic,
              limit=ratio_most_affected_nodes,
              local_global=1,
              num_walks=num_walks,
              walk_length=walk_length,
              window=window_size,
              emb_dim=embedding_dimensional,
              n_negative=num_negative)
emb_dicts = model.train()
save_any_obj_pkl(
    obj=emb_dicts,
    path="output/collaborate_network(2G)/DANRL/collaborate_network_2007_2016_embs.pkl")

# import os
# import numpy as np
# col_net = "collaborate_network(2G)"
# G_dynamic = load_any_obj_pkl("graph_data/" + col_net + "/collaborate_network_2007_2016.pkl")
# for dim in [16, 64, 128, 256]:
#     for win in range(16, 42, 2):
#         model = DANRL(G_dynamic=G_dynamic,
#                       limit=0.2,
#                       local_global=1,
#                       num_walks=20,
#                       walk_length=80,
#                       window=win,
else:
    # Sweep the number of clusters and record four internal validity metrics per k.
    for i in range(2, 51, 1):
        # clu = KMeans(n_clusters=5, init="k-means++", n_init=10, max_iter=300, random_state=0)
        # clu = AgglomerativeClustering(n_clusters=i, affinity="euclidean", linkage='average')
        clu = SpectralClustering(n_clusters=i, gamma=0.01)
        clu.fit(X)
        labels = clu.labels_
        du[str(i)] = Dunn_Validity_Index(labels=labels, data=X, n_clusters=i)
        sil[str(i)] = silhouette_score(X, labels)
        db[str(i)] = davies_bouldin_score(X, labels)
        # Spelled calinski_harabasz_score in scikit-learn >= 0.20.
        ch[str(i)] = calinski_harabaz_score(X, labels)
    save_any_obj_pkl(
        sil, "metric_result\\" + clustering_method + "_" + embedding_method + "_sil.pkl")
    save_any_obj_pkl(
        db, "metric_result\\" + clustering_method + "_" + embedding_method + "_db.pkl")
    save_any_obj_pkl(
        ch, "metric_result\\" + clustering_method + "_" + embedding_method + "_ch.pkl")
    save_any_obj_pkl(
        du, "metric_result\\" + clustering_method + "_" + embedding_method + "_du.pkl")
    save_dict(filepath="metric_result\\clustering_method\\" + clustering_method +
              "_" + "silhouette_score.txt",
              mode="a",
              dic={embedding_method: sil})
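
# Dunn_Validity_Index is a project-specific helper; the sketch below shows one common
# formulation of the Dunn index (smallest inter-cluster distance divided by the largest
# intra-cluster diameter). It is an illustrative assumption, not necessarily the
# implementation used in this repository.
def _dunn_index_sketch(labels, data, n_clusters):
    import numpy as np
    from scipy.spatial.distance import cdist
    labels = np.asarray(labels)
    data = np.asarray(data)
    clusters = [data[labels == k] for k in range(n_clusters)]
    # Largest within-cluster diameter.
    max_diameter = max(cdist(c, c).max() for c in clusters if len(c) > 0)
    # Smallest distance between points belonging to different clusters.
    min_separation = min(cdist(clusters[a], clusters[b]).min()
                         for a in range(n_clusters)
                         for b in range(a + 1, n_clusters)
                         if len(clusters[a]) > 0 and len(clusters[b]) > 0)
    return min_separation / max_diameter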
def train(self):
    # gensim < 4.0 API: `size` and `iter` were renamed `vector_size` and `epochs` in gensim 4.
    w2v = gensim.models.Word2Vec(sentences=None,
                                 size=self.emb_dim,
                                 window=self.window,
                                 sg=1,
                                 hs=0,
                                 negative=self.n_negative,
                                 ns_exponent=0.75,
                                 alpha=0.025,
                                 min_alpha=0.0001,
                                 min_count=1,
                                 sample=0.001,
                                 iter=4,
                                 workers=self.workers,
                                 seed=self.seed,
                                 corpus_file=None,
                                 sorted_vocab=1,
                                 batch_words=10000,
                                 compute_loss=False,
                                 max_vocab_size=None,
                                 max_final_vocab=None,
                                 trim_rule=None)
    for t in range(len(self.G_dynamic)):
        if t == 0:
            G0 = self.G_dynamic[t]
            sentences = simulate_walks(nx_graph=G0,
                                       num_walks=self.num_walks,
                                       weighted=True,
                                       walk_length=self.walk_length)
            sentences = [[str(j) for j in i] for i in sentences]
            print("-start node embedding on Graph 0" + "/" + str(len(self.G_dynamic)))
            w2v.build_vocab(sentences=sentences, update=False)  # initial training, so update=False
            # Train the Word2Vec model on the walks.
            w2v.train(sentences=sentences,
                      total_examples=w2v.corpus_count,
                      epochs=w2v.iter)  # follow w2v constructor
            print("-end node embedding on Graph 0" + "/" + str(len(self.G_dynamic)))
            emb_dict = {}  # {nodeID: emb_vector, ...}
            for node in self.G_dynamic[t].nodes():
                emb_dict[node] = w2v.wv[str(node)]
            save_any_obj_pkl(
                obj=emb_dict,
                path="output/collaborate_network(2G)/DynAttriWalks/collaborate_network_"
                + str(t) + "_embs.pkl")
            self.emb_dicts.append(emb_dict)
        else:
            G0 = self.G_dynamic[t - 1]  # previous graph
            G1 = self.G_dynamic[t]  # current graph
            print("-start selecting nodes on Graph " + str(t) + "/" +
                  str(len(self.G_dynamic)))
            node_update_list, self.reservoir, node_del, node_add = node_selecting_scheme(
                graph_t0=G0,
                graph_t1=G1,
                reservoir_dict=self.reservoir,
                limit=self.limit,
                local_global=self.local_global)
            print("-end selecting nodes on Graph " + str(t) + "/" +
                  str(len(self.G_dynamic)))
            sentences = simulate_walks(nx_graph=G1,
                                       num_walks=self.num_walks,
                                       weighted=True,
                                       walk_length=self.walk_length,
                                       selected_nodes=node_update_list)
            sentences = [[str(j) for j in i] for i in sentences]
            print("-start node embedding on Graph " + str(t) + "/" +
                  str(len(self.G_dynamic)))
            w2v.build_vocab(sentences=sentences, update=True)  # online vocabulary update
            # Continue training the Word2Vec model on the new walks.
            w2v.train(sentences=sentences,
                      total_examples=w2v.corpus_count,
                      epochs=w2v.iter)
            print("-end node embedding on Graph " + str(t) + "/" +
                  str(len(self.G_dynamic)))
            emb_dict = {}  # {nodeID: emb_vector, ...}
            for node in self.G_dynamic[t].nodes():
                emb_dict[node] = w2v.wv[str(node)]
            save_any_obj_pkl(
                obj=emb_dict,
                path="output/collaborate_network(2G)/DynAttriWalks/collaborate_network_"
                + str(t) + "_embs.pkl")
            self.emb_dicts.append(emb_dict)
    return self.emb_dicts
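
# A small module-level sketch of how the returned embeddings might be consumed downstream,
# e.g. to build the feature matrix X used for clustering. The snapshot index 0 and the
# helper name are illustrative assumptions.
def _embeddings_to_matrix_sketch(emb_dicts, t=0):
    import numpy as np
    nodes = list(emb_dicts[t].keys())
    X = np.array([emb_dicts[t][node] for node in nodes])  # shape: (num_nodes, emb_dim)
    return nodes, X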