def _optimizer_init(self, batch_size: int, max_margin: float) -> None:
    """
    Create self.opt.

    Parameters
    ----------
    batch_size : int
        Minibatch size.
    max_margin : float
        Max margin parameter in hinge loss.
    """
    print("Create optimizer")
    with tf.compat.v1.name_scope('optimizer'):
        self.opt = DecagonOptimizer(
            embeddings=self.model.embeddings,
            latent_inters=self.model.latent_inters,
            latent_varies=self.model.latent_varies,
            degrees=self.degrees,
            edge_types=self.edge_types,
            edge_type2dim=self.edge_type2dim,
            placeholders=self.placeholders,
            batch_size=batch_size,
            margin=max_margin
        )
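# ---------------------------------------------------------
# Hedged usage sketch: how _optimizer_init would typically be
# called once self.model, self.placeholders, self.degrees, etc.
# exist on the instance. The trainer class name and call site
# below are assumptions, not part of the original code.
# ---------------------------------------------------------
# trainer = DecagonTrainer(...)   # hypothetical owner of self.model, self.degrees, ...
# trainer._optimizer_init(batch_size=512, max_margin=0.1)
# # afterwards a tf.compat.v1.Session() loop runs trainer.opt.opt_op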
print("Create model") model = DecagonModel( placeholders=placeholders, num_feat=num_feat, nonzero_feat=nonzero_feat, edge_types=edge_types, decoders=edge_type2decoder, ) print("Create optimizer") with tf.name_scope('optimizer'): opt = DecagonOptimizer(embeddings=model.embeddings, latent_inters=model.latent_inters, latent_varies=model.latent_varies, degrees=degrees, edge_types=edge_types, edge_type2dim=edge_type2dim, placeholders=placeholders, batch_size=FLAGS.batch_size, margin=FLAGS.max_margin) print("Initialize session") sess = tf.Session() sess.run(tf.global_variables_initializer()) feed_dict = {} # ============================================================================================= # # TRAINING # Metric structures initialization output_data = {} out_file = 'results_training/TRAIN_'+words[2]+DSE*('_DSE_'+str(n_se_mono))+BDM*('_BDM')\ +'_genes_'+str(n_genes)+'_drugs_'+str(n_drugs)+'_se_'+str(n_se_combo)+'_epochs_'+\ str(FLAGS.epochs)+'_dropout_'+str(FLAGS.dropout)+'_valsize_'+\
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

# optimizer.py
print("Create optimizer")
with tf.name_scope('optimizer'):
    # opt is the Decagon optimizer
    opt = DecagonOptimizer(
        embeddings=model.embeddings,        # Decagon model.embeddings
        latent_inters=model.latent_inters,  # Decagon model.latent_inters
        latent_varies=model.latent_varies,  # Decagon model.latent_varies
        degrees=degrees,                    # (dict)
        edge_types=edge_types,
        edge_type2dim=edge_type2dim,
        placeholders=placeholders,          # graph placeholders (fed at run time)
        batch_size=FLAGS.batch_size,        # 512
        margin=FLAGS.max_margin)            # 0.1, hinge-loss margin

print("Initialize session")
sess = tf.Session()
sess.run(tf.global_variables_initializer())  # initialize variables
feed_dict = {}  # feed dictionary, initialized empty

###########################################################
#
# Train model
#
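# ---------------------------------------------------------
# Hedged sketch: FLAGS.batch_size, FLAGS.max_margin, FLAGS.epochs
# and FLAGS.dropout are read above but never defined in this
# snippet. A minimal definition using the TF 1.x flags API,
# consistent with the inline comments (512 and 0.1); the epochs
# and dropout defaults are assumptions, not values confirmed here.
# ---------------------------------------------------------
import tensorflow as tf  # TF 1.x

flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 512, 'Minibatch size.')
flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss.')
flags.DEFINE_integer('epochs', 50, 'Number of training epochs (assumed default).')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (assumed default).')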
def main_execution():
    combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
        fichier='polypharmacy/drugbank/drugbank-combo.csv')
    nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(nodes)
    relation_types = set([r for r in combo_to_side_effects.values()])
    n_drugdrug_rel_types = len(relation_types)
    drugs_to_positions_in_matrices_dict = {
        node: i
        for i, node in enumerate(nodes)
    }

    drug_drug_adj_list = []  # one adjacency matrix per drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if combo_to_side_effects[temp_cle] == el:
                    # the two drugs actually share this side effect:
                    # record the interaction (symmetrically)
                    mat[drugs_to_positions_in_matrices_dict[d1],
                        drugs_to_positions_in_matrices_dict[d2]] = \
                        mat[drugs_to_positions_in_matrices_dict[d2],
                            drugs_to_positions_in_matrices_dict[d1]] = 1.
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]
    adj_mats_orig = {
        (0, 0):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: drug_degrees_list + drug_degrees_list,
    }

    # features (drugs)
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation
    num_feat = {
        0: drug_num_feat,
    }
    nonzero_feat = {
        0: drug_nonzero_feat,
    }
    feat = {
        0: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                      feat=feat,
                                      edge_types=edge_types,
                                      batch_size=FLAGS.batch_size,
                                      val_test_size=val_test_size)
    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)
    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}

    ###########################################################
    #
    # Train model
    #
    ###########################################################
    print("Train model")
    for epoch in range(FLAGS.epochs):
        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])
                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "val_roc=", "{:.5f}".format(val_auc),
                      "val_auprc=", "{:.5f}".format(val_auprc),
                      "val_apk=", "{:.5f}".format(val_apk),
                      "time=", "{:.5f}".format(time.time() - t))
            itr += 1
    print("Optimization finished!")
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
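# ---------------------------------------------------------
# Hedged sketch: construct_placeholders is called above but not
# defined in this snippet. The sketch below is consistent with the
# Decagon reference implementation (TF 1.x); treat the exact key
# names as assumptions if your local copy differs.
# ---------------------------------------------------------
import tensorflow as tf  # TF 1.x

def construct_placeholders(edge_types):
    placeholders = {
        'batch': tf.placeholder(tf.int32, name='batch'),
        'batch_edge_type_idx': tf.placeholder(tf.int32, shape=(),
                                              name='batch_edge_type_idx'),
        'batch_row_edge_type': tf.placeholder(tf.int32, shape=(),
                                              name='batch_row_edge_type'),
        'batch_col_edge_type': tf.placeholder(tf.int32, shape=(),
                                              name='batch_col_edge_type'),
        'degrees': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
    }
    # one sparse placeholder per adjacency matrix, keyed by edge type
    placeholders.update({
        'adj_mats_%d,%d,%d' % (i, j, k): tf.sparse_placeholder(tf.float32)
        for i, j in edge_types for k in range(edge_types[i, j])})
    # one sparse placeholder per node-type feature matrix
    placeholders.update({
        'feat_%d' % i: tf.sparse_placeholder(tf.float32)
        for i, _ in edge_types})
    return placeholders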
def main_execution(combo_file='./polypharmacy/bio-decagon-combo.csv',
                   targets_file='./polypharmacy/bio-decagon-targets.csv',
                   genes_genes_file='./polypharmacy/bio-decagon-ppi.csv',
                   new_train_test_split=False):
    print('Load Combo to Side Effects')
    if combo_file.find('decagon') != -1:
        combo_to_drugs_ids, combo_to_side_effects, combo_to_side_effects_names, side_effects_ids_to_names = \
            load_decagon_combo_side_effect_file(fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id = load_decagon_file_targets_id(
            fichier=targets_file)
    else:
        combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
            fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id, drugs_id_to_drugs_name = load_file_targets_id(
            fichier=targets_file)

    print('Load genes to genes (targets) interactions net')
    genes_genes_net, genes_node_to_idx = load_genes_genes_interactions(
        fichier=genes_genes_file)
    print('Build genes-genes adjacency matrix')
    genes_adj = nx.adjacency_matrix(genes_genes_net)
    genes_degrees = np.array(genes_adj.sum(axis=0)).squeeze()

    if new_train_test_split:
        print('Load the new train test validation split')
        combo_to_drugs_ids_train, combo_to_drugs_ids_test, combo_to_drugs_ids_valid = \
            train_test_valid_split_3()
        drug_nodes_train = set(
            [u for e in combo_to_drugs_ids_train.values() for u in e])
        drug_nodes_test = set(
            [u for e in combo_to_drugs_ids_test.values() for u in e])
        drug_nodes_valid = set(
            [u for e in combo_to_drugs_ids_valid.values() for u in e])

    print('Build drugs-drugs matrix representation')
    drug_nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(drug_nodes)
    relation_types = set(
        [r for se in combo_to_side_effects.values() for r in se])
    drugs_nodes_to_idx = {node: i for i, node in enumerate(drug_nodes)}

    print('Build general drugs-drugs matrix representation')
    drug_drug_adj_list = []  # one adjacency matrix per drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(drug_nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if el in combo_to_side_effects[temp_cle]:
                    # the combo's value is a list of side effects: check that
                    # this one appears at least once, then record the
                    # interaction (symmetrically)
                    mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                        mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    if new_train_test_split:
        print('Build train drugs-drugs matrix representation')
        drug_drug_adj_list_train = []  # one adjacency matrix per drug-drug relation
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_train), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # side effect present in the combo's list:
                        # record the interaction (symmetrically)
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_train.append(sp.csr_matrix(mat))
        drug_degrees_list_train = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_train
        ]

        print('Build test drugs-drugs matrix representation')
        drug_drug_adj_list_test = []  # one adjacency matrix per drug-drug relation
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_test), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # side effect present in the combo's list:
                        # record the interaction (symmetrically)
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_test.append(sp.csr_matrix(mat))
        drug_degrees_list_test = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_test
        ]

        print('Build valid drugs-drugs matrix representation')
        drug_drug_adj_list_valid = []  # one adjacency matrix per drug-drug relation
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_valid), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # side effect present in the combo's list:
                        # record the interaction (symmetrically)
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
            drug_drug_adj_list_valid.append(sp.csr_matrix(mat))
        drug_degrees_list_valid = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_valid
        ]

    print('Build general genes-drugs matrix representation')
    genes_nodes = set([gene_node for gene_node in genes_node_to_idx.keys()])
    n_genes = len(genes_nodes)
    mat = np.zeros((n_genes, n_drugs))
    for drug in drug_nodes:
        if drug in drugs_id_to_targets_id.keys():
            for target in drugs_id_to_targets_id[drug]:
                if target in genes_node_to_idx.keys():
                    mat[genes_node_to_idx[target],
                        drugs_nodes_to_idx[drug]] = 1.
    genes_drugs_adj = sp.csr_matrix(mat)
    drugs_genes_adj = genes_drugs_adj.transpose(copy=True)

    if new_train_test_split:
        print('Build train genes-drugs matrix representation')
        mat = np.zeros((n_genes, n_drugs))  # start from a fresh matrix for this split
        for drug in drug_nodes_train:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_train = sp.csr_matrix(mat)
        drugs_genes_adj_train = genes_drugs_adj_train.transpose(copy=True)

        print('Build test genes-drugs matrix representation')
        mat = np.zeros((n_genes, n_drugs))  # start from a fresh matrix for this split
        for drug in drug_nodes_test:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_test = sp.csr_matrix(mat)
        drugs_genes_adj_test = genes_drugs_adj_test.transpose(copy=True)

        print('Build valid genes-drugs matrix representation')
        mat = np.zeros((n_genes, n_drugs))  # start from a fresh matrix for this split
        for drug in drug_nodes_valid:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_valid = sp.csr_matrix(mat)
        drugs_genes_adj_valid = genes_drugs_adj_valid.transpose(copy=True)

    print('Build general Adjacency matrix data representation')
    adj_mats_orig = {
        (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
        (0, 1): [genes_drugs_adj],
        (1, 0): [drugs_genes_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    if new_train_test_split:
        print('Build train Adjacency matrix data representation')
        adj_mats_orig_train = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_train],
            (1, 0): [drugs_genes_adj_train],
            (1, 1):
            drug_drug_adj_list_train +
            [x.transpose(copy=True) for x in drug_drug_adj_list_train],
        }
        print('Build test Adjacency matrix data representation')
        adj_mats_orig_test = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_test],
            (1, 0): [drugs_genes_adj_test],
            (1, 1):
            drug_drug_adj_list_test +
            [x.transpose(copy=True) for x in drug_drug_adj_list_test],
        }
        print('Build valid Adjacency matrix data representation')
        adj_mats_orig_valid = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_valid],
            (1, 0): [drugs_genes_adj_valid],
            (1, 1):
            drug_drug_adj_list_valid +
            [x.transpose(copy=True) for x in drug_drug_adj_list_valid],
        }

    degrees = {
        0: [genes_degrees, genes_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    print('featureless (genes)')
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    print('features (drugs)')
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    print('Features data representation')
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }
    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    if new_train_test_split:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIteratorNewSplit(
            adj_mats=adj_mats_orig,
            adj_mats_train=adj_mats_orig_train,
            adj_mats_test=adj_mats_orig_test,
            adj_mats_valid=adj_mats_orig_valid,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
    else:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                          feat=feat,
                                          edge_types=edge_types,
                                          batch_size=FLAGS.batch_size,
                                          val_test_size=val_test_size)
    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)
    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}

    ###########################################################
    #
    # Train model
    #
    ###########################################################
    print("Train model")
    for epoch in range(FLAGS.epochs):
        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)
            t = time.time()
            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]
            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])
                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "val_roc=", "{:.5f}".format(val_auc),
                      "val_auprc=", "{:.5f}".format(val_auprc),
                      "val_apk=", "{:.5f}".format(val_apk),
                      "time=", "{:.5f}".format(time.time() - t))
            itr += 1
    print("Optimization finished!")
    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
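# ---------------------------------------------------------
# Hedged sketch: the general/train/test/valid drug-drug loops in
# main_execution repeat the same construction four times. A
# hypothetical helper (build_drug_drug_adj_list, not in the
# original code) could factor it out; shown only as a possible
# refactor under the same list-of-side-effects semantics.
# ---------------------------------------------------------
from itertools import combinations

import numpy as np
import scipy.sparse as sp


def build_drug_drug_adj_list(drug_nodes_subset, relation_types,
                             combo_to_side_effects, drugs_nodes_to_idx,
                             n_drugs):
    """Build one symmetric CSR adjacency matrix per side-effect type."""
    adj_list = []
    for el in relation_types:
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(drug_nodes_subset), 2):
            key = '{}_{}'.format(d1, d2)
            # record the pair if this side effect appears in the combo's list
            if el in combo_to_side_effects.get(key, ()):
                i, j = drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]
                mat[i, j] = mat[j, i] = 1.
        adj_list.append(sp.csr_matrix(mat))
    return adj_list

# Usage (hypothetical):
# drug_drug_adj_list_train = build_drug_drug_adj_list(
#     drug_nodes_train, relation_types, combo_to_side_effects,
#     drugs_nodes_to_idx, n_drugs)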
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--decagon_data_file_directory",
        type=str,
        help="path to directory where bio-decagon-*.csv files are located, "
        "with trailing slash. Default is current directory",
        default='./')
    parser.add_argument(
        "--saved_files_directory",
        type=str,
        help="path to directory where saved files are located, with trailing "
        "slash. Default is current directory. If a decagon_model.ckpt* exists "
        "in this directory, it will be loaded and evaluated, and no training "
        "will be done.",
        default='./')
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true",
                        default=False)
    args = parser.parse_args(args)
    decagon_data_file_directory = args.decagon_data_file_directory
    verbose = args.verbose

    script_start_time = datetime.now()

    # create a pre-processed file that only keeps side effects with >= 500 occurrences
    all_combos_df = pd.read_csv('%sbio-decagon-combo.csv' %
                                decagon_data_file_directory)
    side_effects_500 = all_combos_df["Polypharmacy Side Effect"].value_counts()
    side_effects_500 = side_effects_500[side_effects_500 >= 500].index.tolist()
    all_combos_df = all_combos_df[
        all_combos_df["Polypharmacy Side Effect"].isin(side_effects_500)]
    all_combos_df.to_csv('%sbio-decagon-combo-over500only.csv' %
                         decagon_data_file_directory,
                         index=False)

    # use the pre-processed file that only contains the most common side
    # effects (those with >= 500 drug pairs)
    drug_drug_net, combo2stitch, combo2se, se2name = load_combo_se(
        fname=('%sbio-decagon-combo-over500only.csv' %
               decagon_data_file_directory))
    # net is a networkx graph with genes (proteins) as nodes and
    # protein-protein interactions as edges;
    # node2idx maps node id to node index
    gene_net, node2idx = load_ppi(fname=('%sbio-decagon-ppi.csv' %
                                         decagon_data_file_directory))
    # stitch2se maps (individual) stitch ids to a list of side effect ids;
    # se2name_mono maps side effect ids that occur in the mono file to side
    # effect names (shorter than se2name)
    stitch2se, se2name_mono = load_mono_se(fname=('%sbio-decagon-mono.csv' %
                                                  decagon_data_file_directory))
    # stitch2proteins maps stitch ids (drug) to protein (gene) ids
    drug_gene_net, stitch2proteins = load_targets(
        fname=('%sbio-decagon-targets-all.csv' % decagon_data_file_directory))
    # se2class maps side effect id to class name

    # this was 0.05 in the original code, but the paper says that 10% each
    # are used for testing and validation
    val_test_size = 0.1
    n_genes = gene_net.number_of_nodes()
    gene_adj = nx.adjacency_matrix(gene_net)
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

    ordered_list_of_drugs = list(drug_drug_net.nodes.keys())
    ordered_list_of_side_effects = list(se2name.keys())
    ordered_list_of_proteins = list(gene_net.nodes.keys())

    n_drugs = len(ordered_list_of_drugs)

    drug_gene_adj = sp.lil_matrix(np.zeros((n_drugs, n_genes)))
    for drug in stitch2proteins:
        for protein in stitch2proteins[drug]:
            # there are quite a few drugs in here that aren't in our list of
            # 645, and proteins that aren't in our list of 19081
            if drug in ordered_list_of_drugs and protein in ordered_list_of_proteins:
                drug_index = ordered_list_of_drugs.index(drug)
                gene_index = ordered_list_of_proteins.index(protein)
                drug_gene_adj[drug_index, gene_index] = 1

    drug_gene_adj = drug_gene_adj.tocsr()
    # needs to be a drug vs. gene matrix (645x19081)
    gene_drug_adj = drug_gene_adj.transpose(copy=True)

    drug_drug_adj_list = []
    if not os.path.isfile("adjacency_matrices/sparse_matrix0000.npz"):
        # pre-initialize all the matrices
        print("Initializing drug-drug adjacency matrix list")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))
        n = len(ordered_list_of_side_effects)
        for i in range(n):
            drug_drug_adj_list.append(
                sp.lil_matrix(np.zeros((n_drugs, n_drugs))))
            if verbose:
                print("%s percent done" % str(100.0 * i / n))
        print("Done initializing at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        combo_finish_time = start_time
        print("Creating adjacency matrices for side effects")
        print("Starting at %s" % str(start_time))
        combo_count = len(combo2se)
        combo_counter = 0
        # for side_effect_type in ordered_list_of_side_effects:
        #     for drug1, drug2 in combinations(list(range(n_drugs)), 2):
        for combo in combo2se.keys():
            side_effect_list = combo2se[combo]
            for present_side_effect in side_effect_list:
                # find the matrix we need to update
                side_effect_number = ordered_list_of_side_effects.index(
                    present_side_effect)
                # find the drugs for which we need to make the update
                drug_tuple = combo2stitch[combo]
                drug1_index = ordered_list_of_drugs.index(drug_tuple[0])
                drug2_index = ordered_list_of_drugs.index(drug_tuple[1])
                # update
                drug_drug_adj_list[side_effect_number][drug1_index,
                                                       drug2_index] = 1
            if verbose and combo_counter % 1000 == 0:
                print("Finished combo %s after %s. %d percent of combos done" %
                      (combo_counter, str(combo_finish_time - start_time),
                       (100.0 * combo_counter / combo_count)))
                combo_finish_time = datetime.now()
            combo_counter = combo_counter + 1
        print("Done creating adjacency matrices at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        print("Saving matrices to file")
        print("Starting at %s" % str(start_time))
        # save matrices to file
        if not os.path.isdir("adjacency_matrices"):
            os.mkdir("adjacency_matrices")
        for i in range(len(drug_drug_adj_list)):
            sp.save_npz('adjacency_matrices/sparse_matrix%04d.npz' % (i, ),
                        drug_drug_adj_list[i].tocoo())
        print("Done saving matrices to file at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
    else:
        print("Loading adjacency matrices from file.")
        for i in range(len(ordered_list_of_side_effects)):
            drug_drug_adj_list.append(
                sp.load_npz('adjacency_matrices/sparse_matrix%04d.npz' % i))

    for i in range(len(drug_drug_adj_list)):
        drug_drug_adj_list[i] = drug_drug_adj_list[i].tocsr()

    start_time = datetime.now()
    print("Setting up for training")
    print("Starting at %s" % str(start_time))

    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    # data representation
    global adj_mats_orig
    adj_mats_orig = {
        # protein-protein interactions (and inverses)
        (0, 0): [gene_adj, gene_adj.transpose(copy=True)],
        # protein-drug relationships (inverse of targets)
        (0, 1): [gene_drug_adj],
        # drug-protein relationships (targets)
        (1, 0): [drug_gene_adj],
        # This creates an "inverse" relationship for every polypharmacy side
        # effect, using the transpose of the relationship's adjacency matrix,
        # resulting in 2x the number of side effects (and adjacency matrices).
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    # featureless (genes)
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # features (drugs)
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }
    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    global num_edge_types
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    ###########################################################
    #
    # Settings and placeholders
    #
    ###########################################################
    # Important -- Do not evaluate/print validation performance every
    # iteration, as it can take a substantial amount of time
    PRINT_PROGRESS_EVERY = 10000
    print("Defining placeholders")
    construct_placeholders(edge_types)  # populates the module-level placeholders used below

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################
    global minibatch_iterator
    iterator_pickle_file_name = args.saved_files_directory + "minibatch_iterator.pickle"
    if os.path.isfile(iterator_pickle_file_name):
        print("Load minibatch iterator pickle")
        with open(iterator_pickle_file_name, 'rb') as pickle_file:
            minibatch_iterator = pickle.load(pickle_file)
    else:
        print("Create minibatch iterator")
        minibatch_iterator = EdgeMinibatchIterator(
            adj_mats=adj_mats_orig,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    global optimizer
    with tf.name_scope('optimizer'):
        optimizer = DecagonOptimizer(embeddings=model.embeddings,
                                     latent_inters=model.latent_inters,
                                     latent_varies=model.latent_varies,
                                     degrees=degrees,
                                     edge_types=edge_types,
                                     edge_type2dim=edge_type2dim,
                                     placeholders=placeholders,
                                     batch_size=FLAGS.batch_size,
                                     margin=FLAGS.max_margin)
    print("Done setting up at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Initialize session")
    global sess
    sess = tf.Session()

    decagon_model_file_name = args.saved_files_directory + "decagon_model.ckpt"
    saved_model_available = os.path.isfile(decagon_model_file_name + ".index")
    if saved_model_available:
        saver = tf.train.Saver()
        saver.restore(sess, decagon_model_file_name)
        print("Model restored.")
    if not saved_model_available:
        print("Training model")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))
        sess.run(tf.global_variables_initializer())
        feed_dict = {}

        ###########################################################
        #
        # Train model
        #
        ###########################################################
        saver = tf.train.Saver()
        print("Train model")
        epoch_losses = []
        for epoch in range(FLAGS.epochs):
            minibatch_iterator.shuffle()
            itr = 0
            while not minibatch_iterator.end():
                # Construct feed dictionary
                feed_dict = minibatch_iterator.next_minibatch_feed_dict(
                    placeholders=placeholders)
                feed_dict = minibatch_iterator.update_feed_dict(
                    feed_dict=feed_dict,
                    dropout=FLAGS.dropout,
                    placeholders=placeholders)
                t = time.time()
                # Training step: run single weight update
                outs = sess.run([
                    optimizer.opt_op, optimizer.cost,
                    optimizer.batch_edge_type_idx
                ],
                                feed_dict=feed_dict)
                train_cost = outs[1]
                batch_edge_type = outs[2]
                if itr % PRINT_PROGRESS_EVERY == 0:
                    val_auc, val_auprc, val_apk = get_accuracy_scores(
                        minibatch_iterator.val_edges,
                        minibatch_iterator.val_edges_false,
                        minibatch_iterator.idx2edge_type[
                            minibatch_iterator.current_edge_type_idx],
                        feed_dict)
                    print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                          "%04d" % (itr + 1), "Edge:",
                          "%04d" % batch_edge_type, "train_loss=",
                          "{:.5f}".format(train_cost), "val_roc=",
                          "{:.5f}".format(val_auc), "val_auprc=",
                          "{:.5f}".format(val_auprc), "val_apk=",
                          "{:.5f}".format(val_apk), "time=",
                          "{:.5f}".format(time.time() - t))
                itr += 1
            validation_loss = get_validation_loss(
                edges_pos=minibatch_iterator.val_edges,
                edges_neg=minibatch_iterator.val_edges_false,
                feed_dict=feed_dict)
            print("Epoch:", "%04d" % (epoch + 1),
                  "Validation loss (average cross entropy): {}".format(
                      validation_loss))
            epoch_losses.append(validation_loss)
            # early stopping: stop once the rounded validation loss has not
            # improved for three consecutive epochs
            if len(epoch_losses) >= 3:
                if round(epoch_losses[-1], 3) >= round(
                        epoch_losses[-2], 3) >= round(epoch_losses[-3], 3):
                    break
            print("Saving model after epoch:", epoch)
            save_path = saver.save(
                sess, args.saved_files_directory + "decagon_model" +
                str(epoch) + ".ckpt")
            print("Model saved in path: %s" % save_path)
        print("Optimization finished!")
        print("Done training model %s after %s" %
              (datetime.now(), datetime.now() - start_time))
        print("Saving model")
        save_path = saver.save(sess, decagon_model_file_name)
        print("Model saved in path: %s" % save_path)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    start_time = datetime.now()
    print("Evaluating model")
    print("Starting at %s" % str(start_time))
    for edge_type in range(num_edge_types):
        # get all edges in test set with this type
        feed_dict = minibatch_iterator.test_feed_dict(
            edge_type, placeholders=placeholders)
        feed_dict = minibatch_iterator.update_feed_dict(
            feed_dict, FLAGS.dropout, placeholders)
        edge_tuple = minibatch_iterator.idx2edge_type[edge_type]
        _, _, all_scores, all_labels, subjects, predicates, objects = get_predictions(
            edges_pos=minibatch_iterator.test_edges,
            edges_neg=minibatch_iterator.test_edges_false,
            edge_type=edge_tuple,
            feed_dict=feed_dict)
        print("subject\tpredicate\tobject\tpredicted\tactual")
        for i in range(len(all_scores)):
            subject = subjects[i]
            if edge_tuple[0] == 1:
                subject = ordered_list_of_drugs[subject]
            else:
                subject = ordered_list_of_proteins[subject]
            obj = objects[i]
            if edge_tuple[1] == 1:
                obj = ordered_list_of_drugs[obj]
            else:
                obj = ordered_list_of_proteins[obj]
            predicate = predicates[i]
            if edge_tuple[:2] == (1, 1):
                # drug-drug edges: the second half of adj_mats_orig[(1, 1)]
                # holds the transposed ("inverse") relations, so indices past
                # the number of kept side effects map back with an offset and
                # get a "_2" suffix
                side_effect_index = edge_tuple[2]
                is_inverse = False
                if side_effect_index >= 963:
                    side_effect_index = side_effect_index - 963
                    is_inverse = True
                predicate = ordered_list_of_side_effects[side_effect_index]
                if is_inverse:
                    predicate = predicate + "_2"
            print("{}\t{}\t{}\t{}\t{}".format(subject, predicate, obj,
                                              all_scores[i], all_labels[i]))
        print()
    print("Done evaluating at %s after %s" %
          (datetime.now(), datetime.now() -
           start_time))
    print("Script running time: %s" % (datetime.now() - script_start_time))
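# ---------------------------------------------------------
# Hedged note: the hardcoded 963 above appears to be the number of
# side-effect types kept after the >= 500-occurrence filter, i.e.
# the length of the forward half of adj_mats_orig[(1, 1)]. Deriving
# the offset instead of hardcoding it would keep the mapping correct
# if the filter changes; a minimal sketch with a hypothetical helper:
# ---------------------------------------------------------
def split_relation_index(side_effect_index, n_relations):
    """Map an index into the [forward + transposed] relation list back to
    the forward index plus an inverse flag (hypothetical helper)."""
    if side_effect_index >= n_relations:
        return side_effect_index - n_relations, True
    return side_effect_index, False

# Usage (hypothetical), with n_relations = len(ordered_list_of_side_effects):
# side_effect_index, is_inverse = split_relation_index(edge_tuple[2], n_relations)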
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

print("Create optimizer")
with tf.compat.v1.name_scope('optimizer'):
    opt = DecagonOptimizer(
        embeddings=model.embeddings,
        latent_inters=model.latent_inters,
        latent_varies=model.latent_varies,
        degrees=degrees,
        edge_types=edge_types,
        edge_type2dim=edge_type2dim,
        placeholders=placeholders,
        batch_size=PARAMS['batch_size'],
        margin=PARAMS['max_margin']
    )

print("Initialize session")
sess = tf.compat.v1.Session()
sess.run(tf.compat.v1.global_variables_initializer())
feed_dict = {}

###########################################################
#
# Train model
#
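# ---------------------------------------------------------
# Hedged sketch: this tf.compat.v1 variant reads PARAMS, which is
# not defined in the snippet. A minimal definition mirroring the
# FLAGS values used elsewhere in this code (512 and 0.1; the other
# entries are assumptions), plus the eager-execution switch that
# TF1-style placeholder/Session code needs when run under TF 2.x:
# ---------------------------------------------------------
import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # required for placeholders/Session under TF 2.x

PARAMS = {
    'batch_size': 512,  # assumed, matching FLAGS.batch_size elsewhere
    'max_margin': 0.1,  # assumed, matching FLAGS.max_margin elsewhere
    'epochs': 50,       # assumed default
    'dropout': 0.1,     # assumed default
}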