Example #1
    def _minibatch_iterator_init(self, path_to_split: str, batch_size: int,
                                 val_test_size: float) -> NoReturn:
        """
        Create minibatch iterator (self.minibatch).

        Parameters
        ----------
        path_to_split : str
            Path for saving train, test and validation edges.
            If it already contains the needed edges, they are loaded;
            otherwise they are computed and saved.
        batch_size : int
            Minibatch size.
        val_test_size : float
            Proportion of edges held out for validation and for test.

        """
        print('Create minibatch iterator')
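        # The split directory is expected to contain exactly six files
        # (presumably the train/val/test edge arrays, real and fake);
        # if it does not, the edges are resampled and saved.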
        need_sample_edges = not (os.path.isdir(path_to_split) and
                                 len(os.listdir(path_to_split)) == 6)
        self.minibatch = EdgeMinibatchIterator(
            adj_mats=self.adj_mats,
            feat=self.feat,
            edge_types=self.edge_types,
            symmetry_types_groups=self.symmetry_types_groups,
            batch_size=batch_size,
            val_test_size=val_test_size,
            path_to_split=path_to_split,
            need_sample_edges=need_sample_edges
        )
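    # A minimal usage sketch (hypothetical values; in Example #7 this method
    # is called from RunDecagon.run):
    # self._minibatch_iterator_init(path_to_split='./split',
    #                               batch_size=512, val_test_size=0.1)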
Example #2
flags.DEFINE_boolean('bias', True, 'Bias term.')

print('Defining placeholders')
placeholders = construct_placeholders(edge_types)

###########################################################
#
# Create minibatch iterator, model and optimizer
#
###########################################################

print('Create minibatch iterator')
minibatch = EdgeMinibatchIterator(
    adj_mats=adj_mats_orig,
    feat=feat,
    edge_types=edge_types,
    directed=edge_type2directed,
    batch_size=FLAGS.batch_size
)

print('Create model')
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

print('Create optimizer')
with tf.name_scope('optimizer'):
Example #3
    flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0.001,
                       'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
    flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss')
    flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
    flags.DEFINE_boolean('bias', True, 'Bias term.')

    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                      feat=feat,
                                      edge_types=edge_types,
                                      batch_size=FLAGS.batch_size,
                                      val_test_size=val_test_size)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
Example #4
def main_execution():
    combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
        fichier='polypharmacy/drugbank/drugbank-combo.csv')
    nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(nodes)
    relation_types = set(combo_to_side_effects.values())
    n_drugdrug_rel_types = len(relation_types)
    drugs_to_positions_in_matrices_dict = {
        node: i
        for i, node in enumerate(nodes)
    }

    drug_drug_adj_list = []  # adjacency matrix for each drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if combo_to_side_effects[temp_cle] == el:
                    # whenever there is a real side effect between the 2 drugs, mark it in the matrix
                    mat[drugs_to_positions_in_matrices_dict[d1], drugs_to_positions_in_matrices_dict[d2]] = \
                        mat[drugs_to_positions_in_matrices_dict[d2], drugs_to_positions_in_matrices_dict[d1]] = 1.
                    # record an interaction
        drug_drug_adj_list.append(sp.csr_matrix(mat))
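    # A faster, equivalent construction sketch (assumptions: combo keys are
    # '{d1}_{d2}' strings as above; rel_to_idx and fast_mats are illustrative
    # names): one pass over the combo dict instead of O(n_drugs^2) pair
    # checks per side effect.
    rel_to_idx = {r: i for i, r in enumerate(relation_types)}
    fast_mats = [sp.lil_matrix((n_drugs, n_drugs)) for _ in relation_types]
    for cle, se in combo_to_side_effects.items():
        d1, d2 = cle.split('_')
        r, c = drugs_to_positions_in_matrices_dict[d1], drugs_to_positions_in_matrices_dict[d2]
        fast_mats[rel_to_idx[se]][r, c] = fast_mats[rel_to_idx[se]][c, r] = 1.
    # fast_mats[k].tocsr() then corresponds to drug_drug_adj_list[k] (and also
    # catches keys stored in the reverse orientation, which the loop above misses).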
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    adj_mats_orig = {
        (0, 0):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
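    # Each relation is paired with its transpose, so the (0, 0) list holds
    # twice as many adjacency matrices as there are side-effect relations.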
    degrees = {
        0: drug_degrees_list + drug_degrees_list,
    }

    # features (drugs)
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())
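    # sp.identity gives one-hot ("featureless") drug features; sparse_to_tuple
    # converts the COO matrix into the (coords, values, shape) triple used to
    # feed the sparse placeholders.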

    # data representation
    num_feat = {
        0: drug_num_feat,
    }
    nonzero_feat = {
        0: drug_nonzero_feat,
    }
    feat = {
        0: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'dedicom',
    }
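    # 'dedicom' shares one global interaction matrix across all side-effect
    # relations with per-relation diagonal factors, whereas 'bilinear' learns
    # an independent matrix per relation (see the Decagon paper).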

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)
    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                      feat=feat,
                                      edge_types=edge_types,
                                      batch_size=FLAGS.batch_size,
                                      val_test_size=val_test_size)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)

    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}

    ###########################################################
    #
    # Train model
    #
    ###########################################################

    print("Train model")
    for epoch in range(FLAGS.epochs):

        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)

            t = time.time()

            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]

            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])

                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost), "val_roc=",
                      "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc), "val_apk=",
                      "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))

            itr += 1

    print("Optimization finished!")

    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
Example #5
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    # Important -- Do not evaluate/print validation performance every iteration, as it can take
    # a substantial amount of time
    PRINT_PROGRESS_EVERY = 20

    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    print("Create minibatch iterator")
    minibatch = EdgeMinibatchIterator(
        adj_mats=adj_mats_orig,
        seed=seed,
        feat=feat,
        edge_types=edge_types,
        data_set=data_set,
        batch_size=FLAGS.batch_size,
        val_test_size=val_test_size,
    )

    print("Create model")
    model = DecagonModel(
        data_set=data_set,
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )
def main_execution(combo_file='./polypharmacy/bio-decagon-combo.csv',
                   targets_file='./polypharmacy/bio-decagon-targets.csv',
                   genes_genes_file='./polypharmacy/bio-decagon-ppi.csv',
                   new_train_test_split=False):
    print('Load Combo to Side Effects')
    if 'decagon' in combo_file:
        combo_to_drugs_ids, combo_to_side_effects, combo_to_side_effects_names, side_effects_ids_to_names = \
            load_decagon_combo_side_effect_file(fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id = load_decagon_file_targets_id(
            fichier=targets_file)
    else:
        combo_to_drugs_ids, combo_to_side_effects = load_drug_bank_combo_side_effect_file(
            fichier=combo_file)
        print('Load drugs to targets')
        drugs_id_to_targets_id, drugs_id_to_drugs_name = load_file_targets_id(
            fichier=targets_file)

    print('Load genes to genes (targets) interactions net')
    genes_genes_net, genes_node_to_idx = load_genes_genes_interactions(
        fichier=genes_genes_file)

    print('Build genes-genes adjacency matrix')
    genes_adj = nx.adjacency_matrix(genes_genes_net)
    genes_degrees = np.array(genes_adj.sum(axis=0)).squeeze()

    if new_train_test_split:
        print('Load the new train test validation split')
        combo_to_drugs_ids_train, combo_to_drugs_ids_test, combo_to_drugs_ids_valid = \
            train_test_valid_split_3()
        drug_nodes_train = set(
            [u for e in combo_to_drugs_ids_train.values() for u in e])
        drug_nodes_test = set(
            [u for e in combo_to_drugs_ids_test.values() for u in e])
        drug_nodes_valid = set(
            [u for e in combo_to_drugs_ids_valid.values() for u in e])

    print('Build drugs-drugs matrix representation')
    drug_nodes = set([u for e in combo_to_drugs_ids.values() for u in e])
    n_drugs = len(drug_nodes)
    relation_types = set(
        [r for se in combo_to_side_effects.values() for r in se])
    drugs_nodes_to_idx = {node: i for i, node in enumerate(drug_nodes)}

    print('Build general drugs-drugs matrix representation')
    drug_drug_adj_list = []  # adjacency matrix for each drug-drug relation
    for i, el in enumerate(relation_types):  # for each side effect
        mat = np.zeros((n_drugs, n_drugs))
        for d1, d2 in combinations(list(drug_nodes), 2):
            temp_cle = '{}_{}'.format(d1, d2)
            if temp_cle in combo_to_side_effects.keys():
                if el in combo_to_side_effects[temp_cle]:
                    # values are lists; check whether the side effect appears at least once in the list
                    mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                        mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
                    # record an interaction
        drug_drug_adj_list.append(sp.csr_matrix(mat))
    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    if new_train_test_split:
        print('Build train drugs-drugs matrix representation')
        drug_drug_adj_list_train = []  # adjacency matrix for each drug-drug relation
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_train), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # values are lists; check whether the side effect appears at least once in the list
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
                    # record an interaction
            drug_drug_adj_list_train.append(sp.csr_matrix(mat))
        drug_degrees_list_train = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_train
        ]

        print('Build test drugs-drugs matrix representation')
        drug_drug_adj_list_test = []  # adjacency matrix for each drug-drug relation
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_test), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # values are lists; check whether the side effect appears at least once in the list
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
                    # record an interaction
            drug_drug_adj_list_test.append(sp.csr_matrix(mat))
        drug_degrees_list_test = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_test
        ]

        print('Build valid drugs-drugs matrix representation')
        drug_drug_adj_list_valid = []  # adjacency matrix for each drug-drug relation
        for i, el in enumerate(relation_types):  # for each side effect
            mat = np.zeros((n_drugs, n_drugs))
            for d1, d2 in combinations(list(drug_nodes_valid), 2):
                temp_cle = '{}_{}'.format(d1, d2)
                if temp_cle in combo_to_side_effects.keys():
                    if el in combo_to_side_effects[temp_cle]:
                        # values are lists; check whether the side effect appears at least once in the list
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
                    # record an interaction
            drug_drug_adj_list_valid.append(sp.csr_matrix(mat))
        drug_degrees_list_valid = [
            np.array(drug_adj.sum(axis=0)).squeeze()
            for drug_adj in drug_drug_adj_list_valid
        ]
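        # The three blocks above repeat one construction; a helper like this
        # sketch (build_dd_adj is a hypothetical name) would remove the
        # triplication, e.g.:
        # drug_drug_adj_list_train, drug_degrees_list_train = build_dd_adj(drug_nodes_train)
        def build_dd_adj(nodes):
            mats = []
            for el in relation_types:  # for each side effect
                mat = np.zeros((n_drugs, n_drugs))
                for d1, d2 in combinations(list(nodes), 2):
                    cle = '{}_{}'.format(d1, d2)
                    if el in combo_to_side_effects.get(cle, ()):
                        mat[drugs_nodes_to_idx[d1], drugs_nodes_to_idx[d2]] = \
                            mat[drugs_nodes_to_idx[d2], drugs_nodes_to_idx[d1]] = 1.
                mats.append(sp.csr_matrix(mat))
            return mats, [np.array(m.sum(axis=0)).squeeze() for m in mats]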

    print('Build general genes-drugs matrix representation')
    genes_nodes = set(genes_node_to_idx.keys())
    n_genes = len(genes_nodes)
    mat = np.zeros((n_genes, n_drugs))
    for drug in drug_nodes:
        if drug in drugs_id_to_targets_id.keys():
            for target in drugs_id_to_targets_id[drug]:
                if target in genes_node_to_idx.keys():
                    mat[genes_node_to_idx[target],
                        drugs_nodes_to_idx[drug]] = 1.
    genes_drugs_adj = sp.csr_matrix(mat)
    drugs_genes_adj = genes_drugs_adj.transpose(copy=True)

    if new_train_test_split:
        print('Build train genes-drugs matrix representation')
        mat = np.zeros((n_genes, n_drugs))  # reset; otherwise the general matrix's entries leak into this split
        for drug in drug_nodes_train:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_train = sp.csr_matrix(mat)
        drugs_genes_adj_train = genes_drugs_adj_train.transpose(copy=True)

        print('Build test genes-drugs matrix representation')
        mat = np.zeros((n_genes, n_drugs))  # reset; otherwise earlier splits' entries leak in
        for drug in drug_nodes_test:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_test = sp.csr_matrix(mat)
        drugs_genes_adj_test = genes_drugs_adj_test.transpose(copy=True)

        print('Build valid genes-drugs matrix representation')
        mat = np.zeros((n_genes, n_drugs))  # reset; otherwise earlier splits' entries leak in
        for drug in drug_nodes_valid:
            if drug in drugs_id_to_targets_id.keys():
                for target in drugs_id_to_targets_id[drug]:
                    if target in genes_node_to_idx.keys():
                        mat[genes_node_to_idx[target],
                            drugs_nodes_to_idx[drug]] = 1.
        genes_drugs_adj_valid = sp.csr_matrix(mat)
        drugs_genes_adj_valid = genes_drugs_adj_valid.transpose(copy=True)

    print('Build general Adjacency matrix data representation')
    adj_mats_orig = {
        (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
        (0, 1): [genes_drugs_adj],
        (1, 0): [drugs_genes_adj],
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
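    # Keys are (row node type, col node type) pairs; the (1, 1) entry pairs
    # each side-effect relation with its transpose, doubling the matrix count.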

    if new_train_test_split:
        print('Build train Adjacency matrix data representation')
        adj_mats_orig_train = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_train],
            (1, 0): [drugs_genes_adj_train],
            (1, 1):
            drug_drug_adj_list_train +
            [x.transpose(copy=True) for x in drug_drug_adj_list_train],
        }

        print('Build test Adjacency matrix data representation')
        adj_mats_orig_test = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_test],
            (1, 0): [drugs_genes_adj_test],
            (1, 1):
            drug_drug_adj_list_test +
            [x.transpose(copy=True) for x in drug_drug_adj_list_test],
        }

        print('Build valid Adjacency matrix data representation')
        adj_mats_orig_valid = {
            (0, 0): [genes_adj, genes_adj.transpose(copy=True)],
            (0, 1): [genes_drugs_adj_valid],
            (1, 0): [drugs_genes_adj_valid],
            (1, 1):
            drug_drug_adj_list_valid +
            [x.transpose(copy=True) for x in drug_drug_adj_list_valid],
        }

    degrees = {
        0: [genes_degrees, genes_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    print('featureless (genes)')
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    print('features (drugs)')
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    print('Features data representation')
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    print("Defining placeholders")
    placeholders = construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    if new_train_test_split:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIteratorNewSplit(
            adj_mats=adj_mats_orig,
            adj_mats_train=adj_mats_orig_train,
            adj_mats_test=adj_mats_orig_test,
            adj_mats_valid=adj_mats_orig_valid,
            feat=feat,
            edge_types=edge_types,
            batch_size=FLAGS.batch_size,
            val_test_size=val_test_size)
    else:
        print("Create minibatch iterator")
        minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                          feat=feat,
                                          edge_types=edge_types,
                                          batch_size=FLAGS.batch_size,
                                          val_test_size=val_test_size)

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    with tf.name_scope('optimizer'):
        opt = DecagonOptimizer(embeddings=model.embeddings,
                               latent_inters=model.latent_inters,
                               latent_varies=model.latent_varies,
                               degrees=degrees,
                               edge_types=edge_types,
                               edge_type2dim=edge_type2dim,
                               placeholders=placeholders,
                               batch_size=FLAGS.batch_size,
                               margin=FLAGS.max_margin)

    print("Initialize session")
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feed_dict = {}

    ###########################################################
    #
    # Train model
    #
    ###########################################################

    print("Train model")
    for epoch in range(FLAGS.epochs):

        minibatch.shuffle()
        itr = 0
        while not minibatch.end():
            # Construct feed dictionary
            feed_dict = minibatch.next_minibatch_feed_dict(
                placeholders=placeholders)
            feed_dict = minibatch.update_feed_dict(feed_dict=feed_dict,
                                                   dropout=FLAGS.dropout,
                                                   placeholders=placeholders)

            t = time.time()

            # Training step: run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx],
                            feed_dict=feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]

            if itr % PRINT_PROGRESS_EVERY == 0:
                val_auc, val_auprc, val_apk = get_accuracy_scores(
                    feed_dict, placeholders, sess, opt, minibatch,
                    adj_mats_orig, minibatch.val_edges,
                    minibatch.val_edges_false,
                    minibatch.idx2edge_type[minibatch.current_edge_type_idx])

                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost), "val_roc=",
                      "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc), "val_apk=",
                      "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))

            itr += 1

    print("Optimization finished!")

    for et in range(num_edge_types):
        roc_score, auprc_score, apk_score = get_accuracy_scores(
            feed_dict, placeholders, sess, opt, minibatch, adj_mats_orig,
            minibatch.test_edges, minibatch.test_edges_false,
            minibatch.idx2edge_type[et])
        print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
        print("Edge type:", "%04d" % et, "Test AUROC score",
              "{:.5f}".format(roc_score))
        print("Edge type:", "%04d" % et, "Test AUPRC score",
              "{:.5f}".format(auprc_score))
        print("Edge type:", "%04d" % et, "Test AP@k score",
              "{:.5f}".format(apk_score))
        print()
Example #7
class RunDecagon(metaclass=ABCMeta):
    """
    Abstract class of Decagon runner.
    Different subclasses define specific behavior
    (e.g. running on synthetic or real data).


    Attributes
    ----------
    adj_mats : Dict[Tuple[int, int], List[sp.csr_matrix]]
        From edge type to the list of adjacency matrices, one per edge class
        (e.g. (1, 1): list of drug-drug adjacency matrices, one per side-effect class).
        In our case all matrices in adj_mats are symmetric.
    degrees : Dict[int, List[int]]
        Number of connections for each node (0: genes, 1: drugs).

    edge_type2dim : Dict[Tuple[int, int], List[Tuple[int, int]]]
        From edge type to the list of shapes of all its adjacency matrices.
    edge_type2decoder : Dict[Tuple[int, int], str]
        From edge type to decoder type
        (we use different decompositions for different edge types).
    edge_types : Dict[Tuple[int, int], int]
        From edge type to the number of classes of this edge type
        (e.g. (1, 1): number of side effects).
    num_edge_types : int
        Number of all edge types (considering all classes).
    symmetry_types_groups : List[List]
        Should contains lists with len in {1, 2}.
        All types of edges splits into groups of symmetry.
        E. g. symmetry_types_groups = [[(0, 0)], [(0, 1), (1, 0)], [(1, 1)]].
        Two types from one group of symmetry have same edges, differing only in direction
        (e.g (0, 1) has protein -> drug edges and (1, 0) has drug -> protein edges).

    num_feat : Dict[int, int]
        Number of elements in the feature vector (0: genes, 1: drugs).
    nonzero_feat : Dict[int, int]
        Total number of features for gene (0) and drug (1) nodes.
    feat : Dict[int, sp.csr_matrix]
        From node type (0 = gene, 1 = drug) to feature matrix.
        Each row of the feature matrix is the embedding of one node.

    minibatch : EdgeMinibatchIterator
        Minibatch iterator.
    placeholders : Dict[str, tf.compat.v1.placeholder]
        Variables for input data in decagon model.
    model : DecagonModel
        Decagon model (encoder + decoder).
    opt : DecagonOptimizer
        Optimizer of Decagon weights.
    """

    def __init__(self):
        self.adj_mats = None
        self.degrees = None
        self.num_feat = None
        self.nonzero_feat = None
        self.feat = None
        self.edge_type2dim = None
        self.edge_type2decoder = None
        self.edge_types = None
        self.num_edge_types = None

        self.minibatch = None
        self.opt = None
        self.placeholders = None
        self.model = None
        self.feed_dict = None


    def _adjacency(self, adj_path: str) -> NoReturn:
        """
        Create self.adj_mats, self.degrees.

        Parameters
        ----------
        adj_path : str
            Path for saving/loading adjacency matrices.

        Notes
        -----
        self.adj_mats: Dict[Tuple[int, int], List[sp.csr_matrix]]
            From edge type to the list of adjacency matrices, one per edge class
            (e.g. (1, 1): list of drug-drug adjacency matrices, one per side-effect class).
            In our case all matrices in adj_mats are symmetric.
        self.degrees: Dict[int, List[int]]
            Number of connections for each node (0: genes, 1: drugs).

        """
        raise NotImplementedError()

    def _nodes_features(self) -> NoReturn:
        """
        Create self.num_feat, self.nonzero_feat, self.feat.

        Notes
        -----
        self.num_feat : Dict[int, int]
            Number of elements in the feature vector (0: genes, 1: drugs).
        self.nonzero_feat : Dict[int, int]
            Total number of features for gene (0) and drug (1) nodes.
            All features should be nonzero (TODO: what to do with zero features?).
            E.g. in the format {0: number of genes in the graph, 1: number of drugs}.
        self.feat : Dict[int, sp.csr_matrix]
            From node type (0 = gene, 1 = drug) to feature matrix.
            Each row of the feature matrix is the embedding of one node.

        """
        raise NotImplementedError()

    def _edge_types_info(self) -> NoReturn:
        """
        Create self.edge_type2dim, self.edge_type2decoder, self.edge_types,
        self.num_edge_types.

        Notes
        -----
        self.edge_type2dim : Dict[Tuple[int, int], List[Tuple[int, int]]]
            From edge type to the list of shapes of all its adjacency matrices.
        self.edge_type2decoder : Dict[Tuple[int, int], str]
            From edge type to decoder type
            (we use different decompositions for different edge types).
        self.edge_types : Dict[Tuple[int, int], int]
            From edge type to the number of classes of this edge type
            (e.g. (1, 1): number of side effects).
        self.num_edge_types : int
            Number of all edge types (considering all classes).

        """
        self.edge_type2dim = {k: [adj.shape for adj in adjs] for k, adjs in
                              self.adj_mats.items()}
        self.edge_type2decoder = {
            (0, 0): 'bilinear',
            (0, 1): 'bilinear',
            (1, 0): 'bilinear',
            (1, 1): 'dedicom',
        }
        self.symmetry_types_groups = [
            [(0, 0)],
            [(0, 1), (1, 0)],
            [(1, 1)]
        ]

        self.edge_types = {k: len(v) for k, v in self.adj_mats.items()}
        self.num_edge_types = sum(self.edge_types.values())
        print(f'Edge types {self.num_edge_types}')
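        # E.g. for the gene/drug graph described in the class docstring:
        # edge_types = {(0, 0): 2, (0, 1): 1, (1, 0): 1, (1, 1): 2 * n_se},
        # and num_edge_types sums all these class counts.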

    def _minibatch_iterator_init(self, path_to_split: str, batch_size: int,
                                 val_test_size: float) -> NoReturn:
        """
        Create minibatch iterator (self.minibatch).

        Parameters
        ----------
        path_to_split : str
            Path for saving train, test and validation edges.
            If it already contains the needed edges, they are loaded;
            otherwise they are computed and saved.
        batch_size : int
            Minibatch size.
        val_test_size : float
            Proportion of edges held out for validation and for test.

        """
        print('Create minibatch iterator')
        need_sample_edges = not (os.path.isdir(path_to_split) and
                                 len(os.listdir(path_to_split)) == 6)
        self.minibatch = EdgeMinibatchIterator(
            adj_mats=self.adj_mats,
            feat=self.feat,
            edge_types=self.edge_types,
            symmetry_types_groups=self.symmetry_types_groups,
            batch_size=batch_size,
            val_test_size=val_test_size,
            path_to_split=path_to_split,
            need_sample_edges=need_sample_edges
        )

    def _construct_placeholders(self) -> NoReturn:
        """
        Create self.placeholders.

        Notes
        -----
        Placeholders are the input-data mechanism in TF1.

        """
        print("Defining placeholders")
        self.placeholders = {
            'batch': tf.compat.v1.placeholder(tf.int32, name='batch'),
            'batch_edge_type_idx':
                tf.compat.v1.placeholder(tf.int32, shape=(),
                                         name='batch_edge_type_idx'),
            'batch_row_edge_type':
                tf.compat.v1.placeholder(tf.int32, shape=(),
                                         name='batch_row_edge_type'),
            'batch_col_edge_type':
                tf.compat.v1.placeholder(tf.int32, shape=(),
                                         name='batch_col_edge_type'),
            'degrees': tf.compat.v1.placeholder(tf.int32),
            'dropout': tf.compat.v1.placeholder_with_default(0., shape=()),
        }

        adj_placeholders = {'adj_mats_%d,%d,%d' % (i, j, k):
                                tf.compat.v1.sparse_placeholder(tf.float32)
                            for i, j in self.edge_types
                            for k in range(self.edge_types[i, j])}
        self.placeholders.update(adj_placeholders)

        features_placeholders = {'feat_%d' % i:
                                     tf.compat.v1.sparse_placeholder(tf.float32)
                                 for i, _ in self.edge_types}
        self.placeholders.update(features_placeholders)
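        # The resulting dict holds one sparse placeholder per edge class,
        # keyed like 'adj_mats_0,0,0' ... 'adj_mats_1,1,k', plus one
        # 'feat_%d' placeholder per node type (here 'feat_0' and 'feat_1').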

    def _model_init(self) -> NoReturn:
        """
        Create self.model.

        """
        print("Create model")
        self.model = DecagonModel(
            placeholders=self.placeholders,
            num_feat=self.num_feat,
            nonzero_feat=self.nonzero_feat,
            edge_types=self.edge_types,
            decoders=self.edge_type2decoder,
        )

    def _optimizer_init(self, batch_size: int, max_margin: float) -> NoReturn:
        """
        Create self.opt.

        Parameters
        ----------
        batch_size : int
            Minibatch size.
        max_margin : float
            Max margin parameter in hinge loss.

        """
        print("Create optimizer")
        with tf.compat.v1.name_scope('optimizer'):
            self.opt = DecagonOptimizer(
                embeddings=self.model.embeddings,
                latent_inters=self.model.latent_inters,
                latent_varies=self.model.latent_varies,
                degrees=self.degrees,
                edge_types=self.edge_types,
                edge_type2dim=self.edge_type2dim,
                placeholders=self.placeholders,
                batch_size=batch_size,
                margin=max_margin
            )

    def _get_accuracy_scores(self, sess: tf.compat.v1.Session,
                             edges_pos: Dict[Tuple[int, int], List[np.array]],
                             edges_neg: Dict[Tuple[int, int], List[np.array]],
                             edge_type: Tuple[int, int, int]):
        """
        Calculate metrics (AUROC, AUPRC, AP@50).

        Parameters
        ----------
        sess : tf.compat.v1.Session
            Initialized tf session.
        edges_pos : Dict[Tuple[int, int], List[np.array]]
            From edge type to np.arrays of real edges for every edge class in this type.
        edges_neg : Dict[Tuple[int, int], List[np.array]]
            From edge type to np.arrays of fake edges for every edge class in this type.
        edge_type : Tuple[int, int, int]
            Edge type with class: the first two elements give the edge type,
            the last the class within this type.

        Returns
        -------
        Tuple[float, float, float]
            AUROC, AUPRC and AP@50 scores.

        """
        self.feed_dict.update({self.placeholders['dropout']: 0})
        self.feed_dict.update({self.placeholders['batch_edge_type_idx']:
                                   self.minibatch.edge_type2idx[edge_type]})
        self.feed_dict.update({self.placeholders['batch_row_edge_type']: edge_type[0]})
        self.feed_dict.update({self.placeholders['batch_col_edge_type']: edge_type[1]})

        rec = sess.run(self.opt.predictions, feed_dict=self.feed_dict)

        uv = edges_pos[edge_type[:2]][edge_type[2]]
        u = uv[:, 0]
        v = uv[:, 1]
        preds = expit(rec[u, v])
        assert np.all(self.adj_mats[edge_type[:2]][edge_type[2]][u, v] == 1), \
            'Positive examples (real edges) do not exist'

        uv = edges_neg[edge_type[:2]][edge_type[2]]
        u = uv[:, 0]
        v = uv[:, 1]
        preds_neg = expit(rec[u, v])
        assert np.all(self.adj_mats[edge_type[:2]][edge_type[2]][u, v] == 0), \
            'Negative examples (fake edges) are real'

        # Predicted probs
        preds_all = np.hstack([preds, preds_neg])
        # preds_all = np.nan_to_num(preds_all)
        # Real probs: 1 for pos, 0 for neg
        labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
        roc_sc = metrics.roc_auc_score(labels_all, preds_all)
        aupr_sc = metrics.average_precision_score(labels_all, preds_all)

        # Real existing edges (local indexes)
        actual = range(len(preds))
        # All local indexes with probability (sorted)
        predicted = sorted(range(len(preds_all)), reverse=True,
                           key=lambda i: preds_all[i])
        apk_sc = rank_metrics.apk(actual, predicted, k=50)

        return roc_sc, aupr_sc, apk_sc

    def _run_epoch(self, sess: tf.compat.v1.Session, dropout: float,
                   print_progress_every: int, epoch: int, log: bool
                   ) -> NoReturn:
        """
        Run one epoch.

        Parameters
        ----------
        sess : tf.compat.v1.Session
            Initialized tf session.
        dropout : float
            Dropout rate (1 - keep probability).
        print_progress_every : int
            Print statistic every print_progress_every iterations.
        epoch : int
            Number of current epoch (for printing statistic).
        log : bool
            Whether to log or not.
        """
        self.minibatch.shuffle()
        for batch_edges, current_edge_type, current_edge_type_idx in self.minibatch:
            # Construct feed dictionary
            self.feed_dict = self.minibatch.batch_feed_dict(
                batch_edges=batch_edges,
                batch_edge_type=current_edge_type_idx,
                dropout=dropout,
                placeholders=self.placeholders)

            t = time.time()

            # Training step: run single weight update
            outs = sess.run([self.opt.opt_op, self.opt.cost,
                             self.opt.batch_edge_type_idx],
                            feed_dict=self.feed_dict)
            train_cost = outs[1]
            batch_edge_type = outs[2]

            if self.minibatch.iter % print_progress_every == 0:
                val_auc, val_auprc, val_apk = self._get_accuracy_scores(
                    sess, self.minibatch.val_edges,
                    self.minibatch.val_edges_false,
                    current_edge_type)

                print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                      "%04d" % (self.minibatch.iter + 1), "Edge:", "%04d" % batch_edge_type,
                      "train_loss=", "{:.5f}".format(train_cost),
                      "val_roc=", "{:.5f}".format(val_auc), "val_auprc=",
                      "{:.5f}".format(val_auprc),
                      "val_apk=", "{:.5f}".format(val_apk), "time=",
                      "{:.5f}".format(time.time() - t))
                if log:
                    import neptune
                    neptune.log_metric("val_roc", val_auc,
                                       timestamp=time.time())
                    neptune.log_metric("val_apk", val_apk,
                                       timestamp=time.time())
                    neptune.log_metric("val_auprc", val_auprc,
                                       timestamp=time.time())
                    neptune.log_metric("train_loss", train_cost,
                                       timestamp=time.time())

    def run(self, adj_path: str, path_to_split: str, val_test_size: float,
            batch_size: int, num_epochs: int, dropout: float, max_margin: float,
            print_progress_every: int, log: bool, on_cpu: bool, seed: int = 123,
            upload_saved: bool = False) -> NoReturn:
        """
        Run Decagon.

        Parameters
        ----------
        upload_saved : bool
            Default = False.
            Whether to restore a previously saved model before training.
        adj_path : str
            Path for saving/loading adjacency matrices.
        path_to_split : str
            Path for saving train, test and validation edges.
            If it already contains the needed edges, they are loaded;
            otherwise they are computed and saved.
        batch_size : int
            Minibatch size.
        val_test_size : float
            Proportion of edges held out for validation and for test.
        num_epochs : int
            Number of training epochs.
        dropout : float
            Dropout rate (1 - keep probability).
        print_progress_every : int
            Print statistic every print_progress_every iterations.
        log : bool
            Whether to log or not.
        on_cpu : bool
            Run on CPU instead of GPU.
        max_margin : float
            Max margin parameter in hinge loss.
        seed : int
            Random seed.

        """
        np.random.seed(seed)
        # check if all path exists
        if adj_path and not os.path.exists(adj_path):
            os.makedirs(adj_path)

        if not os.path.exists(path_to_split):
            os.makedirs(path_to_split)

        if not os.path.exists(os.path.dirname(MODEL_SAVE_PATH)):
            os.makedirs(os.path.dirname(MODEL_SAVE_PATH))

        if on_cpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = ""

        self._adjacency(adj_path)
        self._nodes_features()
        self._edge_types_info()
        self._construct_placeholders()
        self._minibatch_iterator_init(path_to_split, batch_size, val_test_size)
        self._model_init()
        self._optimizer_init(batch_size, max_margin)
        print("Initialize session")
        saver = tf.compat.v1.train.Saver()
        sess = tf.compat.v1.Session()
        sess.run(tf.compat.v1.global_variables_initializer())
        self.feed_dict = {}

        if upload_saved:
            saver.restore(sess, MODEL_TO_UPLOAD)
            sess.run(tf.compat.v1.global_variables_initializer())
            self.minibatch.shuffle()
            for batch_edges, current_edge_type, current_edge_type_idx in self.minibatch:
                # Construct feed dictionary
                self.feed_dict = self.minibatch.batch_feed_dict(
                    batch_edges=batch_edges,
                    batch_edge_type=current_edge_type_idx,
                    dropout=dropout,
                    placeholders=self.placeholders)
            saver.restore(sess, MODEL_SAVE_PATH)

        dir_to_save_model = f"{MODEL_SAVE_PATH}/model_{datetime.now().isoformat()[:-7]}"
        os.makedirs(dir_to_save_model, exist_ok=True)
        for epoch in range(num_epochs):
            self._run_epoch(sess, dropout, print_progress_every, epoch, log)
            saver.save(sess, f"{dir_to_save_model}/epoch_{epoch}.ckpt")
        print("Optimization finished!")
        for et in range(self.num_edge_types):
            roc_score, auprc_score, apk_score = self._get_accuracy_scores(
                sess, self.minibatch.test_edges,
                self.minibatch.test_edges_false,
                self.minibatch.idx2edge_type[et])
            print("Edge type=",
                  "[%02d, %02d, %02d]" % self.minibatch.idx2edge_type[et])
            print("Edge type:", "%04d" % et, "Test AUROC score",
                  "{:.5f}".format(roc_score))
            print("Edge type:", "%04d" % et, "Test AUPRC score",
                  "{:.5f}".format(auprc_score))
            print("Edge type:", "%04d" % et, "Test AP@k score",
                  "{:.5f}".format(apk_score))
            print()
            if log:
                import neptune
                neptune.log_metric("ROC-AUC", roc_score)
                neptune.log_metric("AUPRC", auprc_score)
                neptune.log_metric("AP@k score", apk_score)
Example #8
print('\n==== IMPORTED VARIABLES ====')
with open(in_file, 'rb') as f:
    DS = pickle.load(f)
    for key in DS.keys():
        globals()[key] = DS[key]
        print(key, "Imported successfully")
print('\n')
n_genes = len(gene2idx)
n_drugs = len(drug2idx)
n_se_combo = len(se_combo_name2idx)
# ============================================================================================= #
# CREATE MINIBATCH
print("Create minibatch iterator\n")
minibatch = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                  feat=feat,
                                  edge_types=edge_types,
                                  batch_size=args.batch_size,
                                  val_test_size=args.val_test_size)
# ============================================================================================= #
# EXPORT DATA
out_file = ('data/data_structures/MINIBATCH/MINIBATCH_' + words[2] +
            '_genes_' + str(n_genes) + '_drugs_' + str(n_drugs) +
            '_se_' + str(n_se_combo) + '_batchsize_' + str(args.batch_size) +
            '_valsize_' + str(args.val_test_size))
print('Output file: ', out_file, '\n')
memUse = ps.memory_info()
data = {}
data['minibatch'] = minibatch
data['mb_vms'] = memUse.vms
data['mb_rss'] = memUse.rss
data['mb_time'] = time.time() - start
with open(out_file, 'wb') as f:
Example #9
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--decagon_data_file_directory",
        type=str,
        help=
        "path to directory where bio-decagon-*.csv files are located, with trailing slash. "
        "Default is current directory",
        default='./')
    parser.add_argument(
        "--saved_files_directory",
        type=str,
        help=
        "path to directory where saved files files are located, with trailing slash. "
        "Default is current directory. If a decagon_model.ckpt* exists in this directory, it will "
        "be loaded and evaluated, and no training will be done.",
        default='./')
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true",
                        default=False)
    args = parser.parse_args(args)

    decagon_data_file_directory = args.decagon_data_file_directory
    verbose = args.verbose
    script_start_time = datetime.now()

    # create pre-processed file that only has side effects with >=500 occurrences
    all_combos_df = pd.read_csv('%sbio-decagon-combo.csv' %
                                decagon_data_file_directory)
    side_effects_500 = all_combos_df["Polypharmacy Side Effect"].value_counts()
    side_effects_500 = side_effects_500[side_effects_500 >= 500].index.tolist()
    all_combos_df = all_combos_df[
        all_combos_df["Polypharmacy Side Effect"].isin(side_effects_500)]
    all_combos_df.to_csv('%sbio-decagon-combo-over500only.csv' %
                         decagon_data_file_directory,
                         index=False)

    # use the pre-processed file that only contains the most common side effects (those with >= 500 drug pairs)
    drug_drug_net, combo2stitch, combo2se, se2name = load_combo_se(
        fname=('%sbio-decagon-combo-over500only.csv' %
               decagon_data_file_directory))
    # net is a networkx graph with genes(proteins) as nodes and protein-protein-interactions as edges
    # node2idx maps node id to node index
    gene_net, node2idx = load_ppi(fname=('%sbio-decagon-ppi.csv' %
                                         decagon_data_file_directory))
    # stitch2se maps (individual) stitch ids to a list of side effect ids
    # se2name_mono maps side effect ids that occur in the mono file to side effect names (shorter than se2name)
    stitch2se, se2name_mono = load_mono_se(fname=('%sbio-decagon-mono.csv' %
                                                  decagon_data_file_directory))
    # stitch2proteins maps stitch ids (drug) to protein (gene) ids
    drug_gene_net, stitch2proteins = load_targets(
        fname=('%sbio-decagon-targets-all.csv' % decagon_data_file_directory))
    # se2class maps side effect id to class name

    # this was 0.05 in the original code, but the paper says that 10% each are used for testing and validation
    val_test_size = 0.1
    n_genes = gene_net.number_of_nodes()
    gene_adj = nx.adjacency_matrix(gene_net)
    gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()

    ordered_list_of_drugs = list(drug_drug_net.nodes.keys())
    ordered_list_of_side_effects = list(se2name.keys())
    ordered_list_of_proteins = list(gene_net.nodes.keys())

    n_drugs = len(ordered_list_of_drugs)

    drug_gene_adj = sp.lil_matrix(np.zeros((n_drugs, n_genes)))
    for drug in stitch2proteins:
        for protein in stitch2proteins[drug]:
            # there are quite a few drugs in here that aren't in our list of 645,
            # and proteins that aren't in our list of 19081
            if drug in ordered_list_of_drugs and protein in ordered_list_of_proteins:
                drug_index = ordered_list_of_drugs.index(drug)
                gene_index = ordered_list_of_proteins.index(protein)
                drug_gene_adj[drug_index, gene_index] = 1

    drug_gene_adj = drug_gene_adj.tocsr()

    # transpose to get the gene vs. drug matrix (19081 x 645)
    gene_drug_adj = drug_gene_adj.transpose(copy=True)
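    # Note: the .index() calls above (and in the combo loop below) cost O(n)
    # per lookup; precomputed maps would make the construction linear overall
    # (a sketch with hypothetical names):
    # drug2idx = {d: i for i, d in enumerate(ordered_list_of_drugs)}
    # protein2idx = {p: i for i, p in enumerate(ordered_list_of_proteins)}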

    drug_drug_adj_list = []
    if not os.path.isfile("adjacency_matrices/sparse_matrix0000.npz"):
        # pre-initialize all the matrices
        print("Initializing drug-drug adjacency matrix list")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))

        n = len(ordered_list_of_side_effects)
        for i in range(n):
            drug_drug_adj_list.append(
                sp.lil_matrix(np.zeros((n_drugs, n_drugs))))
            if verbose:
                print("%s percent done" % str(100.0 * i / n))
        print("Done initializing at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        combo_finish_time = start_time
        print("Creating adjacency matrices for side effects")
        print("Starting at %s" % str(start_time))
        combo_count = len(combo2se)
        combo_counter = 0


        for combo in combo2se.keys():
            side_effect_list = combo2se[combo]
            for present_side_effect in side_effect_list:
                # find the matrix we need to update
                side_effect_number = ordered_list_of_side_effects.index(
                    present_side_effect)
                # find the drugs for which we need to make the update
                drug_tuple = combo2stitch[combo]
                drug1_index = ordered_list_of_drugs.index(drug_tuple[0])
                drug2_index = ordered_list_of_drugs.index(drug_tuple[1])
                # update
                drug_drug_adj_list[side_effect_number][drug1_index,
                                                       drug2_index] = 1

            if verbose and combo_counter % 1000 == 0:
                print(
                    "Finished combo %s after %s . %d percent of combos done" %
                    (combo_counter, str(combo_finish_time - start_time),
                     (100.0 * combo_counter / combo_count)))
            combo_finish_time = datetime.now()
            combo_counter = combo_counter + 1

        print("Done creating adjacency matrices at %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        start_time = datetime.now()
        print("Saving matrices to file")
        print("Starting at %s" % str(start_time))

        # save matrices to file
        if not os.path.isdir("adjacency_matrices"):
            os.mkdir("adjacency_matrices")
        for i in range(len(drug_drug_adj_list)):
            sp.save_npz('adjacency_matrices/sparse_matrix%04d.npz' % (i, ),
                        drug_drug_adj_list[i].tocoo())
        print("Done saving matrices to file at %s after %s" %
              (datetime.now(), datetime.now() - start_time))
    else:
        print("Loading adjacency matrices from file.")
        for i in range(len(ordered_list_of_side_effects)):
            drug_drug_adj_list.append(
                sp.load_npz('adjacency_matrices/sparse_matrix%04d.npz' % i))

    for i in range(len(drug_drug_adj_list)):
        drug_drug_adj_list[i] = drug_drug_adj_list[i].tocsr()

    start_time = datetime.now()
    print("Setting up for training")
    print("Starting at %s" % str(start_time))

    drug_degrees_list = [
        np.array(drug_adj.sum(axis=0)).squeeze()
        for drug_adj in drug_drug_adj_list
    ]

    # data representation
    global adj_mats_orig
    adj_mats_orig = {
        (0, 0): [gene_adj, gene_adj.transpose(copy=True)
                 ],  # protein-protein interactions (and inverses)
        (0, 1):
        [gene_drug_adj],  # protein-drug relationships (inverse of targets)
        (1, 0): [drug_gene_adj],  # drug-protein relationships (targets)
        # This creates an "inverse" relationship for every polypharmacy side effect, using the transpose of the
        # relationship's adjacency matrix, resulting in 2x the number of side effects (and adjacency matrices).
        (1, 1):
        drug_drug_adj_list +
        [x.transpose(copy=True) for x in drug_drug_adj_list],
    }
    degrees = {
        0: [gene_degrees, gene_degrees],
        1: drug_degrees_list + drug_degrees_list,
    }

    # featureless (genes)
    gene_feat = sp.identity(n_genes)
    gene_nonzero_feat, gene_num_feat = gene_feat.shape
    gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

    # features (drugs)
    drug_feat = sp.identity(n_drugs)
    drug_nonzero_feat, drug_num_feat = drug_feat.shape
    drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

    # data representation
    num_feat = {
        0: gene_num_feat,
        1: drug_num_feat,
    }
    nonzero_feat = {
        0: gene_nonzero_feat,
        1: drug_nonzero_feat,
    }
    feat = {
        0: gene_feat,
        1: drug_feat,
    }

    edge_type2dim = {
        k: [adj.shape for adj in adjs]
        for k, adjs in adj_mats_orig.items()
    }
    edge_type2decoder = {
        (0, 0): 'bilinear',
        (0, 1): 'bilinear',
        (1, 0): 'bilinear',
        (1, 1): 'dedicom',
    }

    edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
    global num_edge_types
    num_edge_types = sum(edge_types.values())
    print("Edge types:", "%d" % num_edge_types)

    ###########################################################
    #
    # Settings and placeholders
    #
    ###########################################################

    # Important -- Do not evaluate/print validation performance every iteration, as it can take
    # a substantial amount of time
    PRINT_PROGRESS_EVERY = 10000

    print("Defining placeholders")
    construct_placeholders(edge_types)

    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    global minibatch_iterator
    iterator_pickle_file_name = args.saved_files_directory + "minibatch_iterator.pickle"
    if os.path.isfile(iterator_pickle_file_name):
        print("Load minibatch iterator pickle")
        with open(iterator_pickle_file_name, 'rb') as pickle_file:
            minibatch_iterator = pickle.load(pickle_file)
    else:
        print("Create minibatch iterator")
        minibatch_iterator = EdgeMinibatchIterator(adj_mats=adj_mats_orig,
                                                   feat=feat,
                                                   edge_types=edge_types,
                                                   batch_size=FLAGS.batch_size,
                                                   val_test_size=val_test_size)
        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)
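
    # Reusing the pickled iterator also freezes its train/validation/test edge
    # split, so a model restored below is evaluated on the same held-out edges
    # it was originally trained against.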

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")
    global optimizer
    with tf.name_scope('optimizer'):
        optimizer = DecagonOptimizer(embeddings=model.embeddings,
                                     latent_inters=model.latent_inters,
                                     latent_varies=model.latent_varies,
                                     degrees=degrees,
                                     edge_types=edge_types,
                                     edge_type2dim=edge_type2dim,
                                     placeholders=placeholders,
                                     batch_size=FLAGS.batch_size,
                                     margin=FLAGS.max_margin)

    print("Done setting up at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Initialize session")
    global sess
    sess = tf.Session()

    decagon_model_file_name = args.saved_files_directory + "decagon_model.ckpt"
    saved_model_available = os.path.isfile(decagon_model_file_name + ".index")
    if saved_model_available:
        saver = tf.train.Saver()
        saver.restore(sess, decagon_model_file_name)
        print("Model restored.")
    else:
        print("Training model")
        start_time = datetime.now()
        print("Starting at %s" % str(start_time))

        sess.run(tf.global_variables_initializer())
        feed_dict = {}

        ###########################################################
        #
        # Train model
        #
        ###########################################################

        saver = tf.train.Saver()

        print("Train model")
        epoch_losses = []
        for epoch in range(FLAGS.epochs):

            minibatch_iterator.shuffle()
            itr = 0
            while not minibatch_iterator.end():
                # Construct feed dictionary
                feed_dict = minibatch_iterator.next_minibatch_feed_dict(
                    placeholders=placeholders)
                feed_dict = minibatch_iterator.update_feed_dict(
                    feed_dict=feed_dict,
                    dropout=FLAGS.dropout,
                    placeholders=placeholders)

                t = time.time()

                # Training step: run single weight update
                outs = sess.run(
                    [optimizer.opt_op, optimizer.cost,
                     optimizer.batch_edge_type_idx],
                    feed_dict=feed_dict)
                train_cost = outs[1]
                batch_edge_type = outs[2]

                if itr % PRINT_PROGRESS_EVERY == 0:
                    val_auc, val_auprc, val_apk = get_accuracy_scores(
                        minibatch_iterator.val_edges,
                        minibatch_iterator.val_edges_false,
                        minibatch_iterator.idx2edge_type[
                            minibatch_iterator.current_edge_type_idx],
                        feed_dict)

                    print("Epoch:", "%04d" % (epoch + 1), "Iter:",
                          "%04d" % (itr + 1), "Edge:",
                          "%04d" % batch_edge_type, "train_loss=",
                          "{:.5f}".format(train_cost), "val_roc=",
                          "{:.5f}".format(val_auc), "val_auprc=",
                          "{:.5f}".format(val_auprc), "val_apk=",
                          "{:.5f}".format(val_apk), "time=",
                          "{:.5f}".format(time.time() - t))

                itr += 1
            validation_loss = get_validation_loss(
                edges_pos=minibatch_iterator.val_edges,
                edges_neg=minibatch_iterator.val_edges_false,
                feed_dict=feed_dict)
            print(
                "Epoch:", "%04d" % (epoch + 1),
                "Validation loss (average cross entropy): {}".format(
                    validation_loss))

            epoch_losses.append(validation_loss)
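            # Early stopping: halt once the validation loss, rounded to three
            # decimals, has failed to improve for three consecutive epochs.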
            if len(epoch_losses) >= 3:
                if round(epoch_losses[-1], 3) >= round(
                        epoch_losses[-2], 3) >= round(epoch_losses[-3], 3):
                    break

            print("Saving model after epoch:", epoch)
            save_path = saver.save(
                sess, args.saved_files_directory + "decagon_model" +
                str(epoch) + ".ckpt")
            print("Model saved in path: %s" % save_path)

        print("Optimization finished!")
        print("Done training model %s after %s" %
              (datetime.now(), datetime.now() - start_time))

        print("Saving model")
        save_path = saver.save(sess, decagon_model_file_name)
        print("Model saved in path: %s" % save_path)

        print("Pickling minibatch iterator")
        with open(iterator_pickle_file_name, 'wb') as pickle_file:
            pickle.dump(minibatch_iterator, pickle_file)

    start_time = datetime.now()
    print("Evaluating model")
    print("Starting at %s" % str(start_time))

    for edge_type in range(num_edge_types):
        # get all edges in test set with this type
        feed_dict = minibatch_iterator.test_feed_dict(
            edge_type, placeholders=placeholders)
        feed_dict = minibatch_iterator.update_feed_dict(
            feed_dict, FLAGS.dropout, placeholders)
        edge_tuple = minibatch_iterator.idx2edge_type[edge_type]
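        # idx2edge_type maps the flat edge-type index back to a
        # (source_node_type, target_node_type, relation_index) triple; the
        # unpacking below relies on that layout.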

        _, _, all_scores, all_labels, subjects, predicates, objects = get_predictions(
            edges_pos=minibatch_iterator.test_edges,
            edges_neg=minibatch_iterator.test_edges_false,
            edge_type=edge_tuple,
            feed_dict=feed_dict)

        print("subject\tpredicate\tobject\tpredicted\tactual")
        for i in range(len(all_scores)):
            subject = subjects[i]
            if edge_tuple[0] == 1:
                subject = ordered_list_of_drugs[subject]
            else:
                subject = ordered_list_of_proteins[subject]

            object = objects[i]
            if edge_tuple[1] == 1:
                object = ordered_list_of_drugs[object]
            else:
                object = ordered_list_of_proteins[object]

            predicate = predicates[i]
            if edge_tuple[:2] == (1, 1):
                side_effect_index = edge_tuple[2]
                is_inverse = False
                if side_effect_index >= 963:
                    side_effect_index = side_effect_index - 963
                    is_inverse = True
                predicate = ordered_list_of_side_effects[side_effect_index]
                if is_inverse:
                    predicate = predicate + "_2"

            print("{}\t{}\t{}\t{}\t{}".format(subject, predicate, object,
                                              all_scores[i], all_labels[i]))

    print()

    print("Done evaluating at %s after %s" %
          (datetime.now(), datetime.now() - start_time))

    print("Script running time: %s" % (datetime.now() - script_start_time))
Example #10
0
File: main.py Project: karl-crl/decagon
    ###########################################################
    #
    # Create minibatch iterator, model and optimizer
    #
    ###########################################################

    print("Create minibatch iterator")
    path_to_split = f'data/split/{val_test_size}'
    need_sample_edges = not (os.path.isdir(path_to_split) and
                             len(os.listdir(path_to_split)) == 6)
    minibatch = EdgeMinibatchIterator(
        adj_mats=adj_mats_orig,
        feat=feat,
        edge_types=edge_types,
        batch_size=PARAMS['batch_size'],
        val_test_size=val_test_size,
        path_to_split=path_to_split,
        need_sample_edges=need_sample_edges
    )
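    # The directory check above reuses a previously saved split when all six
    # edge files are present (presumably train/val/test x positive/negative
    # sets); otherwise the iterator resamples the edges and saves them.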

    print("Create model")
    model = DecagonModel(
        placeholders=placeholders,
        num_feat=num_feat,
        nonzero_feat=nonzero_feat,
        edge_types=edge_types,
        decoders=edge_type2decoder,
    )

    print("Create optimizer")