Example #1
def running_one_time(n, dropout, seed, n_conv_feat, lr, l2_regu, nb_layers,
                     ord_row):
    """
    function to run the architecture one time
    Inputs:
       n: number of training iterations to run for this initialization; each iteration continues from the weights and biases of the previous one,
       seed: seed to use for the random sampling between training, testing and validation sets,
       n_conv_feat: number of weights to use for the GCNN layer. Default value = 36,
       l2_regu: coefficient to use in front of the l2 regularization term. Default value = 1,
       dropout: dropout rate on the GCNN output. Default = 0.5,
       lr: learning rate. Default value = 0.001
       nb_layers: number of GCNN layers
       ord_row: number of Chebyshev polynomials for the GCNN layers
    Outputs:
       auc_test_list, auc_train_list, auc_val_list: lists of the AUC values on the test, train and validation sets for the n runs,
       pred_train_list, pred_test_list, pred_val_list: lists of the prediction values on the test, train and validation sets for the n runs,
       labels_test_reduce, labels_train_reduce, labels_val_reduce: labels of the test, train and validation sets, restricted to their own indices
    """
    #initialization of the matrix, of the training, testing and validation sets
    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)
    new_labels_train = np.copy(labels)
    new_labels_train[idx_testing] = -1
    #creation of a validation set from the training set. Split the training set into 4 parts, one for validation
    training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
        3, M_str, seed, new_labels_train)

    labels_train = labels * training_set_mask
    labels_test = labels * testing_set_mask
    labels_val = labels * validation_set_mask

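    # indices to drop from each full-length vector so that only the entries of
    # the corresponding split remain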
    indexes_train = np.concatenate((idx_testing, idx_validation), axis=0)
    indexes_validation = np.concatenate((idx_training, idx_testing), axis=0)
    indexes_test = np.concatenate((idx_training, idx_validation), axis=0)

    labels_test_reduce = np.delete(labels_test, indexes_test, 0)
    labels_train_reduce = np.delete(labels_train, indexes_train, 0)
    labels_val_reduce = np.delete(labels_val, indexes_validation, 0)

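    # learning_obj builds the TensorFlow graph for training/evaluation; Lrow_* are
    # presumably the row-graph Laplacians of the age / sex / age-and-sex population
    # graphs, with A_* and mask_* the matching adjacency and selection matrices
    # defined at module level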
    learning_obj = Train_test_matrix_completion(M,
                                                Lrow_age,
                                                Lrow_sex,
                                                Lrow_agesex,
                                                A_age,
                                                A_sex,
                                                A_sexage,
                                                mask_age,
                                                mask_sex,
                                                mask_agesex,
                                                mask_nosignificance,
                                                labels_train,
                                                labels_test,
                                                labels_val,
                                                testing_set_mask,
                                                training_set_mask,
                                                validation_set_mask,
                                                order_chebyshev_row=ord_row,
                                                n_conv_feat=n_conv_feat,
                                                dropout=dropout,
                                                learning_rate=lr,
                                                l2_regu=l2_regu,
                                                nb_layers=nb_layers)

    num_iter_test = 10
    num_total_iter_training = n
    num_iter = 0

    list_training_loss = list()
    list_training_norm_grad = list()
    list_test_pred_error = list()

    auc_train_list = []
    pred_train_list = []
    auc_test_list = []
    pred_test_list = []
    auc_val_list = []
    pred_val_list = []

    for k in range(num_iter, num_total_iter_training):

        tic = time.time()
        # run of the algorithm on the training set
        list_of_outputs = learning_obj.session.run(
            [
                learning_obj.optimizer, learning_obj.loss,
                learning_obj.norm_grad, learning_obj.classification_train,
                learning_obj.binary_entropy
            ] + learning_obj.var_grad
        )  #learning_obj.loss_frob, learning_obj.loss_trace_row, learning_obj.frob_norm_H, learning_obj.frob_norm_W, learning_obj.binary_entropy
        current_training_loss = list_of_outputs[1]
        norm_grad = list_of_outputs[2]
        pred_train = list_of_outputs[3]

        pred_train = np.delete(pred_train, indexes_train, 0)

        fpr_train, tpr_train, thresholds_train = roc_curve(
            labels_train_reduce, pred_train)
        roc_auc_train = auc(fpr_train, tpr_train)

        # the gradients from var_grad start after the 5 fixed outputs listed above
        X_grad = list_of_outputs[5:]
        training_time = time.time() - tic

        list_training_loss.append(current_training_loss)
        list_training_norm_grad.append(norm_grad)

        if (np.mod(num_iter, num_iter_test) == 0):
            msg = "[TRN] iter = %03i, cost = %3.2e, |grad| = %.2e (%3.2es), AUC = %3.2e" \
                                        % (num_iter, list_training_loss[-1], list_training_norm_grad[-1], training_time, roc_auc_train)
            print(msg)

            auc_train_list.append(roc_auc_train)
            pred_train_list.append(pred_train)

            tic = time.time()
            # run of the algorithm on the validation set
            pred_val = learning_obj.session.run(
                [learning_obj.classification_val])  #
            test_time = time.time() - tic
            pred_val = np.delete(pred_val[0], indexes_validation, 0)
            fpr, tpr, thresholds = roc_curve(labels_val_reduce, pred_val)
            roc_auc_val = auc(fpr, tpr)
            auc_val_list.append(roc_auc_val)
            pred_val_list.append(pred_val)

            msg = "[VAL] iter = %03i, AUC = %3.2e" % (num_iter, roc_auc_val)
            print(msg)

            tic = time.time()
            # run of the algorithm on the test set
            pred_error, pred = learning_obj.session.run([
                learning_obj.predictions_error,
                learning_obj.classification_test
            ])  #
            test_time = time.time() - tic

            pred = np.delete(pred, indexes_test, 0)
            list_test_pred_error.append(pred_error)
            fpr, tpr, thresholds = roc_curve(labels_test_reduce, pred)
            roc_auc = auc(fpr, tpr)
            auc_test_list.append(roc_auc)
            pred_test_list.append(pred)

            msg = "[TST] iter = %03i, cost = %3.2e, AUC = %3.2e" % (
                num_iter, list_test_pred_error[-1], roc_auc)
            print(msg)

        num_iter += 1
    return (auc_test_list, auc_train_list, auc_val_list, pred_train_list,
            pred_test_list, pred_val_list, labels_test_reduce,
            labels_train_reduce, labels_val_reduce)
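

# Hypothetical usage sketch for the function above: it calls running_one_time
# with the defaults documented in its docstring; nb_layers and ord_row have no
# documented defaults, so the values below are illustrative assumptions. It
# relies on the module-level data prepared elsewhere in this file (M, M_str,
# labels, the Lrow_*/A_*/mask_* graph matrices, ...).
def example_running_one_time(seed=0, n=200):
    return running_one_time(n=n,
                            dropout=0.5,
                            seed=seed,
                            n_conv_feat=36,
                            lr=0.001,
                            l2_regu=1,
                            nb_layers=2,
                            ord_row=5)
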
def run(seed, gamma, beta, hidden, lr, NB_EPOCH=300):
    """
    Main function. Run the architecture for the initialization defined by seed and by the hyperparameters gamma, beta, hidden, lr
    Inputs:
        seed: seed used for the random training/testing/validation split,
        gamma, beta, hidden, lr: hyperparameters of the architecture
        NB_EPOCH: number of training epochs (one weight update per epoch). Default value = 300
    Outputs:
        auc_test, auc_train, auc_val: AUC on the test, train and validation sets
    """
    tf.reset_default_graph()
    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)
    #create a training and test mask on the data
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str,
                                                nrRows, nrCols)
    Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows,
                                            nrCols)

    new_labels_train = np.copy(labels)
    new_labels_train[idx_testing] = -1
    #split train set into 4 parts to create a validation set
    training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
        3, M_str, seed, new_labels_train)
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str,
                                                nrRows, nrCols)
    Ovalidation = preprocessing_dataset.load_mask(validation_set_mask, M_str,
                                                  nrRows, nrCols)

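    # extend the masks by one column (the training split mask for Otraining, zeros
    # for the test/validation supports) and sum them into Osupport_t, the mask
    # later applied to the transposed supports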
    Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
    Ocol = np.zeros((Otest.shape[0], 1))
    Otest_support = np.concatenate((Otest, Ocol), axis=1)
    Ovalidation_support = np.concatenate((Ovalidation, Ocol), axis=1)
    Osupport_t = Otraining + Otest_support + Ovalidation_support
    Ovalidation = np.concatenate((Ovalidation, validation_set_mask), axis=1)
    Otest = np.concatenate((Otest, testing_set_mask), axis=1)

    u_features, v_features, train_labels, train_u_indices, train_v_indices, val_labels, val_u_indices, val_v_indices, test_labels, test_u_indices, test_v_indices = load_data_monti_tadpole(
        M, Otraining, Otest, Ovalidation)

    m, n = M.shape

    # global normalization
    support = []
    support_t = []

    # one support graph per subgroup: all women, all men, then women / men /
    # everyone for each age bracket
    support_paths = ["women_synth_noteasy.csv", "men_synth_noteasy.csv"]
    for age_range in [
            "84_92", "79_84", "74_79", "69_74", "64_69", "59_64", "54_59"
    ]:
        support_paths += [
            "age_%s_women_synth_noteasy.csv" % age_range,
            "age_%s_men_synth_noteasy.csv" % age_range,
            "age_%s_synth_noteasy.csv" % age_range,
        ]

    for path in support_paths:
        subgroup_support, _, _ = read_tadpole.load_csv_no_header(path)
        subgroup_support = preprocessing_dataset.str_to_float(subgroup_support)
        subgroup_support = subgroup_support * M_sup
        subgroup_support = sp.csr_matrix(subgroup_support, dtype=np.float32)
        support.append(subgroup_support)
        support_t.append(subgroup_support.T)

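    # stack all subgroup supports side by side; the same transposed support mask
    # Osupport_t is repeated once per support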
    num_support = len(support)
    mask_support_t = []
    Osupport_t = sp.csr_matrix(Osupport_t, dtype=np.int)
    for i in range(num_support):
        mask_support_t.append(Osupport_t.T)

    mask_support_t = sp.hstack(mask_support_t, format='csr')

    support = sp.hstack(support, format='csr')
    support_t = sp.hstack(support_t, format='csr')

    # Collect all user and item nodes for test set
    test_u = list(set(test_u_indices))
    test_v = list(set(test_v_indices))
    test_u_dict = {n: i for i, n in enumerate(test_u)}
    test_v_dict = {n: i for i, n in enumerate(test_v)}

    test_u_indices = np.array([test_u_dict[o] for o in test_u_indices])
    test_v_indices = np.array([test_v_dict[o] for o in test_v_indices])
    test_support = support[np.array(test_u)]
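    # column 563 of every 564-column support block is presumably the appended label
    # column; it is zeroed here (and for the validation support below) so that these
    # rows do not see their own labels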
    for i in range(test_support.shape[0]):
        for j in range(563, test_support.shape[1], 564):
            test_support[i, j] = 0.0
    test_support_t = sp.csr_matrix.multiply(support_t, mask_support_t)

    # Collect all user and item nodes for validation set
    val_u = list(set(val_u_indices))
    val_v = list(set(val_v_indices))
    val_u_dict = {n: i for i, n in enumerate(val_u)}
    val_v_dict = {n: i for i, n in enumerate(val_v)}

    val_u_indices = np.array([val_u_dict[o] for o in val_u_indices])
    val_v_indices = np.array([val_v_dict[o] for o in val_v_indices])
    val_support = support[np.array(val_u)]
    for i in range(val_support.shape[0]):
        for j in range(563, val_support.shape[1], 564):
            val_support[i, j] = 0.0
    val_support_t = sp.csr_matrix.multiply(support_t, mask_support_t)

    # Collect all user and item nodes for train set
    train_u = list(set(train_u_indices))
    train_v = list(set(train_v_indices))
    train_u_dict = {n: i for i, n in enumerate(train_u)}
    train_v_dict = {n: i for i, n in enumerate(train_v)}

    train_u_indices = np.array([train_u_dict[o] for o in train_u_indices])
    train_v_indices = np.array([train_v_dict[o] for o in train_v_indices])
    train_support = support[np.array(train_u)]
    train_support_t = sp.csr_matrix.multiply(support_t, mask_support_t)

    placeholders = {
        'u_features':
        tf.sparse_placeholder(tf.float32,
                              shape=np.array(u_features.shape,
                                             dtype=np.int64)),
        'v_features':
        tf.sparse_placeholder(tf.float32,
                              shape=np.array(v_features.shape,
                                             dtype=np.int64)),
        'u_features_nonzero':
        tf.placeholder(tf.int32, shape=()),
        'v_features_nonzero':
        tf.placeholder(tf.int32, shape=()),
        'labels':
        tf.placeholder(tf.float32, shape=(None, )),
        'indices_labels':
        tf.placeholder(tf.int32, shape=(None, )),
        'user_indices':
        tf.placeholder(tf.int32, shape=(None, )),
        'item_indices':
        tf.placeholder(tf.int32, shape=(None, )),
        'dropout':
        tf.placeholder_with_default(0., shape=()),
        'weight_decay':
        tf.placeholder_with_default(0., shape=()),
        'support':
        tf.sparse_placeholder(tf.float32, shape=(None, None)),
        'support_t':
        tf.sparse_placeholder(tf.float32, shape=(None, None)),
    }
    div = hidden[0] // num_support
    if hidden[0] % num_support != 0:
        print(
            """\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that
                      it can be evenly split in %d splits.\n""" %
            (hidden[0], num_support * div, num_support))
    hidden[0] = num_support * div

    # create model
    model = MG_GAE(placeholders,
                   input_dim=u_features.shape[1],
                   num_support=num_support,
                   hidden=hidden,
                   num_users=m,
                   num_items=n,
                   learning_rate=lr,
                   gamma=gamma,
                   beta=beta,
                   logging=True)

    # Convert sparse placeholders to tuples to construct feed_dict
    test_support = sparse_to_tuple(test_support)
    test_support_t = sparse_to_tuple(test_support_t)

    val_support = sparse_to_tuple(val_support)
    val_support_t = sparse_to_tuple(val_support_t)

    train_support = sparse_to_tuple(train_support)
    train_support_t = sparse_to_tuple(train_support_t)

    u_features = sparse_to_tuple(u_features)
    v_features = sparse_to_tuple(v_features)

    assert u_features[2][1] == v_features[2][
        1], 'Number of features of users and items must be the same!'

    num_features = u_features[2][1]
    u_features_nonzero = u_features[1].shape[0]
    v_features_nonzero = v_features[1].shape[0]

    indices_labels = [563] * train_labels.shape[0]
    indices_labels_val = [563] * val_labels.shape[0]
    indices_labels_test = [563] * test_labels.shape[0]

    # Feed_dicts for validation and test set stay constant over different update steps
    train_feed_dict = construct_feed_dict(placeholders, u_features, v_features,
                                          u_features_nonzero,
                                          v_features_nonzero, train_support,
                                          train_support_t, train_labels,
                                          indices_labels, train_u_indices,
                                          train_v_indices, 0.)
    # No dropout for validation and test runs
    val_feed_dict = construct_feed_dict(placeholders, u_features, v_features,
                                        u_features_nonzero, v_features_nonzero,
                                        val_support, val_support_t, val_labels,
                                        indices_labels_val, val_u_indices,
                                        val_v_indices, 0.)

    test_feed_dict = construct_feed_dict(placeholders, u_features, v_features,
                                         u_features_nonzero,
                                         v_features_nonzero, test_support,
                                         test_support_t, test_labels,
                                         indices_labels_test, test_u_indices,
                                         test_v_indices, 0.)

    # Collect all variables to be logged into summary
    merged_summary = tf.summary.merge_all()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    auc_train = []
    auc_test = []
    auc_val = []
    test_pred = []
    for epoch in range(NB_EPOCH):

        t = time.time()

        # Run single weight update
        outs = sess.run([
            model.training_op, model.loss, model.indices, model.labels,
            model.outputs, model.labels_class, model.classification,
            model.inputs, model.gcn_u, model.gcn_v, model.loss_frob,
            model.binary_entropy, model.u_inputs, model.v_inputs, model.weight,
            model.input_u, model.input_v, model.u_indices, model.v_indices
        ],
                        feed_dict=train_feed_dict)
        train_avg_loss = outs[1]
        label_train = outs[5]
        output_train = outs[6]

        fpr_train, tpr_train, thresholds_train = roc_curve(
            label_train, output_train, pos_label=label_train.max())
        roc_auc_train = auc(fpr_train, tpr_train)
        auc_train.append(roc_auc_train)

        val_avg_loss, val_classification, val_labels_corres = sess.run(
            [model.loss, model.classification, model.labels_class],
            feed_dict=val_feed_dict)  #test_feed_dict)#
        fpr_val, tpr_val, thresholds_train = roc_curve(
            val_labels_corres, val_classification, pos_label=label_train.max())
        roc_auc_val = auc(fpr_val, tpr_val)
        auc_val.append(roc_auc_val)

        test_avg_loss, test_classification, test_labels_corres = sess.run(
            [model.loss, model.classification, model.labels_class],
            feed_dict=test_feed_dict)
        fpr_test, tpr_test, thresholds_test = roc_curve(
            test_labels_corres,
            test_classification,
            pos_label=label_train.max())
        roc_auc_test = auc(fpr_test, tpr_test)
        auc_test.append(roc_auc_test)
        test_pred.append(test_classification)
        if VERBOSE:
            print("[*] Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(train_avg_loss), "train_auc=",
                  "{:.5f}".format(roc_auc_train), "val_loss=",
                  "{:.5f}".format(val_avg_loss), "val_auc=",
                  "{:.5f}".format(roc_auc_val), "\t\ttime=",
                  "{:.5f}".format(time.time() - t))
            print('test auc = ', roc_auc_test)

    sess.close()

    return auc_test, auc_train, auc_val
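

# Hypothetical usage sketch for run(): gamma, beta, hidden and lr below are
# illustrative assumptions, not the authors' tuned values (hidden[0] is rounded
# inside run() so that it splits evenly across the supports). It assumes the
# module-level data below and a VERBOSE flag have been defined.
def example_run(seed=0):
    return run(seed=seed,
               gamma=1e-3,
               beta=1e-3,
               hidden=[46],
               lr=0.001,
               NB_EPOCH=300)
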
age_file_path = 'ages_synthetic_data_noteasy.csv'
age_column, nrRowsa, nrColsa = preprocessing_dataset.load_csv_no_header_float(
    age_file_path)
age_column = age_column.reshape(-1)

labels = preprocessing_dataset.str_to_float(labels)
labels_save = labels.reshape(-1)

M_init = np.concatenate((M_str, labels), axis=1)
M_support = np.concatenate((M_str, labels), axis=1)

M = preprocessing_dataset.normalization_gae(M_init)
M_sup = preprocessing_dataset.normalization_for_supportreal(M_support)

seed = 0
training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
    0.8, M_str, seed, labels)
#create a training and test mask on the data
Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows,
                                            nrCols)
Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows,
                                        nrCols)

new_labels_train = np.copy(labels)
new_labels_train[idx_testing] = -1
#split train set into 4 parts to create a validation set
training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
    3, M_str, seed, new_labels_train)
Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows,
                                            nrCols)
Ovalidation = preprocessing_dataset.load_mask(validation_set_mask, M_str,
                                              nrRows, nrCols)

Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
Ocol = np.zeros((Otest.shape[0], 1))
Otest_support = np.concatenate((Otest, Ocol), axis=1)
Ovalidation_support = np.concatenate((Ovalidation, Ocol), axis=1)
Example #4
def running_one_time(validation, rank, l2_regu, gamma, gamma_H, gamma_W,
                     gamma_e, n, seed, cheby, n_conv_feat, lr, ord_row):
    """
    function to run the architecture one time
    Inputs:
       validation :  boolean, to include a validation set or not
       rank : rank of the SVD decomposition
       l2_regu: coefficient to use in front of the l2 regularization term. Default value = 1,
       gamma, gamma_H, gamma_W, gamma_e: coefficients in front of the other loss terms of the loss function,
       n: number of training iterations to run for this initialization; each iteration continues from the weights and biases of the previous one,
       seed: seed to use for the random sampling between training, testing and validation sets,
       cheby: boolean, use of a GCNN or a GCN layer. 0: GCN, 1: GCNN. Default value = 1,
       n_conv_feat: number of weights to use for the GCNN layer. Default value = 36,
       lr: learning rate. Default value = 0.001
       ord_row: int, number of Chebyshev polynomials to use for the GCNN
    Outputs:
       auc_test_list, auc_train_list, auc_val_list: lists of the AUC values on the test, train and validation sets for the n runs,
       pred_train_list, pred_test_list, pred_val_list: lists of the prediction values on the test, train and validation sets for the n runs,
       labels_test, labels_train, labels_val: labels of the test, train and validation sets
    """
    #initialization
    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)
    #create a training and test mask on the data
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str,
                                                nrRows, nrCols)
    Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows,
                                            nrCols)

    if validation:
        #split train set into 4 parts to create a validation set
        new_labels_train = np.copy(labels)
        new_labels_train[idx_testing] = -1
        training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
            3, M_str, seed, new_labels_train)
        Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str,
                                                    nrRows, nrCols)
        Ovalidation = preprocessing_dataset.load_mask(validation_set_mask,
                                                      M_str, nrRows, nrCols)
    else:
        validation_set_mask = []
        idx_validation = []

    Odata, _ = preprocessing_dataset.split_train_into_2(Otraining, seed)
    Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
    if validation:
        Oinitial = np.concatenate(
            (Odata + Otest + Ovalidation, training_set_mask), axis=1)
    else:
        Oinitial = np.concatenate((Odata + Otest, training_set_mask), axis=1)

    M_init_final = preprocessing_dataset.init_mask(Oinitial, M_init, labels)
    #apply SVD initially for detecting the main components of our initialization
    initial_W, initial_H = preprocessing_dataset.svd_W_H(M_init_final, rank)

    learning_obj = Train_test_matrix_completion(M,
                                                Lrow,
                                                Wrow,
                                                Otraining,
                                                initial_W,
                                                initial_H,
                                                labels,
                                                training_set_mask,
                                                testing_set_mask,
                                                validation_set_mask,
                                                mask_features,
                                                validation,
                                                order_chebyshev_row=ord_row,
                                                cheby=cheby,
                                                n_conv_feat=n_conv_feat,
                                                l2_regu=l2_regu,
                                                gamma=gamma,
                                                gamma_H=gamma_H,
                                                gamma_W=gamma_W,
                                                gamma_e=gamma_e,
                                                learning_rate=lr)

    num_iter_test = 10
    num_total_iter_training = n
    num_iter = 0

    list_training_loss = list()
    list_training_norm_grad = list()
    list_test_pred_error = list()

    list_test_times = list()
    list_grad_X = list()

    auc_train_list = []
    pred_train_list = []

    auc_val_list = []
    pred_val_list = []
    auc_test_list = []
    pred_test_list = []

    num_iter = 0
    for k in range(num_iter, num_total_iter_training):

        tic = time.time()
        list_of_outputs = learning_obj.session.run([
            learning_obj.optimizer, learning_obj.loss, learning_obj.norm_grad,
            learning_obj.prediction_train, learning_obj.labels_training,
            learning_obj.loss_frob, learning_obj.loss_trace_row, learning_obj.
            frob_norm_H, learning_obj.frob_norm_W, learning_obj.binary_entropy
        ] + learning_obj.var_grad)
        current_training_loss = list_of_outputs[1]
        norm_grad = list_of_outputs[2]
        pred_train = list_of_outputs[3]
        labels_train = list_of_outputs[4]
        loss_frob = list_of_outputs[5]
        loss_trace = list_of_outputs[6]
        loss_norm_H = list_of_outputs[7]
        loss_norm_W = list_of_outputs[8]
        loss_entropy = list_of_outputs[9]
        indexes_train = np.concatenate((idx_testing, idx_validation), axis=0)

        pred_train = np.delete(pred_train, indexes_train, 0)
        labels_train = np.delete(labels_train, indexes_train, 0)

        accuracy_train, tn_train, fn_train, tp_train, fp_train, spe_train, sen_train = preprocessing_dataset.accuracy_computation(
            pred_train, training_set_mask, labels)
        fpr_train, tpr_train, thresholds_train = roc_curve(
            labels_train, pred_train)
        roc_auc_train = auc(fpr_train, tpr_train)

        X_grad = list_of_outputs[10:]
        training_time = time.time() - tic

        list_training_loss.append(current_training_loss)
        list_training_norm_grad.append(norm_grad)

        if (np.mod(num_iter, num_iter_test) == 0):
            msg = "[TRN] iter = %03i, cost = %3.2e, |grad| = %.2e (%3.2es), accuracy = %3.2e, auc = %3.2e" \
                                        % (num_iter, list_training_loss[-1], list_training_norm_grad[-1], training_time, accuracy_train, roc_auc_train)
            print(msg)

            auc_train_list.append(roc_auc_train)
            pred_train_list.append(pred_train)

            #Test Code
            tic = time.time()
            pred_error, pred, labels_test = learning_obj.session.run([
                learning_obj.predictions_error, learning_obj.predictions,
                learning_obj.labels_test
            ])
            test_time = time.time() - tic

            if validation:
                pred_val, labels_val = learning_obj.session.run(
                    [learning_obj.predictions_val, learning_obj.labels_val])

                indexes_validation = np.concatenate(
                    (idx_training, idx_testing), axis=0)
                pred_val = np.delete(pred_val, indexes_validation, 0)
                labels_val = np.delete(labels_val, indexes_validation, 0)
                fpr, tpr, thresholds = roc_curve(labels_val, pred_val)
                roc_auc_val = auc(fpr, tpr)
                auc_val_list.append(roc_auc_val)
                pred_val_list.append(pred_val)
                msg = "[VAL] iter = %03i, AUC = %3.2e" % (num_iter,
                                                          roc_auc_val)
                print(msg)
            else:
                pred_val_list = []
                labels_val = []

            indexes_test = np.concatenate((idx_training, idx_validation),
                                          axis=0)

            pred = np.delete(pred, indexes_test, 0)
            labels_test = np.delete(labels_test, indexes_test, 0)

            list_test_pred_error.append(pred_error)

            accuracy, tn, fn, tp, fp, spe, sen = preprocessing_dataset.accuracy_computation(
                pred, testing_set_mask, labels)
            fpr, tpr, thresholds = roc_curve(labels_test, pred)
            roc_auc = auc(fpr, tpr)

            auc_test_list.append(roc_auc)
            pred_test_list.append(pred)

            msg = "[TST] iter = %03i, cost = %3.2e, Accuracy = %3.2e (%3.2es), AUC = %3.2e" % (
                num_iter, list_test_pred_error[-1], accuracy, test_time,
                roc_auc)
            print(msg)

        num_iter += 1
    return (auc_test_list, auc_train_list, auc_val_list, pred_train_list,
            pred_test_list, pred_val_list, labels_test, labels_train,
            labels_val)
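

# Hypothetical usage sketch for the function above (SVD-initialized variant):
# cheby, n_conv_feat, lr and l2_regu follow the docstring defaults, while rank,
# the gamma_* weights, ord_row and n are illustrative assumptions. It relies on
# the module-level data (M, M_str, M_init, labels, Lrow, Wrow, mask_features, ...).
def example_running_one_time_svd(seed=0, n=200):
    return running_one_time(validation=True,
                            rank=10,
                            l2_regu=1,
                            gamma=1e-4,
                            gamma_H=1e-4,
                            gamma_W=1e-4,
                            gamma_e=1e-4,
                            n=n,
                            seed=seed,
                            cheby=1,
                            n_conv_feat=36,
                            lr=0.001,
                            ord_row=5)
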
Example #5
def running_one_time(l2_regu, gamma_age, gamma_sex, gamma_agesex, gamma_W,
                     gamma_e, n, seed, cheby, n_conv_feat, dropout, lr):
    """
    function to run the architecture one time
    Inputs:
       l2_regu: coefficient to use in front of the l2 regularization term. Default value = 1,
       gamma_age, gamma_sex, gamma_agesex, gamma_W, gamma_e: coefficients in front of the other loss terms of the loss function,
       n: number of training iterations to run for this initialization; each iteration continues from the weights and biases of the previous one,
       seed: seed to use for the random sampling between training, testing and validation sets,
       cheby: boolean, use of a GCNN or a GCN layer. 0: GCN, 1: GCNN. Default value = 1,
       n_conv_feat: number of weights to use for the GCNN layer. Default value = 36,
       dropout: dropout rate on the GCNN output. Default = 0.5,
       lr: learning rate. Default value = 0.001
    Outputs:
       auc_test_list, auc_train_list, auc_val_list: lists of the AUC values on the test, train and validation sets for the n runs,
       pred_train_list, pred_test_list, pred_val_list: lists of the prediction values on the test, train and validation sets for the n runs,
       labels_test, labels_train, labels_val: labels of the test, train and validation sets
    """
    #initialization of the matrix, of the training, testing and validation sets
    training_set_mask, testing_set_mask, idx_training, idx_testing=preprocessing_dataset.split_train_test(0.8, M_str, seed, labels)
    #create a training and test mask on the data
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows, nrCols)
    #creation of a validation set from the training set. Split the training set into 4 parts, one for validation
    new_labels_train=np.copy(labels)
    new_labels_train[idx_testing]=-1
    training_set_mask, validation_set_mask, idx_training, idx_validation=preprocessing_dataset.split_train_validation_4(3, M_str, seed, new_labels_train)
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Ovalidation= preprocessing_dataset.load_mask(validation_set_mask, M_str, nrRows, nrCols)

    Odata, _=preprocessing_dataset.split_train_into_2(Otraining, seed)
    Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
    Oinitial=np.concatenate((Odata+Otest+Ovalidation, training_set_mask), axis=1)

    M_init_final=preprocessing_dataset.init_mask(Oinitial, M_init, labels)

    learning_obj = Train_test_matrix_completion(M, M_init_final, A_age, A_sex, A_sexage, mask_age, mask_sex, mask_agesex , mask_nosignificance, Lrow_age, Lrow_sex, Lrow_agesex, Otraining,
                                                    labels, training_set_mask, testing_set_mask, validation_set_mask, mask_features,
                                                    order_chebyshev_row = ord_row,cheby=cheby, n_conv_feat=n_conv_feat, l2_regu=l2_regu,dropout=dropout,
                                                    gamma_age=gamma_age, gamma_sex=gamma_sex, gamma_agesex=gamma_agesex, gamma_W=gamma_W, gamma_e=gamma_e, learning_rate=lr)

    num_iter_test = 10
    num_total_iter_training = n

    num_iter = 0

    list_training_loss = list()
    list_training_norm_grad = list()
    list_test_pred_error = list()

    auc_train_list=[]
    pred_train_list=[]
    auc_test_list=[]
    pred_test_list=[]
    auc_val_list=[]
    pred_val_list=[]

    for k in range(num_iter, num_total_iter_training):

        tic = time.time()
        # run of the algorithm on the training set
        list_of_outputs = learning_obj.session.run([
            learning_obj.optimizer, learning_obj.loss, learning_obj.norm_grad,
            learning_obj.prediction_train, learning_obj.labels_training,
            learning_obj.loss_frob, learning_obj.frob_norm_W,
            learning_obj.binary_entropy, learning_obj.loss_trace_row_age,
            learning_obj.loss_trace_row_agesex,
            learning_obj.loss_trace_row_sex
        ] + learning_obj.var_grad)
        current_training_loss = list_of_outputs[1]
        norm_grad = list_of_outputs[2]
        pred_train = list_of_outputs[3]
        labels_train =list_of_outputs[4]
        loss_frob = list_of_outputs[5]
        loss_frob_W = list_of_outputs[6]
        loss_entropy =list_of_outputs[7]
        loss_age =list_of_outputs[8]
        loss_agesex = list_of_outputs[9]
        loss_sex= list_of_outputs[10]

        indexes_train = np.concatenate((idx_testing, idx_validation), axis=0)

        pred_train = np.delete(pred_train, indexes_train,0)
        labels_train = np.delete(labels_train, indexes_train,0)

        accuracy_train, tn_train, fn_train, tp_train, fp_train, spe_train, sen_train=preprocessing_dataset.accuracy_computation(pred_train, training_set_mask, labels)
        fpr_train, tpr_train, thresholds_train=roc_curve(labels_train, pred_train)
        roc_auc_train = auc(fpr_train, tpr_train)

        # the gradients from var_grad start after the 11 fixed outputs listed above
        X_grad = list_of_outputs[11:]
        training_time = time.time() - tic

        list_training_loss.append(current_training_loss)
        list_training_norm_grad.append(norm_grad)

        if (np.mod(num_iter, num_iter_test)==0):
            msg = "[TRN] iter = %03i, cost = %3.2e, |grad| = %.2e (%3.2es), accuracy = %3.2e, auc = %3.2e" \
                                        % (num_iter, list_training_loss[-1], list_training_norm_grad[-1], training_time, accuracy_train, roc_auc_train)
            print(msg)

            auc_train_list.append(roc_auc_train)
            pred_train_list.append(pred_train)
            #Test Code
            tic = time.time()
            # run of the algorithm on the validation and test sets
            pred_error, pred, labels_test, pred_val, labels_val= learning_obj.session.run([learning_obj.predictions_error, learning_obj.predictions, learning_obj.labels_test,
                                                                     learning_obj.predictions_val, learning_obj.labels_val])
            test_time = time.time() - tic

            indexes_validation = np.concatenate((idx_training, idx_testing), axis=0)
            pred_val = np.delete(pred_val, indexes_validation,0)
            labels_val = np.delete(labels_val, indexes_validation,0)
            fpr, tpr, thresholds=roc_curve(labels_val, pred_val)
            roc_auc_val = auc(fpr, tpr)
            auc_val_list.append(roc_auc_val)
            pred_val_list.append(pred_val)
            msg =  "[VAL] iter = %03i, AUC = %3.2e" % (num_iter, roc_auc_val)
            print(msg)
            indexes_test = np.concatenate((idx_training, idx_validation), axis=0)

            pred = np.delete(pred, indexes_test,0)
            labels_test = np.delete(labels_test, indexes_test,0)
            list_test_pred_error.append(pred_error)

            accuracy, tn, fn, tp, fp, spe, sen= preprocessing_dataset.accuracy_computation(pred, testing_set_mask, labels)
            fpr, tpr, thresholds=roc_curve(labels_test, pred)
            roc_auc = auc(fpr, tpr)

            auc_test_list.append(roc_auc)
            pred_test_list.append(pred)

            msg =  "[TST] iter = %03i, cost = %3.2e, Accuracy = %3.2e (%3.2es), AUC = %3.2e" % (num_iter, list_test_pred_error[-1], accuracy, test_time, roc_auc)
            print(msg)

        num_iter += 1
    return (auc_test_list, auc_train_list, auc_val_list, pred_train_list, pred_test_list, pred_val_list, labels_test, labels_train, labels_val)
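

# Hypothetical usage sketch for the function above (age/sex multi-graph variant):
# cheby, n_conv_feat, dropout, lr and l2_regu follow the docstring defaults, while
# the gamma_* weights and n are illustrative assumptions. It relies on the
# module-level data (M, M_str, M_init, labels, the A_*/mask_*/Lrow_* matrices,
# mask_features, ord_row, ...).
def example_running_one_time_agesex(seed=0, n=200):
    return running_one_time(l2_regu=1,
                            gamma_age=1e-4,
                            gamma_sex=1e-4,
                            gamma_agesex=1e-4,
                            gamma_W=1e-4,
                            gamma_e=1e-4,
                            n=n,
                            seed=seed,
                            cheby=1,
                            n_conv_feat=36,
                            dropout=0.5,
                            lr=0.001)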