def running_one_time(n, dropout, seed, n_conv_feat, lr, l2_regu, nb_layers, ord_row):
    """Run the architecture one time.

    Inputs:
        n: number of training iterations; each iteration continues from the
            weights and biases of the previous one,
        dropout: dropout rate on the GCNN output. Default value = 0.5,
        seed: seed used for the random split into training, testing and validation sets,
        n_conv_feat: number of weights of the GCNN layer. Default value = 36,
        lr: learning rate. Default value = 0.001,
        l2_regu: coefficient of the l2 regularization term. Default value = 1,
        nb_layers: number of GCNN layers,
        ord_row: number of Chebyshev polynomials of the GCNN layers
    Outputs:
        auc_test_list, auc_train_list, auc_val_list: lists of the AUC values on the
            test, train and validation sets for the n iterations,
        pred_train_list, pred_test_list, pred_val_list: lists of the predictions on the
            test, train and validation sets for the n iterations,
        labels_test_reduce, labels_train_reduce, labels_val_reduce: labels of the
            test, train and validation sets
    """
    # initialization of the matrix and of the training, testing and validation sets
    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)
    new_labels_train = np.copy(labels)
    new_labels_train[idx_testing] = -1

    # creation of a validation set from the training set: split the training set into 4 parts, one for validation
    training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
        3, M_str, seed, new_labels_train)

    labels_train = labels * training_set_mask
    labels_test = labels * testing_set_mask
    labels_val = labels * validation_set_mask

    indexes_train = np.concatenate((idx_testing, idx_validation), axis=0)
    indexes_validation = np.concatenate((idx_training, idx_testing), axis=0)
    indexes_test = np.concatenate((idx_training, idx_validation), axis=0)

    labels_test_reduce = np.delete(labels_test, indexes_test, 0)
    labels_train_reduce = np.delete(labels_train, indexes_train, 0)
    labels_val_reduce = np.delete(labels_val, indexes_validation, 0)

    learning_obj = Train_test_matrix_completion(M, Lrow_age, Lrow_sex, Lrow_agesex,
                                                A_age, A_sex, A_sexage,
                                                mask_age, mask_sex, mask_agesex,
                                                mask_nosignificance,
                                                labels_train, labels_test, labels_val,
                                                testing_set_mask, training_set_mask,
                                                validation_set_mask,
                                                order_chebyshev_row=ord_row,
                                                n_conv_feat=n_conv_feat,
                                                dropout=dropout,
                                                learning_rate=lr,
                                                l2_regu=l2_regu,
                                                nb_layers=nb_layers)

    num_iter_test = 10
    num_total_iter_training = n
    num_iter = 0

    list_training_loss = list()
    list_training_norm_grad = list()
    list_test_pred_error = list()

    auc_train_list = []
    pred_train_list = []
    auc_test_list = []
    pred_test_list = []
    auc_val_list = []
    pred_val_list = []

    for k in range(num_iter, num_total_iter_training):
        tic = time.time()

        # run of the algorithm on the training set
        list_of_outputs = learning_obj.session.run(
            [learning_obj.optimizer, learning_obj.loss, learning_obj.norm_grad,
             learning_obj.classification_train, learning_obj.binary_entropy]
            + learning_obj.var_grad)
        # previously also monitored: learning_obj.loss_frob, learning_obj.loss_trace_row,
        # learning_obj.frob_norm_H, learning_obj.frob_norm_W, learning_obj.binary_entropy
        current_training_loss = list_of_outputs[1]
        norm_grad = list_of_outputs[2]
        pred_train = list_of_outputs[3]
        pred_train = np.delete(pred_train, indexes_train, 0)
        fpr_train, tpr_train, thresholds_train = roc_curve(labels_train_reduce, pred_train)
        roc_auc_train = auc(fpr_train, tpr_train)
        X_grad = list_of_outputs[5:]  # gradients returned after the five named tensors
        training_time = time.time() - tic

        list_training_loss.append(current_training_loss)
        list_training_norm_grad.append(norm_grad)

        if np.mod(num_iter, num_iter_test) == 0:
            msg = "[TRN] iter = %03i, cost = %3.2e, |grad| = %.2e (%3.2es), AUC = %3.2e" \
                % (num_iter, list_training_loss[-1], list_training_norm_grad[-1],
                   training_time, roc_auc_train)
            print(msg)

        auc_train_list.append(roc_auc_train)
        pred_train_list.append(pred_train)

        tic = time.time()
        # run of the algorithm on the validation set
        pred_val = learning_obj.session.run([learning_obj.classification_val])
        # test_time = time.time() - tic
        pred_val = np.delete(pred_val[0], indexes_validation, 0)
        fpr, tpr, thresholds = roc_curve(labels_val_reduce, pred_val)
        roc_auc_val = auc(fpr, tpr)
        auc_val_list.append(roc_auc_val)
        pred_val_list.append(pred_val)
        msg = "[VAL] iter = %03i, AUC = %3.2e" % (num_iter, roc_auc_val)
        print(msg)

        tic = time.time()
        # run of the algorithm on the test set
        pred_error, pred = learning_obj.session.run(
            [learning_obj.predictions_error, learning_obj.classification_test])
        # test_time = time.time() - tic
        pred = np.delete(pred, indexes_test, 0)
        list_test_pred_error.append(pred_error)
        fpr, tpr, thresholds = roc_curve(labels_test_reduce, pred)
        roc_auc = auc(fpr, tpr)
        auc_test_list.append(roc_auc)
        pred_test_list.append(pred)
        msg = "[TST] iter = %03i, cost = %3.2e, AUC = %3.2e" % (
            num_iter, list_test_pred_error[-1], roc_auc)
        print(msg)

        num_iter += 1

    return (auc_test_list, auc_train_list, auc_val_list, pred_train_list,
            pred_test_list, pred_val_list, labels_test_reduce, labels_train_reduce,
            labels_val_reduce)
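# Minimal usage sketch (an addition, not part of the original script): how a single run of
# this architecture might be launched and summarized. The values for n, nb_layers and
# ord_row are illustrative assumptions; dropout, n_conv_feat, lr and l2_regu use the
# defaults quoted in the docstring above.
def example_single_run_gcnn(seed=0):
    """Hypothetical helper: launch one run and report the iteration with the best validation AUC."""
    results = running_one_time(n=200, dropout=0.5, seed=seed, n_conv_feat=36,
                               lr=0.001, l2_regu=1, nb_layers=2, ord_row=5)
    auc_test_list, auc_train_list, auc_val_list = results[0], results[1], results[2]
    best_iter = int(np.argmax(auc_val_list))
    print("best val AUC = %.4f at iter %d, test AUC there = %.4f"
          % (auc_val_list[best_iter], best_iter, auc_test_list[best_iter]))
    return best_iter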
def run(seed, gamma, beta, hidden, lr, NB_EPOCH=300):
    """Main function. Run the architecture for the initialization defined by seed and by
    the hyperparameters gamma, beta, hidden, lr.

    Inputs:
        seed: seed that defines the training/testing/validation split,
        gamma, beta, hidden, lr: hyperparameters of the architecture,
        NB_EPOCH: number of training epochs. Default value = 300
    Outputs:
        auc_test, auc_train, auc_val: AUC values on the test, train and validation sets for each epoch
    """
    tf.reset_default_graph()

    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)

    # create a training and test mask on the data
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows, nrCols)

    new_labels_train = np.copy(labels)
    new_labels_train[idx_testing] = -1

    # split train set into 4 parts to create a validation set
    training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
        3, M_str, seed, new_labels_train)
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Ovalidation = preprocessing_dataset.load_mask(validation_set_mask, M_str, nrRows, nrCols)

    Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
    Ocol = np.zeros((Otest.shape[0], 1))
    Otest_support = np.concatenate((Otest, Ocol), axis=1)
    Ovalidation_support = np.concatenate((Ovalidation, Ocol), axis=1)
    Osupport_t = Otraining + Otest_support + Ovalidation_support
    Ovalidation = np.concatenate((Ovalidation, validation_set_mask), axis=1)
    Otest = np.concatenate((Otest, testing_set_mask), axis=1)

    (u_features, v_features, train_labels, train_u_indices, train_v_indices,
     val_labels, val_u_indices, val_v_indices,
     test_labels, test_u_indices, test_v_indices) = load_data_monti_tadpole(M, Otraining, Otest, Ovalidation)

    m, n = M.shape

    # global normalization: one support matrix per demographic subgroup
    # (sex, age range, and sex x age range), each masked by M_sup
    support = []
    support_t = []
    support_paths = [
        "women_synth_noteasy.csv",
        "men_synth_noteasy.csv",
        "age_84_92_women_synth_noteasy.csv",
        "age_84_92_men_synth_noteasy.csv",
        "age_84_92_synth_noteasy.csv",
        "age_79_84_women_synth_noteasy.csv",
        "age_79_84_men_synth_noteasy.csv",
        "age_79_84_synth_noteasy.csv",
        "age_74_79_women_synth_noteasy.csv",
        "age_74_79_men_synth_noteasy.csv",
        "age_74_79_synth_noteasy.csv",
        "age_69_74_women_synth_noteasy.csv",
        "age_69_74_men_synth_noteasy.csv",
        "age_69_74_synth_noteasy.csv",
        "age_64_69_women_synth_noteasy.csv",
        "age_64_69_men_synth_noteasy.csv",
        "age_64_69_synth_noteasy.csv",
        "age_59_64_women_synth_noteasy.csv",
        "age_59_64_men_synth_noteasy.csv",
        "age_59_64_synth_noteasy.csv",
        "age_54_59_women_synth_noteasy.csv",
        "age_54_59_men_synth_noteasy.csv",
        "age_54_59_synth_noteasy.csv",
    ]
    for path in support_paths:
        subgroup_support, _, _ = read_tadpole.load_csv_no_header(path)
        subgroup_support = preprocessing_dataset.str_to_float(subgroup_support)
        subgroup_support = subgroup_support * M_sup
        subgroup_support = sp.csr_matrix(subgroup_support, dtype=np.float32)
        support.append(subgroup_support)
        support_t.append(subgroup_support.T)

    num_support = len(support)

    mask_support_t = []
    Osupport_t = sp.csr_matrix(Osupport_t, dtype=np.int)
    for i in range(num_support):
        mask_support_t.append(Osupport_t.T)
    mask_support_t = sp.hstack(mask_support_t, format='csr')

    support = sp.hstack(support, format='csr')
    support_t = sp.hstack(support_t, format='csr')

    # Collect all user and item nodes for test set
    test_u = list(set(test_u_indices))
    test_v = list(set(test_v_indices))
    test_u_dict = {n: i for i, n in enumerate(test_u)}
    test_v_dict = {n: i for i, n in enumerate(test_v)}
    test_u_indices = np.array([test_u_dict[o] for o in test_u_indices])
    test_v_indices = np.array([test_v_dict[o] for o in test_v_indices])
    test_support = support[np.array(test_u)]
    # zero out the label column (index 563 of each 564-column support block) for the test rows
    for i in range(test_support.shape[0]):
        for j in range(563, test_support.shape[1], 564):
            test_support[i, j] = 0.0
    test_support_t = sp.csr_matrix.multiply(support_t, mask_support_t)

    # Collect all user and item nodes for validation set
    val_u = list(set(val_u_indices))
    val_v = list(set(val_v_indices))
    val_u_dict = {n: i for i, n in enumerate(val_u)}
    val_v_dict = {n: i for i, n in enumerate(val_v)}
    val_u_indices = np.array([val_u_dict[o] for o in val_u_indices])
    val_v_indices = np.array([val_v_dict[o] for o in val_v_indices])
    val_support = support[np.array(val_u)]
    # zero out the label column for the validation rows as well
    for i in range(val_support.shape[0]):
        for j in range(563, val_support.shape[1], 564):
            val_support[i, j] = 0.0
    val_support_t = sp.csr_matrix.multiply(support_t, mask_support_t)

    # Collect all user and item nodes for train set
    train_u = list(set(train_u_indices))
    train_v = list(set(train_v_indices))
    train_u_dict = {n: i for i, n in enumerate(train_u)}
    train_v_dict = {n: i for i, n in enumerate(train_v)}
    train_u_indices = np.array([train_u_dict[o] for o in train_u_indices])
    train_v_indices = np.array([train_v_dict[o] for o in train_v_indices])
    train_support = support[np.array(train_u)]
    train_support_t = sp.csr_matrix.multiply(support_t, mask_support_t)

    placeholders = {
        'u_features': tf.sparse_placeholder(tf.float32, shape=np.array(u_features.shape, dtype=np.int64)),
        'v_features': tf.sparse_placeholder(tf.float32, shape=np.array(v_features.shape, dtype=np.int64)),
        'u_features_nonzero': tf.placeholder(tf.int32, shape=()),
        'v_features_nonzero': tf.placeholder(tf.int32, shape=()),
        'labels': tf.placeholder(tf.float32, shape=(None,)),
        'indices_labels': tf.placeholder(tf.int32, shape=(None,)),
        'user_indices': tf.placeholder(tf.int32, shape=(None,)),
        'item_indices': tf.placeholder(tf.int32, shape=(None,)),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'weight_decay': tf.placeholder_with_default(0., shape=()),
        'support': tf.sparse_placeholder(tf.float32, shape=(None, None)),
        'support_t': tf.sparse_placeholder(tf.float32, shape=(None, None)),
    }

    div = hidden[0] // num_support
    if hidden[0] % num_support != 0:
        print("""\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that it can be evenly split in %d splits.\n"""
              % (hidden[0], num_support * div, num_support))
    hidden[0] = num_support * div

    # create model
    model = MG_GAE(placeholders,
                   input_dim=u_features.shape[1],
                   num_support=num_support,
                   hidden=hidden,
                   num_users=m,
                   num_items=n,
                   learning_rate=lr,
                   gamma=gamma,
                   beta=beta,
                   logging=True)

    # Convert sparse placeholders to tuples to construct feed_dict
    test_support = sparse_to_tuple(test_support)
    test_support_t = sparse_to_tuple(test_support_t)
    val_support = sparse_to_tuple(val_support)
    val_support_t = sparse_to_tuple(val_support_t)
    train_support = sparse_to_tuple(train_support)
    train_support_t = sparse_to_tuple(train_support_t)
    u_features = sparse_to_tuple(u_features)
    v_features = sparse_to_tuple(v_features)
    assert u_features[2][1] == v_features[2][1], 'Number of features of users and items must be the same!'

    num_features = u_features[2][1]
    u_features_nonzero = u_features[1].shape[0]
    v_features_nonzero = v_features[1].shape[0]

    indices_labels = [563] * train_labels.shape[0]
    indices_labels_val = [563] * val_labels.shape[0]
    indices_labels_test = [563] * test_labels.shape[0]

    # Feed_dicts for validation and test set stay constant over different update steps
    train_feed_dict = construct_feed_dict(placeholders, u_features, v_features,
                                          u_features_nonzero, v_features_nonzero,
                                          train_support, train_support_t,
                                          train_labels, indices_labels,
                                          train_u_indices, train_v_indices, 0.)
    # No dropout for validation and test runs
    val_feed_dict = construct_feed_dict(placeholders, u_features, v_features,
                                        u_features_nonzero, v_features_nonzero,
                                        val_support, val_support_t,
                                        val_labels, indices_labels_val,
                                        val_u_indices, val_v_indices, 0.)
    test_feed_dict = construct_feed_dict(placeholders, u_features, v_features,
                                         u_features_nonzero, v_features_nonzero,
                                         test_support, test_support_t,
                                         test_labels, indices_labels_test,
                                         test_u_indices, test_v_indices, 0.)

    # Collect all variables to be logged into summary
    merged_summary = tf.summary.merge_all()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    auc_train = []
    auc_test = []
    auc_val = []
    test_pred = []

    for epoch in range(NB_EPOCH):
        t = time.time()

        # Run single weight update
        outs = sess.run([model.training_op, model.loss, model.indices, model.labels,
                         model.outputs, model.labels_class, model.classification,
                         model.inputs, model.gcn_u, model.gcn_v, model.loss_frob,
                         model.binary_entropy, model.u_inputs, model.v_inputs,
                         model.weight, model.input_u, model.input_v,
                         model.u_indices, model.v_indices],
                        feed_dict=train_feed_dict)
        train_avg_loss = outs[1]
        label_train = outs[5]
        output_train = outs[6]
        fpr_train, tpr_train, thresholds_train = roc_curve(label_train, output_train,
                                                           pos_label=label_train.max())
        roc_auc_train = auc(fpr_train, tpr_train)
        auc_train.append(roc_auc_train)

        val_avg_loss, val_classification, val_labels_corres = sess.run(
            [model.loss, model.classification, model.labels_class],
            feed_dict=val_feed_dict)
        fpr_val, tpr_val, thresholds_val = roc_curve(val_labels_corres, val_classification,
                                                     pos_label=label_train.max())
        roc_auc_val = auc(fpr_val, tpr_val)
        auc_val.append(roc_auc_val)

        test_avg_loss, test_classification, test_labels_corres = sess.run(
            [model.loss, model.classification, model.labels_class],
            feed_dict=test_feed_dict)
        fpr_test, tpr_test, thresholds_test = roc_curve(test_labels_corres, test_classification,
                                                        pos_label=label_train.max())
        roc_auc_test = auc(fpr_test, tpr_test)
        auc_test.append(roc_auc_test)
        test_pred.append(test_classification)

        if VERBOSE:
            print("[*] Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(train_avg_loss),
                  "train_auc=", "{:.5f}".format(roc_auc_train),
                  "val_loss=", "{:.5f}".format(val_avg_loss),
                  "val_auc=", "{:.5f}".format(roc_auc_val),
                  "\t\ttime=", "{:.5f}".format(time.time() - t))
            print('test auc = ', roc_auc_test)

    sess.close()

    return auc_test, auc_train, auc_val
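# Minimal usage sketch (an addition, not part of the original file): one call of run() for a
# single split. The gamma, beta, hidden and lr values below are illustrative placeholders,
# not tuned settings; hidden[0] is adjusted inside run() so that it splits evenly across the
# num_support demographic graphs.
def example_run_mg_gae(seed=0):
    """Hypothetical helper: train the MG_GAE model once and report the last-epoch AUCs."""
    auc_test, auc_train, auc_val = run(seed, gamma=0.5, beta=0.5,
                                       hidden=[92, 10], lr=0.001, NB_EPOCH=300)
    print("final epoch: train AUC = %.4f, val AUC = %.4f, test AUC = %.4f"
          % (auc_train[-1], auc_val[-1], auc_test[-1]))
    return auc_test, auc_train, auc_val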
age_file_path = 'ages_synthetic_data_noteasy.csv'
age_column, nrRowsa, nrColsa = preprocessing_dataset.load_csv_no_header_float(age_file_path)
age_column = age_column.reshape(-1)

labels = preprocessing_dataset.str_to_float(labels)
labels_save = labels.reshape(-1)

M_init = np.concatenate((M_str, labels), axis=1)
M_support = np.concatenate((M_str, labels), axis=1)
M = preprocessing_dataset.normalization_gae(M_init)
M_sup = preprocessing_dataset.normalization_for_supportreal(M_support)

seed = 0
training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
    0.8, M_str, seed, labels)

# create a training and test mask on the data
Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows, nrCols)

new_labels_train = np.copy(labels)
new_labels_train[idx_testing] = -1

# split train set into 4 parts to create a validation set
training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
    3, M_str, seed, new_labels_train)
Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
Ovalidation = preprocessing_dataset.load_mask(validation_set_mask, M_str, nrRows, nrCols)

Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
Ocol = np.zeros((Otest.shape[0], 1))
Otest_support = np.concatenate((Otest, Ocol), axis=1)
Ovalidation_support = np.concatenate((Ovalidation, Ocol), axis=1)
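# Optional sanity check (an addition, not in the original script): run() adds these three
# support masks element-wise, so they must share the same shape; this fails early with a
# clear message if the preprocessing above changes.
assert Otraining.shape == Otest_support.shape == Ovalidation_support.shape, \
    "training/test/validation support masks must have identical shapes"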
def running_one_time(validation, rank, l2_regu, gamma, gamma_H, gamma_W, gamma_e, n, seed, cheby, n_conv_feat, lr, ord_row):
    """Run the architecture one time.

    Inputs:
        validation: boolean, whether to include a validation set,
        rank: rank of the SVD decomposition,
        l2_regu: coefficient of the l2 regularization term. Default value = 1,
        gamma, gamma_H, gamma_W, gamma_e: coefficients of the other terms of the loss function,
        n: number of training iterations; each iteration continues from the weights and
            biases of the previous one,
        seed: seed used for the random split into training, testing and validation sets,
        cheby: boolean, use of a GCN or a GCNN layer. 0: GCN, 1: GCNN. Default value = 1,
        n_conv_feat: number of weights of the GCNN layer. Default value = 36,
        lr: learning rate. Default value = 0.001,
        ord_row: int, number of Chebyshev polynomials of the GCNN
    Outputs:
        auc_test_list, auc_train_list, auc_val_list: lists of the AUC values on the test,
            train and validation sets for the n iterations,
        pred_train_list, pred_test_list, pred_val_list: lists of the predictions on the
            test, train and validation sets for the n iterations,
        labels_test, labels_train, labels_val: labels of the test, train and validation sets
    """
    # initialization
    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)

    # create a training and test mask on the data
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows, nrCols)

    if validation:
        # split train set into 4 parts to create a validation set
        new_labels_train = np.copy(labels)
        new_labels_train[idx_testing] = -1
        training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
            3, M_str, seed, new_labels_train)
        Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
        Ovalidation = preprocessing_dataset.load_mask(validation_set_mask, M_str, nrRows, nrCols)
    else:
        validation_set_mask = []
        idx_validation = []

    Odata, _ = preprocessing_dataset.split_train_into_2(Otraining, seed)
    Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
    if validation:
        Oinitial = np.concatenate((Odata + Otest + Ovalidation, training_set_mask), axis=1)
    else:
        Oinitial = np.concatenate((Odata + Otest, training_set_mask), axis=1)
    M_init_final = preprocessing_dataset.init_mask(Oinitial, M_init, labels)

    # apply SVD initially to detect the main components of our initialization
    initial_W, initial_H = preprocessing_dataset.svd_W_H(M_init_final, rank)

    learning_obj = Train_test_matrix_completion(M, Lrow, Wrow, Otraining,
                                                initial_W, initial_H,
                                                labels, training_set_mask,
                                                testing_set_mask, validation_set_mask,
                                                mask_features, validation,
                                                order_chebyshev_row=ord_row,
                                                cheby=cheby,
                                                n_conv_feat=n_conv_feat,
                                                l2_regu=l2_regu,
                                                gamma=gamma,
                                                gamma_H=gamma_H,
                                                gamma_W=gamma_W,
                                                gamma_e=gamma_e,
                                                learning_rate=lr)

    num_iter_test = 10
    num_total_iter_training = n
    num_iter = 0

    list_training_loss = list()
    list_training_norm_grad = list()
    list_test_pred_error = list()
    list_test_times = list()
    list_grad_X = list()

    auc_train_list = []
    pred_train_list = []
    auc_val_list = []
    pred_val_list = []
    auc_test_list = []
    pred_test_list = []

    for k in range(num_iter, num_total_iter_training):
        tic = time.time()

        # run of the algorithm on the training set
        list_of_outputs = learning_obj.session.run(
            [learning_obj.optimizer, learning_obj.loss, learning_obj.norm_grad,
             learning_obj.prediction_train, learning_obj.labels_training,
             learning_obj.loss_frob, learning_obj.loss_trace_row,
             learning_obj.frob_norm_H, learning_obj.frob_norm_W,
             learning_obj.binary_entropy] + learning_obj.var_grad)
        current_training_loss = list_of_outputs[1]
        norm_grad = list_of_outputs[2]
        pred_train = list_of_outputs[3]
        labels_train = list_of_outputs[4]
        loss_frob = list_of_outputs[5]
        loss_trace = list_of_outputs[6]
        loss_norm_H = list_of_outputs[7]
        loss_norm_W = list_of_outputs[8]
        loss_entropy = list_of_outputs[9]

        indexes_train = np.concatenate((idx_testing, idx_validation), axis=0)
        pred_train = np.delete(pred_train, indexes_train, 0)
        labels_train = np.delete(labels_train, indexes_train, 0)
        accuracy_train, tn_train, fn_train, tp_train, fp_train, spe_train, sen_train = preprocessing_dataset.accuracy_computation(
            pred_train, training_set_mask, labels)
        fpr_train, tpr_train, thresholds_train = roc_curve(labels_train, pred_train)
        roc_auc_train = auc(fpr_train, tpr_train)
        X_grad = list_of_outputs[10:]  # gradients returned after the ten named tensors
        training_time = time.time() - tic

        list_training_loss.append(current_training_loss)
        list_training_norm_grad.append(norm_grad)

        if np.mod(num_iter, num_iter_test) == 0:
            msg = "[TRN] iter = %03i, cost = %3.2e, |grad| = %.2e (%3.2es), accuracy = %3.2e, auc = %3.2e" \
                % (num_iter, list_training_loss[-1], list_training_norm_grad[-1],
                   training_time, accuracy_train, roc_auc_train)
            print(msg)

        auc_train_list.append(roc_auc_train)
        pred_train_list.append(pred_train)

        # test code
        tic = time.time()
        pred_error, pred, labels_test = learning_obj.session.run(
            [learning_obj.predictions_error, learning_obj.predictions,
             learning_obj.labels_test])
        test_time = time.time() - tic

        if validation:
            pred_val, labels_val = learning_obj.session.run(
                [learning_obj.predictions_val, learning_obj.labels_val])
            indexes_validation = np.concatenate((idx_training, idx_testing), axis=0)
            pred_val = np.delete(pred_val, indexes_validation, 0)
            labels_val = np.delete(labels_val, indexes_validation, 0)
            fpr, tpr, thresholds = roc_curve(labels_val, pred_val)
            roc_auc_val = auc(fpr, tpr)
            auc_val_list.append(roc_auc_val)
            pred_val_list.append(pred_val)
            msg = "[VAL] iter = %03i, AUC = %3.2e" % (num_iter, roc_auc_val)
            print(msg)
        else:
            pred_val_list = []
            labels_val = []

        indexes_test = np.concatenate((idx_training, idx_validation), axis=0)
        pred = np.delete(pred, indexes_test, 0)
        labels_test = np.delete(labels_test, indexes_test, 0)
        list_test_pred_error.append(pred_error)
        accuracy, tn, fn, tp, fp, spe, sen = preprocessing_dataset.accuracy_computation(
            pred, testing_set_mask, labels)
        fpr, tpr, thresholds = roc_curve(labels_test, pred)
        roc_auc = auc(fpr, tpr)
        auc_test_list.append(roc_auc)
        pred_test_list.append(pred)
        msg = "[TST] iter = %03i, cost = %3.2e, Accuracy = %3.2e (%3.2es), AUC = %3.2e" % (
            num_iter, list_test_pred_error[-1], accuracy, test_time, roc_auc)
        print(msg)

        num_iter += 1

    return (auc_test_list, auc_train_list, auc_val_list, pred_train_list,
            pred_test_list, pred_val_list, labels_test, labels_train, labels_val)
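# Minimal usage sketch (an addition, not part of the original file): one run with a validation
# split. The rank, gamma*, n and ord_row values are illustrative assumptions; with
# validation=False the validation lists in the returned tuple stay empty.
def example_run_with_validation(seed=0):
    """Hypothetical helper: one SVD-initialized run, returning the per-iteration AUC lists."""
    out = running_one_time(validation=True, rank=10, l2_regu=1,
                           gamma=1e-3, gamma_H=1e-3, gamma_W=1e-3, gamma_e=1.,
                           n=200, seed=seed, cheby=1, n_conv_feat=36,
                           lr=0.001, ord_row=5)
    auc_test_list, auc_train_list, auc_val_list = out[0], out[1], out[2]
    return auc_test_list, auc_train_list, auc_val_list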
def running_one_time(l2_regu, gamma_age, gamma_sex, gamma_agesex, gamma_W, gamma_e, n, seed, cheby, n_conv_feat, dropout, lr):
    """Run the architecture one time.

    Inputs:
        l2_regu: coefficient of the l2 regularization term. Default value = 1,
        gamma_age, gamma_sex, gamma_agesex, gamma_W, gamma_e: coefficients of the other
            terms of the loss function,
        n: number of training iterations; each iteration continues from the weights and
            biases of the previous one,
        seed: seed used for the random split into training, testing and validation sets,
        cheby: boolean, use of a GCN or a GCNN layer. 0: GCN, 1: GCNN. Default value = 1,
        n_conv_feat: number of weights of the GCNN layer. Default value = 36,
        dropout: dropout rate on the GCNN output. Default value = 0.5,
        lr: learning rate. Default value = 0.001
    Outputs:
        auc_test_list, auc_train_list, auc_val_list: lists of the AUC values on the test,
            train and validation sets for the n iterations,
        pred_train_list, pred_test_list, pred_val_list: lists of the predictions on the
            test, train and validation sets for the n iterations,
        labels_test, labels_train, labels_val: labels of the test, train and validation sets
    """
    # initialization of the matrix and of the training, testing and validation sets
    training_set_mask, testing_set_mask, idx_training, idx_testing = preprocessing_dataset.split_train_test(
        0.8, M_str, seed, labels)

    # create a training and test mask on the data
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Otest = preprocessing_dataset.load_mask(testing_set_mask, M_str, nrRows, nrCols)

    # creation of a validation set from the training set: split the training set into 4 parts, one for validation
    new_labels_train = np.copy(labels)
    new_labels_train[idx_testing] = -1
    training_set_mask, validation_set_mask, idx_training, idx_validation = preprocessing_dataset.split_train_validation_4(
        3, M_str, seed, new_labels_train)
    Otraining = preprocessing_dataset.load_mask(training_set_mask, M_str, nrRows, nrCols)
    Ovalidation = preprocessing_dataset.load_mask(validation_set_mask, M_str, nrRows, nrCols)

    Odata, _ = preprocessing_dataset.split_train_into_2(Otraining, seed)
    Otraining = np.concatenate((Otraining, training_set_mask), axis=1)
    Oinitial = np.concatenate((Odata + Otest + Ovalidation, training_set_mask), axis=1)
    M_init_final = preprocessing_dataset.init_mask(Oinitial, M_init, labels)

    # ord_row is not a parameter of this function: it is taken from module scope here
    learning_obj = Train_test_matrix_completion(M, M_init_final, A_age, A_sex, A_sexage,
                                                mask_age, mask_sex, mask_agesex,
                                                mask_nosignificance,
                                                Lrow_age, Lrow_sex, Lrow_agesex,
                                                Otraining, labels,
                                                training_set_mask, testing_set_mask,
                                                validation_set_mask, mask_features,
                                                order_chebyshev_row=ord_row,
                                                cheby=cheby,
                                                n_conv_feat=n_conv_feat,
                                                l2_regu=l2_regu,
                                                dropout=dropout,
                                                gamma_age=gamma_age,
                                                gamma_sex=gamma_sex,
                                                gamma_agesex=gamma_agesex,
                                                gamma_W=gamma_W,
                                                gamma_e=gamma_e,
                                                learning_rate=lr)

    num_iter_test = 10
    num_total_iter_training = n
    num_iter = 0

    list_training_loss = list()
    list_training_norm_grad = list()
    list_test_pred_error = list()

    auc_train_list = []
    pred_train_list = []
    auc_test_list = []
    pred_test_list = []
    auc_val_list = []
    pred_val_list = []

    for k in range(num_iter, num_total_iter_training):
        tic = time.time()

        # run of the algorithm on the training set
        list_of_outputs = learning_obj.session.run(
            [learning_obj.optimizer, learning_obj.loss, learning_obj.norm_grad,
             learning_obj.prediction_train, learning_obj.labels_training,
             learning_obj.loss_frob, learning_obj.frob_norm_W,
             learning_obj.binary_entropy, learning_obj.loss_trace_row_age,
             learning_obj.loss_trace_row_agesex, learning_obj.loss_trace_row_sex]
            + learning_obj.var_grad)
        current_training_loss = list_of_outputs[1]
        norm_grad = list_of_outputs[2]
        pred_train = list_of_outputs[3]
        labels_train = list_of_outputs[4]
        loss_frob = list_of_outputs[5]
        loss_frob_W = list_of_outputs[6]
        loss_entropy = list_of_outputs[7]
        loss_age = list_of_outputs[8]
        loss_agesex = list_of_outputs[9]
        loss_sex = list_of_outputs[10]

        indexes_train = np.concatenate((idx_testing, idx_validation), axis=0)
        pred_train = np.delete(pred_train, indexes_train, 0)
        labels_train = np.delete(labels_train, indexes_train, 0)
        accuracy_train, tn_train, fn_train, tp_train, fp_train, spe_train, sen_train = preprocessing_dataset.accuracy_computation(
            pred_train, training_set_mask, labels)
        fpr_train, tpr_train, thresholds_train = roc_curve(labels_train, pred_train)
        roc_auc_train = auc(fpr_train, tpr_train)
        X_grad = list_of_outputs[11:]  # gradients returned after the eleven named tensors
        training_time = time.time() - tic

        list_training_loss.append(current_training_loss)
        list_training_norm_grad.append(norm_grad)

        if np.mod(num_iter, num_iter_test) == 0:
            msg = "[TRN] iter = %03i, cost = %3.2e, |grad| = %.2e (%3.2es), accuracy = %3.2e, auc = %3.2e" \
                % (num_iter, list_training_loss[-1], list_training_norm_grad[-1],
                   training_time, accuracy_train, roc_auc_train)
            print(msg)

        auc_train_list.append(roc_auc_train)
        pred_train_list.append(pred_train)

        # test code
        tic = time.time()
        # run of the algorithm on the validation and test sets
        pred_error, pred, labels_test, pred_val, labels_val = learning_obj.session.run(
            [learning_obj.predictions_error, learning_obj.predictions,
             learning_obj.labels_test, learning_obj.predictions_val,
             learning_obj.labels_val])
        test_time = time.time() - tic

        indexes_validation = np.concatenate((idx_training, idx_testing), axis=0)
        pred_val = np.delete(pred_val, indexes_validation, 0)
        labels_val = np.delete(labels_val, indexes_validation, 0)
        fpr, tpr, thresholds = roc_curve(labels_val, pred_val)
        roc_auc_val = auc(fpr, tpr)
        auc_val_list.append(roc_auc_val)
        pred_val_list.append(pred_val)
        msg = "[VAL] iter = %03i, AUC = %3.2e" % (num_iter, roc_auc_val)
        print(msg)

        indexes_test = np.concatenate((idx_training, idx_validation), axis=0)
        pred = np.delete(pred, indexes_test, 0)
        labels_test = np.delete(labels_test, indexes_test, 0)
        list_test_pred_error.append(pred_error)
        accuracy, tn, fn, tp, fp, spe, sen = preprocessing_dataset.accuracy_computation(
            pred, testing_set_mask, labels)
        fpr, tpr, thresholds = roc_curve(labels_test, pred)
        roc_auc = auc(fpr, tpr)
        auc_test_list.append(roc_auc)
        pred_test_list.append(pred)
        msg = "[TST] iter = %03i, cost = %3.2e, Accuracy = %3.2e (%3.2es), AUC = %3.2e" % (
            num_iter, list_test_pred_error[-1], accuracy, test_time, roc_auc)
        print(msg)

        num_iter += 1

    return (auc_test_list, auc_train_list, auc_val_list, pred_train_list,
            pred_test_list, pred_val_list, labels_test, labels_train, labels_val)
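# Small reporting helper (an addition, not part of the original code): the running_one_time
# variants above all return per-iteration AUC lists, so a common way to report a run is the
# test AUC at the iteration with the highest validation AUC.
def report_best_iteration(auc_test_list, auc_val_list):
    """Return (best_iteration, val_auc, test_auc), selected on the validation AUC."""
    best_iter = int(np.argmax(auc_val_list))
    return best_iter, auc_val_list[best_iter], auc_test_list[best_iter]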