num_side_features = 0 # feature loading if not FEATURES: u_features = sp.identity(num_users, format='csr') v_features = sp.identity(num_items, format='csr') u_features, v_features = preprocess_user_item_features( u_features, v_features) elif FEATURES and u_features is not None and v_features is not None: # use features as side information and node_id's as node input features print("Normalizing feature vectors...") u_features_side = normalize_features(u_features) v_features_side = normalize_features(v_features) u_features_side, v_features_side = preprocess_user_item_features( u_features_side, v_features_side) u_features_side = np.array(u_features_side.todense(), dtype=np.float32) v_features_side = np.array(v_features_side.todense(), dtype=np.float32) num_side_features = u_features_side.shape[1] # node id's for node input features id_csr_v = sp.identity(num_items, format='csr') id_csr_u = sp.identity(num_users, format='csr') u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v)
def run(user_features, movie_features, learning_rate=0.01, epochs=500, hidden=[500, 75], feat_hidden=64, accumulation='sum', dropout=0.7, num_basis_functions=2, features=False, symmetric=True, testing=True): """accumulation can be sum or stack""" # Set random seed # seed = 123 # use only for unit testing seed = int(time.time()) np.random.seed(seed) tf.set_random_seed(seed) tf.reset_default_graph() # Settings # ap = argparse.ArgumentParser() # # ap.add_argument("-d", "--dataset", type=str, default="ml_100k", # # choices=['ml_100k', 'ml_1m', 'ml_10m', 'douban', 'yahoo_music', 'flixster'], # # help="Dataset string.") # ap.add_argument("-lr", "--learning_rate", type=float, default=0.01, # help="Learning rate") # ap.add_argument("-e", "--epochs", type=int, default=2500, # help="Number training epochs") # ap.add_argument("-hi", "--hidden", type=int, nargs=2, default=[500, 75], # help="Number hidden units in 1st and 2nd layer") # ap.add_argument("-fhi", "--feat_hidden", type=int, default=64, # help="Number hidden units in the dense layer for features") # ap.add_argument("-ac", "--accumulation", type=str, default="sum", choices=['sum', 'stack'], # help="Accumulation function: sum or stack.") # ap.add_argument("-do", "--dropout", type=float, default=0.7, # help="Dropout fraction") # ap.add_argument("-nb", "--num_basis_functions", type=int, default=2, # help="Number of basis functions for Mixture Model GCN.") # ap.add_argument("-ds", "--data_seed", type=int, default=1234, # help="""Seed used to shuffle data in data_utils, taken from cf-nade (1234, 2341, 3412, 4123, 1324). # Only used for ml_1m and ml_10m datasets. """) # ap.add_argument("-sdir", "--summaries_dir", type=str, default='logs/' + str(datetime.datetime.now()).replace(' ', '_'), # help="Directory for saving tensorflow summaries.") # # Boolean flags # fp = ap.add_mutually_exclusive_group(required=False) # fp.add_argument('-nsym', '--norm_symmetric', dest='norm_symmetric', # help="Option to turn on symmetric global normalization", action='store_true') # fp.add_argument('-nleft', '--norm_left', dest='norm_symmetric', # help="Option to turn on left global normalization", action='store_false') # ap.set_defaults(norm_symmetric=True) # fp = ap.add_mutually_exclusive_group(required=False) # fp.add_argument('-f', '--features', dest='features', # help="Whether to use features (1) or not (0)", action='store_true') # fp.add_argument('-no_f', '--no_features', dest='features', # help="Whether to use features (1) or not (0)", action='store_false') # ap.set_defaults(features=False) # fp = ap.add_mutually_exclusive_group(required=False) # fp.add_argument('-ws', '--write_summary', dest='write_summary', # help="Option to turn on summary writing", action='store_true') # fp.add_argument('-no_ws', '--no_write_summary', dest='write_summary', # help="Option to turn off summary writing", action='store_false') # ap.set_defaults(write_summary=False) # fp = ap.add_mutually_exclusive_group(required=False) # fp.add_argument('-t', '--testing', dest='testing', # help="Option to turn on test set evaluation", action='store_true') # fp.add_argument('-v', '--validation', dest='testing', # help="Option to only use validation set evaluation", action='store_false') # ap.set_defaults(testing=False) # args = vars(ap.parse_args()) # print('Settings:') # print(args, '\n') # Define parameters DATASET = 'ml_100k' DATASEED = 1234 NB_EPOCH = epochs DO = dropout HIDDEN = hidden FEATHIDDEN = feat_hidden BASES = num_basis_functions LR = learning_rate WRITESUMMARY = False SUMMARIESDIR = 'logs/' + str(datetime.datetime.now()).replace(' ', '_') FEATURES = features SYM = symmetric TESTING = testing ACCUM = accumulation SELFCONNECTIONS = False SPLITFROMFILE = True VERBOSE = True NUMCLASSES = 5 # Splitting dataset in training, validation and test set print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...") u_features = user_features v_features = movie_features _, _, adj_train, train_labels, train_u_indices, train_v_indices, \ val_labels, val_u_indices, val_v_indices, test_labels, \ test_u_indices, test_v_indices, class_values = load_official_trainvaltest_split('ml_100k', TESTING) num_users, num_items = adj_train.shape num_side_features = 0 # feature loading if not FEATURES: u_features = sp.identity(num_users, format='csr') v_features = sp.identity(num_items, format='csr') u_features, v_features = preprocess_user_item_features(u_features, v_features) elif FEATURES and u_features is not None and v_features is not None: # use features as side information and node_id's as node input features print("Normalizing feature vectors...") u_features_side = normalize_features(u_features) v_features_side = normalize_features(v_features) u_features_side, v_features_side = preprocess_user_item_features(u_features_side, v_features_side) u_features_side = np.array(u_features_side.todense(), dtype=np.float32) v_features_side = np.array(v_features_side.todense(), dtype=np.float32) num_side_features = u_features_side.shape[1] # node id's for node input features id_csr_v = sp.identity(num_items, format='csr') id_csr_u = sp.identity(num_users, format='csr') u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v) else: raise ValueError('Features flag is set to true but no features are loaded from dataset ' + DATASET) # global normalization support = [] support_t = [] adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32) for i in range(NUMCLASSES): # build individual binary rating matrices (supports) for each rating support_unnormalized = sp.csr_matrix(adj_train_int == i + 1, dtype=np.float32) if support_unnormalized.nnz == 0 and DATASET != 'yahoo_music': # yahoo music has dataset split with not all ratings types present in training set. # this produces empty adjacency matrices for these ratings. sys.exit('ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!') support_unnormalized_transpose = support_unnormalized.T support.append(support_unnormalized) support_t.append(support_unnormalized_transpose) support = globally_normalize_bipartite_adjacency(support, symmetric=SYM) support_t = globally_normalize_bipartite_adjacency(support_t, symmetric=SYM) if SELFCONNECTIONS: support.append(sp.identity(u_features.shape[0], format='csr')) support_t.append(sp.identity(v_features.shape[0], format='csr')) num_support = len(support) support = sp.hstack(support, format='csr') support_t = sp.hstack(support_t, format='csr') if ACCUM == 'stack': div = HIDDEN[0] // num_support if HIDDEN[0] % num_support != 0: print("""\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that it can be evenly split in %d splits.\n""" % (HIDDEN[0], num_support * div, num_support)) HIDDEN[0] = num_support * div # Collect all user and item nodes for test set test_u = list(set(test_u_indices)) test_v = list(set(test_v_indices)) test_u_dict = {n: i for i, n in enumerate(test_u)} test_v_dict = {n: i for i, n in enumerate(test_v)} test_u_indices = np.array([test_u_dict[o] for o in test_u_indices]) test_v_indices = np.array([test_v_dict[o] for o in test_v_indices]) test_support = support[np.array(test_u)] test_support_t = support_t[np.array(test_v)] # Collect all user and item nodes for validation set val_u = list(set(val_u_indices)) val_v = list(set(val_v_indices)) val_u_dict = {n: i for i, n in enumerate(val_u)} val_v_dict = {n: i for i, n in enumerate(val_v)} val_u_indices = np.array([val_u_dict[o] for o in val_u_indices]) val_v_indices = np.array([val_v_dict[o] for o in val_v_indices]) val_support = support[np.array(val_u)] val_support_t = support_t[np.array(val_v)] # Collect all user and item nodes for train set train_u = list(set(train_u_indices)) train_v = list(set(train_v_indices)) train_u_dict = {n: i for i, n in enumerate(train_u)} train_v_dict = {n: i for i, n in enumerate(train_v)} train_u_indices = np.array([train_u_dict[o] for o in train_u_indices]) train_v_indices = np.array([train_v_dict[o] for o in train_v_indices]) train_support = support[np.array(train_u)] train_support_t = support_t[np.array(train_v)] # features as side info if FEATURES: test_u_features_side = u_features_side[np.array(test_u)] test_v_features_side = v_features_side[np.array(test_v)] val_u_features_side = u_features_side[np.array(val_u)] val_v_features_side = v_features_side[np.array(val_v)] train_u_features_side = u_features_side[np.array(train_u)] train_v_features_side = v_features_side[np.array(train_v)] else: test_u_features_side = None test_v_features_side = None val_u_features_side = None val_v_features_side = None train_u_features_side = None train_v_features_side = None placeholders = { 'u_features': tf.sparse_placeholder(tf.float32, shape=np.array(u_features.shape, dtype=np.int64)), 'v_features': tf.sparse_placeholder(tf.float32, shape=np.array(v_features.shape, dtype=np.int64)), 'u_features_nonzero': tf.placeholder(tf.int32, shape=()), 'v_features_nonzero': tf.placeholder(tf.int32, shape=()), 'labels': tf.placeholder(tf.int32, shape=(None,)), 'u_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)), 'v_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)), 'user_indices': tf.placeholder(tf.int32, shape=(None,)), 'item_indices': tf.placeholder(tf.int32, shape=(None,)), 'class_values': tf.placeholder(tf.float32, shape=class_values.shape), 'dropout': tf.placeholder_with_default(0., shape=()), 'weight_decay': tf.placeholder_with_default(0., shape=()), 'support': tf.sparse_placeholder(tf.float32, shape=(None, None)), 'support_t': tf.sparse_placeholder(tf.float32, shape=(None, None)), } # create model if FEATURES: model = RecommenderSideInfoGAE(placeholders, input_dim=u_features.shape[1], feat_hidden_dim=FEATHIDDEN, num_classes=NUMCLASSES, num_support=num_support, self_connections=SELFCONNECTIONS, num_basis_functions=BASES, hidden=HIDDEN, num_users=num_users, num_items=num_items, accum=ACCUM, learning_rate=LR, num_side_features=num_side_features, logging=True) else: model = RecommenderGAE(placeholders, input_dim=u_features.shape[1], num_classes=NUMCLASSES, num_support=num_support, self_connections=SELFCONNECTIONS, num_basis_functions=BASES, hidden=HIDDEN, num_users=num_users, num_items=num_items, accum=ACCUM, learning_rate=LR, logging=True) # Convert sparse placeholders to tuples to construct feed_dict test_support = sparse_to_tuple(test_support) test_support_t = sparse_to_tuple(test_support_t) val_support = sparse_to_tuple(val_support) val_support_t = sparse_to_tuple(val_support_t) train_support = sparse_to_tuple(train_support) train_support_t = sparse_to_tuple(train_support_t) u_features = sparse_to_tuple(u_features) v_features = sparse_to_tuple(v_features) assert u_features[2][1] == v_features[2][1], 'Number of features of users and items must be the same!' num_features = u_features[2][1] u_features_nonzero = u_features[1].shape[0] v_features_nonzero = v_features[1].shape[0] # Feed_dicts for validation and test set stay constant over different update steps train_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero, v_features_nonzero, train_support, train_support_t, train_labels, train_u_indices, train_v_indices, class_values, DO, train_u_features_side, train_v_features_side) # No dropout for validation and test runs val_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero, v_features_nonzero, val_support, val_support_t, val_labels, val_u_indices, val_v_indices, class_values, 0., val_u_features_side, val_v_features_side) test_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero, v_features_nonzero, test_support, test_support_t, test_labels, test_u_indices, test_v_indices, class_values, 0., test_u_features_side, test_v_features_side) # Collect all variables to be logged into summary merged_summary = tf.summary.merge_all() #sess = tf.Session() sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) if WRITESUMMARY: train_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/train', sess.graph) val_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/val') else: train_summary_writer = None val_summary_writer = None best_val_score = np.inf best_val_loss = np.inf best_epoch = 0 wait = 0 print('Training...') train_loss_values = [] train_rmse_values = [] val_loss_values = [] val_rmse_values = [] list_embeddings = [] for epoch in range(NB_EPOCH): t = time.time() # Run single weight update # outs = sess.run([model.opt_op, model.loss, model.rmse], feed_dict=train_feed_dict) # with exponential moving averages outs = sess.run([model.training_op, model.loss, model.rmse], feed_dict=train_feed_dict) #print(len(model.embeddings)) train_avg_loss = outs[1] train_rmse = outs[2] val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict) train_loss_values.append(train_avg_loss) train_rmse_values.append(train_rmse) val_loss_values.append(val_avg_loss) val_rmse_values.append(val_rmse) if VERBOSE: print("[*] Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_avg_loss), "train_rmse=", "{:.5f}".format(train_rmse), "val_loss=", "{:.5f}".format(val_avg_loss), "val_rmse=", "{:.5f}".format(val_rmse), "\t\ttime=", "{:.5f}".format(time.time() - t)) if epoch==NB_EPOCH - 1: embedding_users = model.embeddings[0].eval(feed_dict=train_feed_dict) embedding_movies = model.embeddings[1].eval(feed_dict=train_feed_dict) if val_rmse < best_val_score: best_val_score = val_rmse best_epoch = epoch if epoch % 20 == 0 and WRITESUMMARY: # Train set summary summary = sess.run(merged_summary, feed_dict=train_feed_dict) train_summary_writer.add_summary(summary, epoch) train_summary_writer.flush() # Validation set summary summary = sess.run(merged_summary, feed_dict=val_feed_dict) val_summary_writer.add_summary(summary, epoch) val_summary_writer.flush() if epoch % 100 == 0 and epoch > 1000 and not TESTING and False: saver = tf.train.Saver() save_path = saver.save(sess, "tmp/%s_seed%d.ckpt" % (model.name, DATASEED), global_step=model.global_step) # load polyak averages variables_to_restore = model.variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) saver.restore(sess, save_path) val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict) print('polyak val loss = ', val_avg_loss) print('polyak val rmse = ', val_rmse) # Load back normal variables saver = tf.train.Saver() saver.restore(sess, save_path) # store model including exponential moving averages saver = tf.train.Saver() save_path = saver.save(sess, "tmp/%s.ckpt" % model.name, global_step=model.global_step) if VERBOSE: print("\nOptimization Finished!") print('best validation score =', best_val_score, 'at iteration', best_epoch+1) if TESTING: test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict) print('test loss = ', test_avg_loss) print('test rmse = ', test_rmse) # restore with polyak averages of parameters variables_to_restore = model.variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) saver.restore(sess, save_path) test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict) print('polyak test loss = ', test_avg_loss) print('polyak test rmse = ', test_rmse) else: # restore with polyak averages of parameters variables_to_restore = model.variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) saver.restore(sess, save_path) val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict) print('polyak val loss = ', val_avg_loss) print('polyak val rmse = ', val_rmse) print('global seed = ', seed) sess.close() return embedding_users, embedding_movies, train_loss_values, train_rmse_values, val_loss_values, val_rmse_values
def run(DATASET='douban', DATASEED=1234, random_seed=123, NB_EPOCH=200, DO=0, HIDDEN=[100, 75], FEATHIDDEN=64, LR=0.01, decay_rate=1.25, consecutive_threshold=5, FEATURES=False, SYM=True, TESTING=False, ACCUM='stackRGGCN', NUM_LAYERS=1, GCMC_INDICES=False): np.random.seed(random_seed) tf.set_random_seed(random_seed) SELFCONNECTIONS = False SPLITFROMFILE = True VERBOSE = False BASES = 2 WRITESUMMARY = False SUMMARIESDIR = 'logs/' if DATASET == 'ml_1m' or DATASET == 'ml_100k' or DATASET == 'douban': NUMCLASSES = 5 elif DATASET == 'ml_10m': NUMCLASSES = 10 print( '\n WARNING: this might run out of RAM, consider using train_minibatch.py for dataset %s' % DATASET) print( 'If you want to proceed with this option anyway, uncomment this.\n' ) sys.exit(1) elif DATASET == 'flixster': NUMCLASSES = 10 elif DATASET == 'yahoo_music': NUMCLASSES = 71 if ACCUM == 'sum': print( '\n WARNING: combining DATASET=%s with ACCUM=%s can cause memory issues due to large number of classes.' ) print( 'Consider using "--accum stack" as an option for this dataset.' ) print( 'If you want to proceed with this option anyway, uncomment this.\n' ) sys.exit(1) # Splitting dataset in training, validation and test set if DATASET == 'ml_1m' or DATASET == 'ml_10m': if FEATURES: datasplit_path = 'data/' + DATASET + '/withfeatures_split_seed' + str( DATASEED) + '.pickle' else: datasplit_path = 'data/' + DATASET + '/split_seed' + str( DATASEED) + '.pickle' elif FEATURES: datasplit_path = 'data/' + DATASET + '/withfeatures.pickle' else: datasplit_path = 'data/' + DATASET + '/nofeatures.pickle' if DATASET == 'flixster' or DATASET == 'douban' or DATASET == 'yahoo_music': u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \ val_labels, val_u_indices, val_v_indices, test_labels, \ test_u_indices, test_v_indices, class_values = load_data_monti(DATASET, TESTING) elif DATASET == 'ml_100k': print( "Using official MovieLens dataset split u1.base/u1.test with 20% validation set size..." ) u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \ val_labels, val_u_indices, val_v_indices, test_labels, \ test_u_indices, test_v_indices, class_values = load_official_trainvaltest_split(DATASET, TESTING) else: print("Using random dataset split ...") u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \ val_labels, val_u_indices, val_v_indices, test_labels, \ test_u_indices, test_v_indices, class_values = create_trainvaltest_split(DATASET, DATASEED, TESTING, datasplit_path, SPLITFROMFILE, VERBOSE) num_users, num_items = adj_train.shape num_side_features = 0 # feature loading if not FEATURES: u_features = sp.identity( num_users, format='csr') # features is just one-hot vector! v_features = sp.identity(num_items, format='csr') u_features, v_features = preprocess_user_item_features( u_features, v_features) elif FEATURES and u_features is not None and v_features is not None: # use features as side information and node_id's as node input features print("Normalizing feature vectors...") u_features_side = normalize_features(u_features) v_features_side = normalize_features(v_features) u_features_side, v_features_side = preprocess_user_item_features( u_features_side, v_features_side) u_features_side = np.array(u_features_side.todense(), dtype=np.float32) v_features_side = np.array(v_features_side.todense(), dtype=np.float32) num_side_features = u_features_side.shape[1] # node id's for node input features id_csr_v = sp.identity(num_items, format='csr') id_csr_u = sp.identity(num_users, format='csr') u_features, v_features = preprocess_user_item_features( id_csr_u, id_csr_v) else: raise ValueError( 'Features flag is set to true but no features are loaded from dataset ' + DATASET) # print("User features shape: " + str(u_features.shape)) # print("Item features shape: " + str(v_features.shape)) # print("adj_train shape: " + str(adj_train.shape)) # global normalization support = [] support_t = [] adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32) for i in range(NUMCLASSES): # build individual binary rating matrices (supports) for each rating support_unnormalized = sp.csr_matrix(adj_train_int == i + 1, dtype=np.float32) if support_unnormalized.nnz == 0 and DATASET != 'yahoo_music': # yahoo music has dataset split with not all ratings types present in training set. # this produces empty adjacency matrices for these ratings. sys.exit( 'ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!' ) support_unnormalized_transpose = support_unnormalized.T support.append(support_unnormalized) support_t.append(support_unnormalized_transpose) support = globally_normalize_bipartite_adjacency(support, symmetric=SYM) support_t = globally_normalize_bipartite_adjacency(support_t, symmetric=SYM) if SELFCONNECTIONS: support.append(sp.identity(u_features.shape[0], format='csr')) support_t.append(sp.identity(v_features.shape[0], format='csr')) num_support = len(support) support = sp.hstack(support, format='csr') support_t = sp.hstack(support_t, format='csr') # support and support_t become 3000x15000 (for douban with 3000 users/items and 5 ratings) # support is n_users x (n_items*n_ratings). support_t is n_items x (n_users*ratings) # NOTE: support is sparse matrix so the shape may not be as large as expected (?) # When is num_support ever not == num_rating_classes? # print('support shape: ' + str(support.shape)) # print('support_t shape: ' + str(support_t.shape)) if ACCUM == 'stack' or ACCUM == 'stackRGGCN': div = HIDDEN[0] // num_support if HIDDEN[0] % num_support != 0: print( """\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that it can be evenly split in %d splits.\n""" % (HIDDEN[0], num_support * div, num_support)) HIDDEN[0] = num_support * div ################################################################################################################## """ support contains only training set ratings. index into support using user/item indices to create test set support. """ test_support = val_support = train_support = support test_support_t = val_support_t = train_support_t = support_t if GCMC_INDICES: # Collect all user and item nodes for test set test_u = list(set(test_u_indices)) test_v = list(set(test_v_indices)) test_support = support[np.array(test_u)] test_support_t = support_t[np.array(test_v)] # Collect all user and item nodes for validation set val_u = list(set(val_u_indices)) val_v = list(set(val_v_indices)) val_support = support[np.array(val_u)] val_support_t = support_t[np.array(val_v)] # Collect all user and item nodes for train set train_u = list(set(train_u_indices)) train_v = list(set(train_v_indices)) train_support = support[np.array(train_u)] train_support_t = support_t[np.array(train_v)] test_u_dict = {n: i for i, n in enumerate(test_u)} test_v_dict = {n: i for i, n in enumerate(test_v)} test_u_indices = np.array([test_u_dict[o] for o in test_u_indices]) test_v_indices = np.array([test_v_dict[o] for o in test_v_indices]) val_u_dict = {n: i for i, n in enumerate(val_u)} val_v_dict = {n: i for i, n in enumerate(val_v)} val_u_indices = np.array([val_u_dict[o] for o in val_u_indices]) val_v_indices = np.array([val_v_dict[o] for o in val_v_indices]) train_u_dict = {n: i for i, n in enumerate(train_u)} train_v_dict = {n: i for i, n in enumerate(train_v)} print('max train_u_indices: {}'.format(max(train_u_indices))) train_u_indices = np.array( [train_u_dict[o] for o in train_u_indices] ) ### HERE IS WHERE indices get changed to suit the new indexing into smaller set of users train_v_indices = np.array([train_v_dict[o] for o in train_v_indices]) print('max train_u_indices after: {}'.format(max(train_u_indices))) # print('train_support_shape: {}'.format(train_support.shape)) # if GCMC_INDICES, THIS IS NO LONGER (n_users, n_items*n_rating_types). but < n_users ################################################################################################################## # features as side info if FEATURES: test_u_features_side = u_features_side[np.array(test_u)] test_v_features_side = v_features_side[np.array(test_v)] val_u_features_side = u_features_side[np.array(val_u)] val_v_features_side = v_features_side[np.array(val_v)] train_u_features_side = u_features_side[np.array(train_u)] train_v_features_side = v_features_side[np.array(train_v)] else: test_u_features_side = None test_v_features_side = None val_u_features_side = None val_v_features_side = None train_u_features_side = None train_v_features_side = None placeholders = { 'u_features': tf.sparse_placeholder(tf.float32, shape=np.array(u_features.shape, dtype=np.int64)), 'v_features': tf.sparse_placeholder(tf.float32, shape=np.array(v_features.shape, dtype=np.int64)), 'u_features_nonzero': tf.placeholder(tf.int32, shape=()), 'v_features_nonzero': tf.placeholder(tf.int32, shape=()), 'labels': tf.placeholder(tf.int32, shape=(None, )), 'u_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)), 'v_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)), 'user_indices': tf.placeholder(tf.int32, shape=(None, )), 'item_indices': tf.placeholder(tf.int32, shape=(None, )), 'class_values': tf.placeholder(tf.float32, shape=class_values.shape), 'dropout': tf.placeholder_with_default(0., shape=()), 'weight_decay': tf.placeholder_with_default(0., shape=()), 'support': tf.sparse_placeholder(tf.float32, shape=(None, None)), 'support_t': tf.sparse_placeholder(tf.float32, shape=(None, None)), } ################################################################################################################## E_start, E_end = get_edges_matrices(adj_train) # E_start = sp.hstack(E_start, format='csr') # confirm if vstack is correct and not hstack # E_end = sp.hstack(E_end, format='csr') # placeholders['E_start'] = tf.sparse_placeholder(tf.float32, shape=(None, None, None)) # placeholders['E_end'] = tf.sparse_placeholder(tf.float32, shape=(None, None, None)) placeholders['E_start_list'] = [] placeholders['E_end_list'] = [] for i in range(num_support): placeholders['E_start_list'].append( tf.sparse_placeholder(tf.float32, shape=(None, None))) placeholders['E_end_list'].append( tf.sparse_placeholder(tf.float32, shape=(None, None))) # print('shape of E_end for first rating type: {}'.format(E_end[0].toarray().shape)) ################################################################################################################## # create model if FEATURES: model = RecommenderSideInfoGAE(placeholders, input_dim=u_features.shape[1], feat_hidden_dim=FEATHIDDEN, num_classes=NUMCLASSES, num_support=num_support, self_connections=SELFCONNECTIONS, num_basis_functions=BASES, hidden=HIDDEN, num_users=num_users, num_items=num_items, accum=ACCUM, learning_rate=LR, num_side_features=num_side_features, logging=True) else: model = RecommenderGAE(placeholders, input_dim=u_features.shape[1], num_classes=NUMCLASSES, num_support=num_support, self_connections=SELFCONNECTIONS, num_basis_functions=BASES, hidden=HIDDEN, num_users=num_users, num_items=num_items, accum=ACCUM, learning_rate=LR, num_layers=NUM_LAYERS, logging=True) # Convert sparse placeholders to tuples to construct feed_dict. sparse placeholders expect tuple of (indices, values, shape) test_support = sparse_to_tuple(test_support) test_support_t = sparse_to_tuple(test_support_t) val_support = sparse_to_tuple(val_support) val_support_t = sparse_to_tuple(val_support_t) train_support = sparse_to_tuple(train_support) train_support_t = sparse_to_tuple(train_support_t) u_features = sparse_to_tuple(u_features) v_features = sparse_to_tuple(v_features) assert u_features[2][1] == v_features[2][ 1], 'Number of features of users and items must be the same!' num_features = u_features[2][1] u_features_nonzero = u_features[1].shape[0] v_features_nonzero = v_features[1].shape[0] # setting E_start to be the same for train, val, and test. E_start already only contains train edges (from preprocessing script) train_E_start = [] train_E_end = [] # print('LENGTH OF E_START: {}'.format(len(E_start))) # print('NUM_SUPPORT: {}'.format(num_support)) for i in range(num_support): train_E_start.append(sparse_to_tuple(E_start[i])) train_E_end.append(sparse_to_tuple(E_end[i])) val_E_start = test_E_start = train_E_start val_E_end = test_E_end = train_E_end # Feed_dicts for validation and test set stay constant over different update steps train_feed_dict = construct_feed_dict( placeholders, u_features, v_features, u_features_nonzero, v_features_nonzero, train_support, train_support_t, train_labels, train_u_indices, train_v_indices, class_values, DO, train_u_features_side, train_v_features_side, train_E_start, train_E_end) # No dropout for validation and test runs. DO = dropout. input for val and test is same u_features and v_features. val_feed_dict = construct_feed_dict( placeholders, u_features, v_features, u_features_nonzero, v_features_nonzero, val_support, val_support_t, val_labels, val_u_indices, val_v_indices, class_values, 0., val_u_features_side, val_v_features_side, val_E_start, val_E_end) test_feed_dict = construct_feed_dict( placeholders, u_features, v_features, u_features_nonzero, v_features_nonzero, test_support, test_support_t, test_labels, test_u_indices, test_v_indices, class_values, 0., test_u_features_side, test_v_features_side, test_E_start, test_E_end) # Collect all variables to be logged into summary merged_summary = tf.summary.merge_all() sess = tf.Session() sess.run(tf.global_variables_initializer()) if WRITESUMMARY: train_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/train', sess.graph) val_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/val') else: train_summary_writer = None val_summary_writer = None best_val_score = np.inf best_val_loss = np.inf best_epoch = 0 wait = 0 print('Training...') #### COUTNING PARAMS total_parameters = 0 for variable in tf.trainable_variables(): # shape is an array of tf.Dimension shape = variable.get_shape() variable_parameters = 1 for dim in shape: variable_parameters *= dim.value total_parameters += variable_parameters print('Total params: {}'.format(total_parameters)) # FOR A VARIABLE LEARNING RATE assign_placeholder = tf.placeholder(tf.float32) assign_op = model.learning_rate.assign(assign_placeholder) old_loss = float('inf') # print('Original learning rate is {}'.format(sess.run(model.optimizer._lr))) train_rmses, val_rmses, train_losses, val_losses = [], [], [], [] for epoch in tqdm(range(NB_EPOCH)): t = time.time() # Run single weight update # outs = sess.run([model.opt_op, model.loss, model.rmse], feed_dict=train_feed_dict) # with exponential moving averages outs = sess.run([model.training_op, model.loss, model.rmse], feed_dict=train_feed_dict) train_avg_loss = outs[1] train_rmse = outs[2] val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict) # if train_avg_loss > 0.999*old_loss: # consecutive += 1 # if consecutive >= consecutive_threshold: # LR /= decay_rate # sess.run(assign_op, feed_dict={assign_placeholder: LR}) # print('New learning rate is {}'.format(sess.run(model.optimizer._lr))) # consecutive = 0 # else: # consecutive = 0 # old_loss = train_avg_loss train_rmses.append(train_rmse) val_rmses.append(val_rmse) train_losses.append(train_avg_loss) val_losses.append(val_avg_loss) if VERBOSE: print("[*] Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_avg_loss), "train_rmse=", "{:.5f}".format(train_rmse), "val_loss=", "{:.5f}".format(val_avg_loss), "val_rmse=", "{:.5f}".format(val_rmse), "\t\ttime=", "{:.5f}".format(time.time() - t)) if val_rmse < best_val_score: best_val_score = val_rmse best_epoch = epoch if epoch % 20 == 0 and WRITESUMMARY: # Train set summary summary = sess.run(merged_summary, feed_dict=train_feed_dict) train_summary_writer.add_summary(summary, epoch) train_summary_writer.flush() # Validation set summary summary = sess.run(merged_summary, feed_dict=val_feed_dict) val_summary_writer.add_summary(summary, epoch) val_summary_writer.flush() if epoch % 100 == 0 and epoch > 1000 and not TESTING and False: saver = tf.train.Saver() save_path = saver.save(sess, "tmp/%s_seed%d.ckpt" % (model.name, DATASEED), global_step=model.global_step) # load polyak averages variables_to_restore = model.variable_averages.variables_to_restore( ) saver = tf.train.Saver(variables_to_restore) saver.restore(sess, save_path) val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict) print('polyak val loss = ', val_avg_loss) print('polyak val rmse = ', val_rmse) # Load back normal variables saver = tf.train.Saver() saver.restore(sess, save_path) # store model including exponential moving averages saver = tf.train.Saver() save_path = saver.save(sess, "tmp/%s.ckpt" % model.name, global_step=model.global_step) if VERBOSE: print("\nOptimization Finished!") print('best validation score =', best_val_score, 'at iteration', best_epoch) if TESTING: test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict) print('test loss = ', test_avg_loss) print('test rmse = ', test_rmse) # restore with polyak averages of parameters variables_to_restore = model.variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) saver.restore(sess, save_path) test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict) print('polyak test loss = ', test_avg_loss) print('polyak test rmse = ', test_rmse) sess.close() tf.reset_default_graph() return train_rmses, val_rmses, train_losses, val_losses, test_rmse else: # restore with polyak averages of parameters variables_to_restore = model.variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) saver.restore(sess, save_path) val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict) print('polyak val loss = ', val_avg_loss) print('polyak val rmse = ', val_rmse) sess.close() tf.reset_default_graph() return train_rmses, val_rmses, train_losses, val_losses, val_rmse
def create_features_graph(adj, features, labels): ''' creating a features_graph by add feature_node from node_feature to the graph in which node belongs to . :param adj: coo_matrix with dimension n*n where n = number of total nodes (without features node) :param features: coo_matrix with dimension n*d where d = feature dim :param labels: numpy array :return: adj of features_node_graph ''' ''' step >assign index to features with value of 1 (coo_matrix) >add coordinate to adj (coo_matrix) ''' # -- convert node_feature adj_dim = adj.shape[0] # use this as a start index of features_row features_dim = features.shape[1] # display2screen(features_dim) # # -- feature to feature edges # # --option 1 feature nodes have no self loops # option = "feature_no_self_loop" # ff_row = [i + adj_dim for i in range(features_dim)] # ff_col = [i + adj_dim for i in range(features_dim)] # ff_val = [0 for i in range(features_dim)] # # -- option 2 features nodes have self loop option = 'features_with_self_loop' identity_mx = sp.eye(adj.shape[0]).tocoo() ff_row, ff_col, ff_val = identity_mx.row.tolist(), identity_mx.col.tolist(), identity_mx.data.tolist() # # -- option 3 all features are connected as clique # option = 'features_clique' # ones_max = np.ones(adj.shape[0]) # ones_max = sp.coo_matrix(ones_max) # ff_row, ff_col, ff_val = ones_max.row.tolist(), ones_max.col.tolist(), ones_max.data.tolist() # ff_row, ff_col, ff_val = identity_mx.row, identity_mx.col, identity_mx.data # node to feature edges features_ind = [i + adj_dim for i in range(features_dim)] # dim = feature_dim nodes_ind = [i for i in range(adj_dim)] # dim = adj_dim nf_row = [] nf_col = [] nf_val = [] # todo create unormalized_fetaures_graph.npy # file_name = f"adj_features.npy" file_name = f"adj_features_graph_normalized_node_option={option}.npy" # file_name = "adj_unormalized_features_graph.npy" file_path = f'data/preprocessing/{file_name}' if os.path.exists(file_path): # -- load pre_processed numpy from file_path s = time.time() adj = np.load(file_path) f = time.time() total = f-s print(f"total time = {total}") node_features_row, node_features_col = sp.find(sp.csr_matrix(adj))[0], sp.find(sp.csr_matrix(adj))[1] else: print("converted node features to features graph...") s = time.time() for row in nodes_ind: ''' csr_matrix.nonzero() eg (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2])) ''' # -- non_zero_feature of the current cow non_zero_ind = [ features.nonzero()[1][i] + adj_dim for i,j in enumerate(features.nonzero()[0]) if j == row] nf_row += [row for i in range(len(non_zero_ind))] nf_col += non_zero_ind # -- val = 1 if node has feature in it otherwise 0 nf_val += [1 for i in features_ind if i in non_zero_ind] # dim = adj_dim f = time.time() total = f-s print(f"time ={total}") # --normalized nn_val adj = preprocessing.normalize_features(adj) nn_row = sp.find(adj)[0].tolist() nn_col = sp.find(adj)[1].tolist() nn_val = sp.find(adj)[2].tolist() # --create adj of node_featurse_graph node_features_row = ff_row + nf_row + nf_col + nn_row node_features_col = ff_col + nf_col + nf_row + nn_col node_features_val = ff_val + nf_val + nf_val + nn_val assert len(node_features_val) == len(node_features_row) == len(node_features_col), f"{len(node_features_val)} == {len(node_features_row)} == {len(node_features_col)}" node_features_graph = sp.csr_matrix((node_features_val, (node_features_row, node_features_col))) # -- edges # edges = np.array([[i, j] for i, j in zip(node_features_row, node_features_col)]).T # dim (2, #edges) # -- for readability adj = node_features_graph.todense() # -- save to file_path np.save(file_path, adj) # display2screen('line 117') max_labels = np.amax(labels) non_class_label = max_labels+1 labels = labels.tolist() + [non_class_label for i in range(adj.shape[0] - labels.flatten().shape[0])] # -- convert to numpy array edges = np.array([[i, j] for i, j in zip(node_features_row, node_features_col)]).T # dim (2, #edges) edges = edges.astype(int) labels = np.array(labels) # display2screen(edges.shape) return adj, edges, labels
# # >> interestin observation: # # :accuracy converge very slowly at 550 to 71 percent accuracy at with option 1.1 normalized adj # # :accuracy converge very quickly at 61-61 percent with option 1.2 unnormalized adj # # tmp = adj # adj = normalize_features(sp.csr_matrix(adj.astype(float))) # --option 1.1 normalize # # adj = adj # option 1.2 unnormalize # # display2screen(tmp[np.nonzero(x)[0][0], np.nonzero(x)[1][0]], x[np.nonzero(x)[0][0], np.nonzero(x)[1][0]]) # adj = torch.tensor(adj) # x = adj # edge_index = torch.tensor(edge_index, dtype=torch.int32) # y = torch.tensor(y, dtype=torch.long) # -- option 2 => dim = (n+f) * (n+f) ;identity matrix # >> test accuracy converge at aroung 61-62 percent x = np.identity(adj.shape[0]) x = preprocessing.normalize_features(sp.csr_matrix(x)) x = torch.tensor(x, dtype=torch.long) edge_index = torch.tensor(edge_index, dtype=torch.int32) y = torch.tensor(y, dtype=torch.long) # -- option 3 => dim = (n+f) * f; # >>very bad accuracy at epoch 200, but it seems that its performance has not yet fully converge # features = np.concatenate((features.numpy(), np.identity(features.shape[1]))) # -- option 3.1 # # features = np.concatenate((features.numpy(), 0 * np.identity(features.shape[1]))).astype(float) # -- option 3.2 # # display2screen(np.amax(np.sum(features, axis=1))) # # tmp = features # features = normalize_features(sp.csr_matrix(features)) # # display2screen(tmp[np.nonzero(x)[0][0], np.nonzero(x)[1][0]], x[np.nonzero(x)[0][0], np.nonzero(x)[1][0]]) # features = torch.tensor(features, dtype=torch.long) # x = features
def run_gcn_on_disease_graph(config, emb_name): ''' Frame the problem by connect subgraph that has shared nodes ie. diseases that share node will be connected by an edges :param config: :return: ''' # -- input arguments copd = config["data"] input = config["input"] # {disease_idx1: [[0,0,0,1,0,0], ....], disease_idx2: [...],... } y = config['label'] train_mask = config['train_mask'] test_mask = config['test_mask'] emb = config['emb'] hidden_sizes = config['hidden_layers'] epochs = config['epochs'] args = config['args'] param = config['param'] len_nodes = len(input.keys()) # amount of all node train_label = y[train_mask] test_label = y[test_mask] train_onehot = [] test_onehot = [] train_key = [] test_key = [] # -- convert onehot input into the following format # from # {disease_idx1: [[0,0,0,1,0,0],[0,1,0,0,0,0] ....], disease_idx2: [...],... } # to # {disease_idx1: [0,1,0,1,0,0], disease_idx2: [...],... } for key, val in input.items(): sum = 0 if int(key) in train_mask: for v in val: sum = np.add(sum, v) input[key] = sum train_onehot.append(input[key]) train_key.append(key) sum1 = 0 if int(key) in test_mask: for v in val: sum1 = np.add(sum1, v) input[key] = sum1 test_onehot.append(input[key]) test_key.append(key) # -- normalize feature train_input = preprocessing.normalize_features(csr_matrix(np.array(train_onehot))) test_input = preprocessing.normalize_features(csr_matrix(np.array(test_onehot))) # -- edge_index for disease_graph # 1. find overlap value between each disease edge_index = [] # the higher the threshold, the most overfit to training set it is. # This is because in there will noly have edges to node that have edge sto more genes. th = int(args.th) # default = 100 for d_out, k_out in zip(test_input, test_key): for d_in, k_in in zip(test_input, test_key): x = d_out - d_in x = x[x!=0] if x.shape[1] > th: if [k_out, k_in] not in edge_index and [k_in, k_out] not in edge_index: # print(f"form edges between {k_out} and {k_in}") edge_index.append([k_out, k_in]) for d_out, k_out in zip(train_input, train_key): for d_in, k_in in zip(train_input, train_key): x = d_out - d_in x = x[x!=0] if x.shape[1] > th: if [k_out, k_in] not in edge_index and [k_in, k_out] not in edge_index: # print(f"form edges between {k_out} and {k_in}") edge_index.append([k_out, k_in]) import math sparsity = len(edge_index)/ math.factorial(len_nodes) print(f"num_edges = {len(edge_index)}") print(f"edges sparsity = {sparsity}" ) edge_index = np.array(edge_index).T # display2screen(edge_index.shape, np.amax(edge_index.flatten())) # -- create train_input if emb_name != 'no_feat': train_input = emb[train_mask] test_input = emb[test_mask] else: train_input = train_input test_input = test_input # -- convert to tensor train_input = torch.tensor(train_input, dtype=torch.float) test_input = torch.tensor(test_input, dtype=torch.float) train_label = torch.tensor(train_label, dtype=torch.long) test_label = torch.tensor(test_label, dtype=torch.long) edge_index = torch.tensor(edge_index, dtype=torch.long) weighted_class = torch.tensor(args.weighted_class, dtype=torch.float) x = torch.cat((train_input,test_input), 0) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # import torch_geometric from torch_geometric.nn import GCNConv, ChebConv, GATConv, SAGEConv class Net(torch.nn.Module): def __init__(self): super(Net, self).__init__() modules = { # "conv1": GCNConv(20, args.hidden, cached=True), "conv1": GCNConv(train_input.shape[1], args.hidden, cached=True), "conv2": GCNConv(args.hidden, len(copd.labels2idx().keys()), cached=True) } for name, module in modules.items(): self.add_module(name, module) def forward(self, x, edge_index): x = F.relu(self.conv1(x, edge_index)) x = F.dropout(x, p=args.dropout, training=self.training) x = self.conv2(x, edge_index) return F.log_softmax(x, dim=1) gcn = Net().to(device) optimizer = torch.optim.Adam(gcn.parameters(), lr=args.lr, weight_decay=args.weight_decay) def unlabeled_weight(epochs): alpha = 0.0 if epochs > param['T1']: if epochs > param['T2']: alpha = param['af'] else: alpha = (epochs - param['T1']) / (param['T2'] - param['T1'] * param['af']) return alpha def train(): gcn.train() optimizer.zero_grad() if args.pseudo_label_topk: labeled_loss = F.nll_loss(gcn(x, edge_index)[train_mask], train_label, weight=torch.tensor(list(map(int, args.weighted_class)), dtype=torch.float), reduction="mean") # -- labeled top k most confidence node to be pseduo_labels pseudo_label_pred = gcn(x, edge_index).max(1)[1] tmp = gcn(x, edge_index).max(1)[1].detach().flatten().tolist() tmp = [(l, i) for i, l in enumerate(tmp)] tmp = sorted(tmp, key=lambda x: x[0], reverse=True) # rank label by predicted confidence value ranked_labels = [(l, i) for (l, i) in tmp] top_k_tuple = [] for (l, i) in ranked_labels: if len(top_k_tuple) >= int(args.topk): break top_k_tuple.append((i, l)) # get index of top_k to be masked during loss if len(top_k_tuple) >0: top_k = [t[0] for t in top_k_tuple] # -- add top_k to labeld_loss pseudo_label_loss = F.nll_loss(gcn(x, edge_index)[top_k], pseudo_label_pred[top_k], weight=weighted_class, reduction='mean') else: pseudo_label_loss = 0 loss_output = labeled_loss + pseudo_label_loss else: loss_output = F.nll_loss(gcn(x, edge_index)[train_mask], train_label, weight=torch.tensor(list(map(int, args.weighted_class)), dtype=torch.float), reduction="mean") loss_output.backward() optimizer.step() return loss_output.data def test(): gcn.eval() train_pred = gcn(x, edge_index)[train_mask].max(1)[1] train_acc = train_pred.eq(train_label).sum().item() / train_mask.shape[0] test_pred = gcn(x, edge_index)[test_mask].max(1)[1] test_acc = test_pred.eq(test_label).sum().item() / test_mask.shape[0] return [train_acc, test_acc] train_acc_hist = [] test_acc_hist = [] loss_hist = [] log_list = [] for epoch in range(epochs): loss_epoch = train() train_acc, test_acc = test() logging = 'Epoch: {:03d}, Train: {:.4f}, Test: {:.4f}'.format(epoch, train_acc, test_acc) if args.verbose: print(logging) log_list.append(logging) loss_hist.append(loss_epoch) train_acc_hist.append(train_acc) test_acc_hist.append(test_acc) split = args.split # -- create dir for hyperparameter config if not already exists weighted_class = ''.join(list(map(str, args.weighted_class))) HP = f'lr={args.lr}_d={args.dropout}_wd={args.weight_decay}' folder = f"log/gene_disease/{args.time_stamp}/gcn_on_disease_graph/split={split}/{HP}/" import os if not os.path.exists(folder): os.makedirs(folder) # if args.add_features: if args.emb_name != "no_feat": feat_stat = "YES" else: feat_stat = "NO" if args.pseudo_label_all: pseudo_label_stat = "ALL" elif args.pseudo_label_topk: pseudo_label_stat = "TOP_K" elif args.pseudo_label_topk_with_replacement: pseudo_label_stat = "TOP_K_WITH_REPLACEMENT" else: pseudo_label_stat = "NONE" T_param = ','.join([str(param['T1']), str(param['T2'])]) # -- creat directory if not yet created save_path = f'{folder}img/' if not os.path.exists(save_path): os.makedirs(save_path) if args.plot_all is True: args.plot_loss = True args.plot_no_train = True args.plot_train = True if args.plot_loss: # ====================== # == plot loss and acc vlaue # ====================== plt.figure(1) # -- plot loss hist plt.subplot(211) plt.plot(range(len(loss_hist)), loss_hist) plt.ylabel("loss values") plt.title("loss history") # -- plot acc hist plt.subplot(212) plt.plot(range(len(train_acc_hist)), train_acc_hist) plt.plot(range(len(test_acc_hist)), test_acc_hist) plt.ylabel("accuracy values") plt.title("accuracy history") print( "writing to " + save_path + f"LOSS_ACC_feat={feat_stat}_gene_thresh_hold={th}_wc=[{weighted_class}]_T=[{T_param}].png") plt.savefig( save_path + f'ACC_feat={feat_stat}_gene_thresh_hold={th}_wc=[{weighted_class}]_T=[{T_param}].png') plt.show() # --train_mask f1,precision,recall train_pred = gcn(x, edge_index)[train_mask].max(1)[1] train_f1 = f1_score(train_label, train_pred, average='micro') train_precision = precision_score(train_label, train_pred, average='micro') train_recall = recall_score(train_label, train_pred, average='micro') # -- test_mask f1,precision,recall test_pred = gcn(x, edge_index)[test_mask].max(1)[1] test_f1 = f1_score(test_label, test_pred, average='micro') test_precision = precision_score(test_label, test_pred, average='micro') test_recall = recall_score(test_label, test_pred, average='micro') if args.log: save_path = f'{folder}ACC_feat={feat_stat}_pseudo_label={pseudo_label_stat}_gene_thresh_hold={th}_wc={weighted_class}.txt' print(f"writing to {save_path}...") with open(save_path, 'w') as f: txt = '\n'.join(log_list) f.write(txt) if args.log: cm_train = confusion_matrix(gcn(x, edge_index)[train_mask].max(1)[1], train_label) cm_test = confusion_matrix(gcn(x, edge_index)[test_mask].max(1)[1], test_label) # formatter = {'float_kind': lambda x: "%.2f" % x}) cm_train = np.array2string(cm_train) cm_test = np.array2string(cm_test) save_path = f'{folder}CM_feat={feat_stat}_pseudo_label={pseudo_label_stat}_gene_thresh_hold={th}_wc={weighted_class}.txt' print(f"writing to {save_path}...") # txt = 'class int_rep is [' + ','.join(list(map(str, np.unique(data.y.numpy()).tolist()))) + ']' txt = 'class int_rep is [' + ','.join([str(i) for i in range(len(copd.labels2idx().values()))]) + ']' txt = txt + '\n\n' + "training cm" + '\n' + cm_train + '\n' \ + f"training_accuracy ={log_list[-1].split(',')[1]}" + '\n' \ + f"training_f1 ={train_f1}" + '\n' \ + f"training_precision={train_precision}" + '\n' \ + f"training_recall ={train_recall}" + '\n' txt = txt + '\n\n' + "test cm" + '\n' + cm_test + '\n' \ + f"test_accuracy ={log_list[-1].split(',')[2]}" + '\n' \ + f"test_f1 ={test_f1}" + '\n' \ + f"test_precision={test_precision}" + '\n' \ + f"test_recall ={test_recall}" + '\n' with open(save_path, 'w') as f: f.write(txt)