def main(args):
    # check if the 'outputs' directory exists, if not create it
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # read parameters from config file
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # array to save accuracies
    acc_array = np.zeros(args.exp_times)

    for i in range(args.exp_times):
        print("********** BUILD NETWORK: {} Iteration **********".format(i))
        # build network
        gcn = GCN(in_feats, h_feats, out_feats).to(device)

        print("********** TRAIN NETWORK: {} Iteration **********".format(i))
        # train network
        best_acc = train_gcn(gcn, g, features, labels, train_mask, test_mask, args)
        acc_array[i] = best_acc  # store in the acc_array

    acc_current_dataset = np.mean(acc_array) * 100
    print("Average accuracy for dataset {} is {}% !".format(args.dataset, acc_current_dataset))

    # save results
    if args.fixpoint_loss:
        models = ["GCN", "SSE", "GCN trained with joint loss"]
        datasets = ["cora", "pubmed", "citeseer"]
        # check if the file exists
        outputs_file = os.path.join(os.getcwd(), '../outputs/fixedpoint.csv')
        if os.path.exists(outputs_file):
            # read from file
            acc_df = pd.read_csv(outputs_file, index_col=0, header=0)
        else:
            # new array to store accuracy
            # row: dataset  column: model
            acc_all_dataset = np.array([[81.5, 79.4, 0],
                                        [79.0, 75.8, 0],
                                        [70.3, 72.5, 0]])
            acc_df = pd.DataFrame(acc_all_dataset, index=datasets, columns=models)
        acc_df.loc[args.dataset, "GCN trained with joint loss"] = acc_current_dataset
        acc_df.to_csv(outputs_file)
def evaluate(self, symbols_dev_npz_folder):
    dev_dataset = load_dataset(symbols_dev_npz_folder)
    devX, devY = dev_dataset["X"], dev_dataset["y"]
    devY = devY.reshape(-1, 1)
    devX = devX.reshape(*devX.shape, -1)
    loss_and_metrics = self.model.evaluate(devX,
                                           self.encoder.transform(devY),
                                           batch_size=128)
    return {"loss": loss_and_metrics[0], "accuracy": loss_and_metrics[1]}
def train(self, symbols_train_npz_folder):
    train_dataset = load_dataset(symbols_train_npz_folder)
    trainX, trainY = train_dataset["X"], train_dataset["y"]
    trainY = trainY.reshape(-1, 1)
    trainX = trainX.reshape(*trainX.shape, -1)
    self.encoder.fit(trainY)
    self.model.fit(trainX,
                   self.encoder.transform(trainY),
                   epochs=1,
                   batch_size=128,
                   validation_split=0.01)
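# --- Hedged usage sketch (added for illustration, not part of the original source) ---
# The train()/evaluate() methods above assume an enclosing class that owns a compiled
# Keras model in `self.model` and a label binarizer in `self.encoder`. The class name
# `SymbolClassifier`, the layer sizes, `input_shape`, and `num_classes` below are
# assumptions, not names or values confirmed by the source.
from sklearn.preprocessing import LabelBinarizer
from tensorflow import keras


class SymbolClassifier:
    def __init__(self, input_shape=(32, 32, 1), num_classes=15):
        self.encoder = LabelBinarizer()          # one-hot encoder fitted inside train()
        self.model = keras.Sequential([
            keras.Input(shape=input_shape),      # trailing channel dim added by reshape(*shape, -1)
            keras.layers.Flatten(),
            keras.layers.Dense(128, activation="relu"),
            keras.layers.Dense(num_classes, activation="softmax"),
        ])
        self.model.compile(optimizer="adam",
                           loss="categorical_crossentropy",
                           metrics=["accuracy"])
    # train() and evaluate() as defined above would be attached to this class.

# Possible usage (paths are placeholders; num_classes must match the dataset):
#     clf = SymbolClassifier(num_classes=...)
#     clf.train("path/to/symbols_train_npz")
#     print(clf.evaluate("path/to/symbols_dev_npz"))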
def main(args):
    # check if the 'outputs' and 'outputs/identifiability' directories exist, if not create them
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'identifiability')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # prepare to build network
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # results before averaging over the experiments' iterations
    # store 'identifiability rates'/'repeating rates' for different iterations/layers/models
    # 1st dim.: repeating time  2nd dim.: layer  3rd dim.: model
    identifiability_rates = np.zeros([args.repeat_times, args.max_gcn_layers, args.max_gcn_layers])
    repeating_rates = np.zeros([args.repeat_times, args.max_gcn_layers, args.max_gcn_layers])
    # store 'accuracy'/'accuracy on identifiable nodes'/'accuracy on unidentifiable nodes' for different iterations/models
    # 1st dim.: repeating time  2nd dim.: model
    accuracy = np.zeros([args.repeat_times, args.max_gcn_layers])
    accuracy_id = np.zeros([args.repeat_times, args.max_gcn_layers])
    accuracy_unid = np.zeros([args.repeat_times, args.max_gcn_layers])

    for repeat_time in np.arange(args.repeat_times):
        for gcn_model_layer in np.arange(1, args.max_gcn_layers + 1):
            print("********** EXPERIMENT ITERATION: {} **********".format(repeat_time + 1))
            print("********** GCN MODEL: GCN_{}layers **********".format(gcn_model_layer))
            print("********** BUILD GCN NETWORK **********")
            gcn = GCN(gcn_model_layer, in_feats, h_feats, out_feats).to(device)

            print("********** EXPERIMENT ITERATION: {} **********".format(repeat_time + 1))
            print("********** GCN MODEL: GCN_{}layer **********".format(gcn_model_layer))
            print("********** TRAIN GCN NETWORK **********")
            acc = train_gcn(gcn, g, features, labels, train_mask, valid_mask, args)
            accuracy[repeat_time, gcn_model_layer - 1] = acc
            correct_classified_nodes, incorrect_classified_nodes = classify_nodes(gcn, g, features, labels)

            print("********** EXPERIMENT ITERATION: {} **********".format(repeat_time + 1))
            print("********** GCN MODEL: GCN_{}layer **********".format(gcn_model_layer))
            identifiable_nodes_1layer = set()      # store nodes that can be identified after the 1st layer
            identifiable_nodes_last_layer = set()  # store nodes that can be identified after the last layer
            for intermediate_layer in np.arange(1, gcn_model_layer + 1):
                identifiable_nodes_current_layer = set()  # store nodes that can be identified after the current layer
                print("********** INTERMEDIATE LAYER: {} **********".format(intermediate_layer))
                print("********** BUILD REGRESSION MODEL TO RECOVER INPUT FROM EMBEDDING **********")
                # prepare to build the regression model
                embedding = gcn(g, features)[intermediate_layer].clone().detach().to(device)
                input_features = features.clone().detach().to(device)
                regression_in = embedding.shape[1]
                regression_out = input_features.shape[1]
                regression_h = config['regression_hidden_features_identity']
                regression_model = MLP(regression_in, regression_h, regression_out)  # regression model

                print("********** EXPERIMENT ITERATION: {} **********".format(repeat_time + 1))
                print("********** GCN MODEL: GCN_{}layer **********".format(gcn_model_layer))
                print("********** INTERMEDIATE LAYER: {} **********".format(intermediate_layer))
                print("********** TRAIN REGRESSION MODEL TO RECOVER INPUT FROM EMBEDDING **********")
                train_regression(regression_model, embedding, input_features, train_mask, test_mask, args)

                print("********** EXPERIMENT ITERATION: {} **********".format(repeat_time + 1))
                print("********** GCN MODEL: GCN_{}layer **********".format(gcn_model_layer))
                print("********** INTERMEDIATE LAYER: {} **********".format(intermediate_layer))
                print("********** K-NN FINDING CORRESPONDING INPUT FEATURES **********")
                num_nodes = g.number_of_nodes()
                nn_list = np.zeros([num_nodes, args.knn])  # indices of the found nearest neighbours of each node
                regression_output = regression_model(embedding)
                for node_ind in range(num_nodes):
                    neighbour_ind = g.adjacency_matrix(transpose=True)[node_ind].to_dense() == 1
                    neighbour_feat = input_features[neighbour_ind]
                    node_regression_output = regression_output[node_ind]
                    # find the k nearest neighbours
                    node_regression_output = node_regression_output.expand_as(neighbour_feat)
                    dist = torch.nn.functional.cosine_similarity(neighbour_feat, node_regression_output)
                    nn = dist.topk(args.knn, largest=False)
                    # record the indices of the nearest neighbours
                    nn_list[node_ind] = g.adjacency_matrix(transpose=True)[node_ind]._indices()[0, nn.indices]
                    print('Node_id: {} | Corresponding NN Node_id: {}'.format(node_ind, nn_list[node_ind]))

                print("********** EXPERIMENT ITERATION: {} **********".format(repeat_time + 1))
                print("********** GCN MODEL: GCN_{}layer **********".format(gcn_model_layer))
                print("********** INTERMEDIATE LAYER: {} **********".format(intermediate_layer))
                print("********** COMPUTE IDENTIFIABILITY/REPEATING RATES **********")
                # compute the number of identifiable nodes
                nodes_indices = np.arange(num_nodes)
                nodes_indices_expansion = np.expand_dims(nodes_indices, axis=1)
                identifiable_nodes_current_layer.update(
                    nodes_indices[np.any(nodes_indices_expansion.repeat(args.knn, axis=1) == nn_list, axis=1)])
                num_identifiable = len(identifiable_nodes_current_layer)
                if intermediate_layer == 1:
                    identifiable_nodes_1layer.update(identifiable_nodes_current_layer)
                if intermediate_layer == gcn_model_layer:
                    # compute accuracy on identifiable nodes and unidentifiable nodes
                    identifiable_nodes_last_layer.update(identifiable_nodes_current_layer)
                    nodes = set(nodes_indices)
                    unidentifiable_nodes_last_layer = nodes.difference(identifiable_nodes_last_layer)
                    id_correct = len(identifiable_nodes_last_layer.intersection(correct_classified_nodes))
                    id_incorrect = len(identifiable_nodes_last_layer.intersection(incorrect_classified_nodes))
                    unid_correct = len(unidentifiable_nodes_last_layer.intersection(correct_classified_nodes))
                    unid_incorrect = len(unidentifiable_nodes_last_layer.intersection(incorrect_classified_nodes))
                    accuracy_id[repeat_time, gcn_model_layer - 1] = id_correct * 1.0 / (id_correct + id_incorrect)
                    accuracy_unid[repeat_time, gcn_model_layer - 1] = unid_correct * 1.0 / (unid_correct + unid_incorrect)
                    print('accuracy on identifiable nodes = {} %'.format(
                        accuracy_id[repeat_time, gcn_model_layer - 1] * 100))
                    print('accuracy on unidentifiable nodes = {} %'.format(
                        accuracy_unid[repeat_time, gcn_model_layer - 1] * 100))
                # compute the number of repeating nodes
                num_repeating = len(identifiable_nodes_current_layer.intersection(identifiable_nodes_1layer))
                identifiability_rate = num_identifiable * 1.0 / num_nodes
                if len(identifiable_nodes_1layer) != 0:
                    repeating_rate = num_repeating * 1.0 / len(identifiable_nodes_1layer)
                else:
                    repeating_rate = 0.0
                print('identifiability_rate = {} %'.format(identifiability_rate * 100))
                print('node_repeatability_rate = {} %'.format(repeating_rate * 100))
                identifiability_rates[repeat_time, intermediate_layer - 1, gcn_model_layer - 1] = identifiability_rate
                repeating_rates[repeat_time, intermediate_layer - 1, gcn_model_layer - 1] = repeating_rate

    print("********** SAVE RESULT **********")
    # final results
    # dataframes storing 'identifiability rates'/'repeating rates' for different layers/models
    # row: layer  column: model
    models = []  # dataframe column index
    for gcn_model_layer in np.arange(1, args.max_gcn_layers + 1):
        model_name = str(gcn_model_layer) + '-layers ' + 'GCN'
        models.append(model_name)
    identifiability_rates_df = pd.DataFrame(np.mean(identifiability_rates, axis=0),
                                            index=np.arange(1, args.max_gcn_layers + 1),
                                            columns=models)
    repeating_rates_df = pd.DataFrame(np.mean(repeating_rates, axis=0),
                                      index=np.arange(1, args.max_gcn_layers + 1),
                                      columns=models)
    accuracy_df = pd.DataFrame(np.mean(accuracy, axis=0), index=models, columns=['accuracy'])
    accuracy_id_unid_array = np.array([np.mean(accuracy_id, axis=0),
                                       np.mean(accuracy_unid, axis=0)]).transpose()
    accuracy_id_unid_df = pd.DataFrame(accuracy_id_unid_array,
                                       index=models,
                                       columns=['accuracy on identifiable nodes',
                                                'accuracy on unidentifiable nodes'])
    # save results in csv files
    save_subpath = 'embedding_id_rates_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    identifiability_rates_df.to_csv(save_path)
    save_subpath = 'repeating_rates_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    repeating_rates_df.to_csv(save_path)
    save_subpath = 'accuracy_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    accuracy_df.to_csv(save_path)
    save_subpath = 'accuracy_id_unid_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    accuracy_id_unid_df.to_csv(save_path)
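# --- Toy sketch of the identifiability criterion used above (added; all values are illustrative) ---
# A node counts as "identifiable" at a layer if its own index appears among the k nearest
# neighbours found for it (its row of nn_list), i.e. the features recovered from its
# embedding point back to the node itself.
import numpy as np

knn = 2
nn_list = np.array([[0, 3],   # node 0 recovers itself -> identifiable
                    [2, 3],   # node 1 does not -> unidentifiable
                    [2, 1]])  # node 2 recovers itself -> identifiable
nodes_indices = np.arange(nn_list.shape[0])
identifiable = nodes_indices[np.any(nodes_indices[:, None].repeat(knn, axis=1) == nn_list, axis=1)]
print(identifiable)                          # [0 2]
print(len(identifiable) / nn_list.shape[0])  # identifiability rate = 0.666...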
def main(args):
    # check if the 'outputs' directory exists, if not create it
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'gnn_n')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, _, _, _, _, _ = load_dataset(args)
    num_nodes = g.number_of_nodes()

    print("********** READ RESULTS FROM FILE **********")
    nodes_list_100layerGCN_file = os.path.join(
        os.getcwd(), '../outputs/gnn_n/nodes_list_100layerGCN_' + args.dataset + '.npy')
    with open(nodes_list_100layerGCN_file, 'rb') as f:
        nodes_list_100layerGCN = np.load(f, allow_pickle=True)
    nodes_list_3layerMLP_file = os.path.join(
        os.getcwd(), '../outputs/gnn_n/nodes_list_3layerMLP_' + args.dataset + '.npy')
    with open(nodes_list_3layerMLP_file, 'rb') as f:
        nodes_list_3layerMLP = np.load(f, allow_pickle=True)

    print("********** COMPUTE GNN-N VALUE **********")
    features_probability = np.zeros(num_nodes)
    for i in np.arange(num_nodes):
        correctly_classified_times = 0
        for j in np.arange(args.mlp_exp_times):
            if i in nodes_list_3layerMLP[j]:
                correctly_classified_times += 1
        features_probability[i] = correctly_classified_times * 1.0 / args.mlp_exp_times
    graph_structures_probability = np.zeros(num_nodes)
    for i in np.arange(num_nodes):
        correctly_classified_times = 0
        for j in np.arange(args.gcn_exp_times):
            for k in np.arange(args.gcn_num_random_features):
                if i in nodes_list_100layerGCN[j * args.gcn_num_random_features + k]:
                    correctly_classified_times += 1
        graph_structures_probability[i] = correctly_classified_times * 1.0 / (args.gcn_exp_times * args.gcn_num_random_features)
    gnn_n = np.mean((1 - graph_structures_probability) * (1 - features_probability))

    # save results
    datasets = ["cora", "pubmed", "citeseer", "amazon_photo", "amazon_computers",
                "coauthors_physics", "coauthors_cs"]
    # check if the file exists
    outputs_file = os.path.join(outputs_subdir, 'gnn_n.csv')
    if os.path.exists(outputs_file):
        # read from file
        gnn_n_df = pd.read_csv(outputs_file, index_col=0, header=0)
    else:
        # new array to store results
        # row: dataset  column: item
        gnn_n_array = np.zeros([len(datasets), 1])
        gnn_n_df = pd.DataFrame(gnn_n_array, index=datasets, columns=['GNN-N'])
    gnn_n_df.loc[args.dataset, "GNN-N"] = gnn_n
    gnn_n_df.to_csv(outputs_file)
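# --- Illustrative check of the GNN-N computation above (added; toy numbers, not experiment output) ---
# GNN-N averages, over nodes, the probability that a node is classified correctly neither
# from graph structure alone (100-layer GCN with random features) nor from node features
# alone (3-layer MLP), i.e. mean((1 - p_structure) * (1 - p_features)).
import numpy as np

p_structure = np.array([0.9, 0.2, 0.5])  # hypothetical per-node accuracies, structure only
p_features = np.array([0.8, 0.1, 0.5])   # hypothetical per-node accuracies, features only
toy_gnn_n = np.mean((1 - p_structure) * (1 - p_features))
print(toy_gnn_n)  # (0.1*0.2 + 0.8*0.9 + 0.5*0.5) / 3 = 0.33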
def main(args):
    # check if the 'outputs' directory exists, if not create it
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'gnn_n')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # read parameters from config file
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # declarations of variables to save experiment results
    acc_original_features = np.zeros(args.exp_times)
    acc_random_features = np.zeros([args.exp_times, args.num_random_features])
    correctly_classified_nodes_original_features_list = []
    correctly_classified_nodes_random_features_list = []

    for i in range(args.exp_times):
        print("********** BUILD NETWORK: {} Experiment **********".format(i + 1))
        # build network
        gcn = GCN(100, in_feats, h_feats, out_feats).to(device)

        print("********** TRAIN NETWORK: {} Experiment **********".format(i + 1))
        # train network
        _ = train_gcn(gcn, g, features, labels, train_mask, valid_mask, args)

        print("********** TEST WITH ORIGINAL FEATURES: {} Experiment **********".format(i + 1))
        # test with original features
        acc_original_features[i], correctly_classified_nodes_original_features = \
            evaluate_and_classify_nodes_gcn(gcn, g, features, labels, test_mask)
        correctly_classified_nodes_original_features_list.append(correctly_classified_nodes_original_features)
        print("Test accuracy with original features: {:.2f}% !".format(acc_original_features[i] * 100))

        print("********** TEST WITH RANDOM FEATURES: {} Experiment **********".format(i + 1))
        # test with random features
        for j in range(args.num_random_features):
            acc_random_features[i, j], correctly_classified_nodes_random_features = \
                evaluate_and_classify_nodes_with_random_features_gcn(gcn, g, features, labels, test_mask)
            correctly_classified_nodes_random_features_list.append(correctly_classified_nodes_random_features)
            print("Test accuracy with random features {}: {:.2f}% !".format(j + 1, acc_random_features[i, j] * 100))

    print("********** COMPUTE AVERAGE ACCURACY **********")
    acc_mean_original_features = np.mean(acc_original_features)
    acc_std_original_features = np.std(acc_original_features, ddof=1)
    acc_mean_random_features = np.mean(acc_random_features)
    acc_std_random_features = np.std(acc_random_features, ddof=1)
    print("Average accuracy with original features is {:.2f}+-{:.2f}% !".format(
        acc_mean_original_features * 100, acc_std_original_features * 100))
    print("Average accuracy with random features is {:.2f}+-{:.2f}% !".format(
        acc_mean_random_features * 100, acc_std_random_features * 100))

    print("********** COMPUTE R-RR **********")
    r_rr_list = []
    for i in np.arange(args.exp_times):
        for j in np.arange(args.num_random_features):
            for k in np.arange(j + 1, args.num_random_features):
                set_1 = correctly_classified_nodes_random_features_list[i * args.num_random_features + j]
                set_2 = correctly_classified_nodes_random_features_list[i * args.num_random_features + k]
                r_rr_list.append(len(set_1.intersection(set_2)) * 1.0 / min(len(set_1), len(set_2)))
    r_rr_mean = np.mean(np.array(r_rr_list))
    r_rr_std = np.std(np.array(r_rr_list), ddof=1)
    print("R-RR is {:.2f}+-{:.2f}% !".format(r_rr_mean * 100, r_rr_std * 100))

    print("********** COMPUTE RO-RR **********")
    ro_rr_array = np.zeros(args.exp_times)
    for i in np.arange(args.exp_times):
        set_1 = correctly_classified_nodes_original_features_list[i]
        set_2 = correctly_classified_nodes_random_features_list[i * args.num_random_features]
        ro_rr_array[i] = len(set_1.intersection(set_2)) * 1.0 / min(len(set_1), len(set_2))
    ro_rr_mean = np.mean(ro_rr_array)
    ro_rr_std = np.std(ro_rr_array, ddof=1)
    print("RO-RR is {:.2f}+-{:.2f}% !".format(ro_rr_mean * 100, ro_rr_std * 100))

    print("********** COMPUTE TT-RR **********")
    tt_rr_array = np.eye(args.exp_times)
    for i in np.arange(args.exp_times):
        for j in np.arange(i + 1, args.exp_times):
            set_1 = correctly_classified_nodes_random_features_list[i * args.num_random_features]
            set_2 = correctly_classified_nodes_random_features_list[j * args.num_random_features]
            tt_rr_array[i, j] = len(set_1.intersection(set_2)) * 1.0 / min(len(set_1), len(set_2))
            tt_rr_array[j, i] = tt_rr_array[i, j]

    # save results
    datasets = ["cora", "pubmed", "citeseer", "amazon_photo", "amazon_computers",
                "coauthors_physics", "coauthors_cs"]
    items = ["acc mean original features", "acc std original features",
             "acc mean random features", "acc std random features",
             "r_rr mean", "r_rr std", "ro_rr mean", "ro_rr std"]
    # check if the file exists
    outputs_file = os.path.join(outputs_subdir, 'gnn_n_100layerGCN.csv')
    if os.path.exists(outputs_file):
        # read from file
        results_df = pd.read_csv(outputs_file, index_col=0, header=0)
    else:
        # new array to store results
        # row: dataset  column: item
        results_all_dataset = np.zeros([len(datasets), len(items)])
        results_df = pd.DataFrame(results_all_dataset, index=datasets, columns=items)
    results_df.loc[args.dataset, "acc mean original features"] = acc_mean_original_features
    results_df.loc[args.dataset, "acc std original features"] = acc_std_original_features
    results_df.loc[args.dataset, "acc mean random features"] = acc_mean_random_features
    results_df.loc[args.dataset, "acc std random features"] = acc_std_random_features
    results_df.loc[args.dataset, "r_rr mean"] = r_rr_mean
    results_df.loc[args.dataset, "r_rr std"] = r_rr_std
    results_df.loc[args.dataset, "ro_rr mean"] = ro_rr_mean
    results_df.loc[args.dataset, "ro_rr std"] = ro_rr_std
    results_df.to_csv(outputs_file)

    tt_rr_file = os.path.join(outputs_subdir, 'tt_rr_' + args.dataset + '.npy')
    with open(tt_rr_file, 'wb') as f:
        np.save(f, tt_rr_array)
    correctly_classified_nodes_random_features_list_file = os.path.join(
        outputs_subdir, 'nodes_list_100layerGCN_' + args.dataset + '.npy')
    with open(correctly_classified_nodes_random_features_list_file, 'wb') as f:
        np.save(f, correctly_classified_nodes_random_features_list)
def main(args):
    # check if the 'outputs' directory exists, if not create it
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'gnn_n')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # read parameters from config file
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # declarations of variables to save experiment results
    acc = np.zeros(args.exp_times)
    correctly_classified_nodes_list = []

    for i in range(args.exp_times):
        print("********** BUILD NETWORK: {} Experiment **********".format(i + 1))
        # build network
        mlp = MLP(3, in_feats, h_feats, out_feats).to(device)

        print("********** TRAIN NETWORK: {} Experiment **********".format(i + 1))
        # train network
        _ = train_mlp(mlp, features, labels, train_mask, valid_mask, args)

        print("********** TEST MLP: {} Experiment **********".format(i + 1))
        # test with original features
        acc[i], correctly_classified_nodes = evaluate_and_classify_nodes_mlp(mlp, features, labels, test_mask)
        correctly_classified_nodes_list.append(correctly_classified_nodes)
        print("Test accuracy: {:.2f}% !".format(acc[i] * 100))

    print("********** COMPUTE AVERAGE ACCURACY **********")
    acc_avg = np.mean(acc)
    acc_std = np.std(acc, ddof=1)
    print("Accuracy for {} is {:.2f}+-{:.2f}% !".format(args.dataset, acc_avg * 100, acc_std * 100))

    print("********** COMPUTE AVERAGE REPEATING RATE **********")
    repeating_rates_list = []
    for i in np.arange(args.exp_times):
        for j in np.arange(i + 1, args.exp_times):
            set_1 = correctly_classified_nodes_list[i]
            set_2 = correctly_classified_nodes_list[j]
            repeating_rates_list.append(len(set_1.intersection(set_2)) * 1.0 / min(len(set_1), len(set_2)))
    rr_avg = np.mean(np.array(repeating_rates_list))
    rr_std = np.std(np.array(repeating_rates_list), ddof=1)
    print("Repeating rate for {} is {:.2f}+-{:.2f}% !".format(args.dataset, rr_avg * 100, rr_std * 100))

    # save results
    datasets = ["cora", "pubmed", "citeseer", "amazon_photo", "amazon_computers",
                "coauthors_physics", "coauthors_cs"]
    items = ["acc_mean", "acc_std", "rr_mean", "rr_std"]
    # check if the file exists
    outputs_file = os.path.join(outputs_subdir, 'gnn_n_3layerMLP.csv')
    if os.path.exists(outputs_file):
        # read from file
        results_df = pd.read_csv(outputs_file, index_col=0, header=0)
    else:
        # new array to store results
        # row: dataset  column: item
        results_all_dataset = np.zeros([len(datasets), len(items)])
        results_df = pd.DataFrame(results_all_dataset, index=datasets, columns=items)
    results_df.loc[args.dataset, "acc_mean"] = acc_avg
    results_df.loc[args.dataset, "acc_std"] = acc_std
    results_df.loc[args.dataset, "rr_mean"] = rr_avg
    results_df.loc[args.dataset, "rr_std"] = rr_std
    results_df.to_csv(outputs_file)

    correctly_classified_nodes_list_file = os.path.join(
        outputs_subdir, 'nodes_list_3layerMLP_' + args.dataset + '.npy')
    with open(correctly_classified_nodes_list_file, 'wb') as f:
        np.save(f, correctly_classified_nodes_list)
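# --- Minimal sketch of the repeating-rate measure used above (added; toy sets, not experiment output) ---
# For two runs, the repeating rate is |S1 ∩ S2| / min(|S1|, |S2|), where S1 and S2 are the
# sets of correctly classified test nodes in the two runs.
set_1 = {0, 1, 2, 3}  # hypothetical correctly classified nodes in run i
set_2 = {2, 3, 4}     # hypothetical correctly classified nodes in run j
rr = len(set_1.intersection(set_2)) * 1.0 / min(len(set_1), len(set_2))
print(rr)  # 2 / 3 ≈ 0.667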
def generate(symbols_folder, background_folder, batch_per_class, label_binarizer,
             target_height=32, target_width=32):
    """
    :param symbols_folder: path to the folder containing symbols .npz files
    :param background_folder: path to the folder containing background .npz files
    :param batch_per_class: number of examples of every symbol in a single batch
    :param label_binarizer: sklearn.preprocessing.LabelBinarizer
    :param target_height: target height of the generated symbols' images
    :param target_width: target width of the generated symbols' images
    :return: (batch of symbols images, batch of corresponding labels)
    """
    symbols, background = load_dataset_by_symbol(symbols_folder), load_dataset(background_folder)
    for s in symbols:
        s_X, s_y = symbols[s]
        new_s_X = []
        for i in range(s_X.shape[0]):
            new_s_X.append(preprocess_symbol(s_X[i], 3))
        symbols[s] = np.array(new_s_X), s_y
    bg_X, bg_y = background['X'], background['y']
    new_bg_X = []
    for i in range(bg_X.shape[0]):
        new_bg_X.append(preprocess_background(bg_X[i]))
    bg_X = np.array(new_bg_X)  # use the preprocessed backgrounds

    # Image format
    # bg_X, bg_y
    # symbols[symbol_name -> symbol_X, symbol_y]
    label_binarizer.fit(np.array([s for s in symbols] + ["background"]))

    batch_nr = 0
    while True:
        batch_X, batch_y = [], []
        batch_bg = bg_X[np.random.randint(0, bg_X.shape[0], batch_per_class * len(symbols))]
        for s in symbols:
            symbol_X, symbol_y = symbols[s]
            m = symbol_X.shape[0]
            start_ind = (batch_nr * batch_per_class) % m
            end_ind = start_ind + batch_per_class
            if end_ind <= m:
                batch_X.append(symbol_X[start_ind:end_ind])
            else:
                batch_X.append(symbol_X[start_ind:])
                batch_X.append(symbol_X[0:end_ind % m])
            batch_y.append(symbol_y[0:batch_per_class])
        batch_X, batch_y = np.concatenate(batch_X), np.concatenate(batch_y)
        batch_X = merge_symbols_and_backgrounds(batch_X, batch_bg)

        # add background examples
        random_bg_ind = np.random.randint(0, bg_X.shape[0], batch_per_class)  # mb batch_per_class * len(symbols)
        bg_batch_x, bg_batch_y = bg_X[random_bg_ind], bg_y[random_bg_ind]
        batch_X, batch_y = np.concatenate([batch_X, bg_batch_x]), np.concatenate([batch_y, bg_batch_y])

        # one-hot encode the y vector
        batch_y = label_binarizer.transform(batch_y)
        # reshape to fit the Keras format
        batch_y = batch_y.reshape(batch_y.shape[0], 1, 1, batch_y.shape[1])
        # rescale accordingly
        batch_X = rescale_dataset(batch_X, target_height, target_width)
        yield shuffle_dataset(batch_X, batch_y)
        batch_nr += 1
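# --- Hedged usage sketch for the generator above (added; model, paths, and step counts are assumptions) ---
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
gen = generate("path/to/symbols_npz", "path/to/background_npz",
               batch_per_class=8, label_binarizer=lb,
               target_height=32, target_width=32)
# The generator yields (batch_X, batch_y) indefinitely, with batch_y shaped
# (N, 1, 1, num_classes), so a fully convolutional Keras model could consume it directly, e.g.:
#     model.fit(gen, steps_per_epoch=100, epochs=5)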