Example #1
0
def main(args):

    # check if the 'outputs' directory exists; if not, create it
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # read parameters from config file
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # array to store the per-run accuracies
    acc_array = np.zeros(args.exp_times)

    for i in range(args.exp_times):
        print("********** BUILD NETWORK: {} Iteration **********".format(i))
        # build network
        gcn = GCN(in_feats, h_feats, out_feats).to(device)

        print("********** TRAIN NETWORK: {} Iteration **********".format(i))
        # train network
        best_acc = train_gcn(gcn, g, features, labels, train_mask, test_mask,
                             args)

        acc_array[i] = best_acc  # store in the acc_array

    acc_current_dataset = np.mean(acc_array) * 100
    print("Average accuracy for dataset {} is {}% !".format(
        args.dataset, acc_current_dataset))

    # save results
    if args.fixpoint_loss:
        models = ["GCN", "SSE", "GCN trained with joint loss"]
        datasets = ["cora", "pubmed", "citeseer"]
        # check if the file exists
        outputs_file = os.path.join(os.getcwd(), '../outputs/fixedpoint.csv')
        if os.path.exists(outputs_file):
            # read from file
            acc_df = pd.read_csv(outputs_file, index_col=0, header=0)
        else:
            # new array to store accuracy
            # row: dataset    column: model
            acc_all_dataset = np.array([[81.5, 79.4, 0], [79.0, 75.8, 0],
                                        [70.3, 72.5, 0]])
            acc_df = pd.DataFrame(acc_all_dataset,
                                  index=datasets,
                                  columns=models)

        acc_df.loc[args.dataset,
                   "GCN trained with joint loss"] = acc_current_dataset
        acc_df.to_csv(outputs_file)
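For context, `main` above only relies on `args.dataset`, `args.exp_times`, and `args.fixpoint_loss` (plus whatever `train_gcn` reads from `args`); a minimal command-line wrapper could look like the sketch below. The flag names are taken from their usage in the code, while the defaults and help texts are illustrative assumptions.

# Hypothetical CLI wrapper for the main(args) above; argument names follow
# their usage in the code, defaults are illustrative.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='run the GCN accuracy experiment above')
    parser.add_argument('--dataset', type=str, default='cora',
                        help="expects a matching '../configs/<dataset>.yaml'")
    parser.add_argument('--exp_times', type=int, default=10,
                        help='number of training repetitions to average over')
    parser.add_argument('--fixpoint_loss', action='store_true',
                        help='also update ../outputs/fixedpoint.csv')
    main(parser.parse_args())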
    def evaluate(self, symbols_dev_npz_folder):
        dev_dataset = load_dataset(symbols_dev_npz_folder)
        devX, devY = dev_dataset["X"], dev_dataset["y"]

        devY = devY.reshape(-1, 1)
        devX = devX.reshape(*devX.shape, -1)

        loss_and_metrics = self.model.evaluate(devX,
                                               self.encoder.transform(devY),
                                               batch_size=128)
        return {"loss": loss_and_metrics[0], "accuracy": loss_and_metrics[1]}
    def train(self, symbols_train_npz_folder):
        train_dataset = load_dataset(symbols_train_npz_folder)
        trainX, trainY = train_dataset["X"], train_dataset["y"]

        trainY = trainY.reshape(-1, 1)
        trainX = trainX.reshape(*trainX.shape, -1)

        self.encoder.fit(trainY)
        self.model.fit(trainX,
                       self.encoder.transform(trainY),
                       epochs=1,
                       batch_size=128,
                       validation_split=0.01)
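The `evaluate` and `train` methods above assume an enclosing object that owns `self.model` (a compiled Keras model) and `self.encoder` (something whose `transform` returns dense one-hot targets, which the `reshape(-1, 1)` calls suggest is a scikit-learn `OneHotEncoder`). A minimal sketch of such a wrapper follows; the class name and constructor are assumptions, not part of the original code.

# Hypothetical owner of the train()/evaluate() methods above.
from sklearn.preprocessing import OneHotEncoder


class SymbolClassifier:
    def __init__(self, model):
        self.model = model  # a compiled Keras model
        # dense one-hot targets, matching encoder.transform(...) being fed
        # straight into model.fit / model.evaluate above
        # (on scikit-learn < 1.2 use OneHotEncoder(sparse=False))
        self.encoder = OneHotEncoder(sparse_output=False)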
Example #4
0
def main(args):

    # check if the 'outputs' and 'outputs/identifiability' directories exist; if not, create them
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'identifiability')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # prepare to build network
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # results before averaging over the experiments' iterations
    # store 'identifiability rates'/'repeating rates' for different iterations/layers/models
    # 1st dim.: repeating time  2nd dim.: layer  3rd dim.: model
    identifiability_rates = np.zeros(
        [args.repeat_times, args.max_gcn_layers, args.max_gcn_layers])
    repeating_rates = np.zeros(
        [args.repeat_times, args.max_gcn_layers, args.max_gcn_layers])
    # store 'accuracy'/'accuracy on identifiable nodes'/'accuracy on unidentifiable nodes' for different iterations/models
    # 1st dim.: repeating time  2nd dim.: model
    accuracy = np.zeros([args.repeat_times, args.max_gcn_layers])
    accuracy_id = np.zeros([args.repeat_times, args.max_gcn_layers])
    accuracy_unid = np.zeros([args.repeat_times, args.max_gcn_layers])

    for repeat_time in np.arange(args.repeat_times):
        for gcn_model_layer in np.arange(1, args.max_gcn_layers + 1):
            print("********** EXPERIMENT ITERATION: {} **********".format(
                repeat_time + 1))
            print("********** GCN MODEL: GCN_{}layers **********".format(
                gcn_model_layer))
            print("********** BUILD GCN NETWORK**********")
            gcn = GCN(gcn_model_layer, in_feats, h_feats, out_feats).to(device)

            print("********** EXPERIMENT ITERATION: {} **********".format(
                repeat_time + 1))
            print("********** GCN MODEL: GCN_{}layer **********".format(
                gcn_model_layer))
            print("********** TRAIN GCN NETWORK **********")
            acc = train_gcn(gcn, g, features, labels, train_mask, valid_mask,
                            args)
            accuracy[repeat_time, gcn_model_layer - 1] = acc
            correct_classified_nodes, incorrect_classified_nodes = classify_nodes(
                gcn, g, features, labels)

            print("********** EXPERIMENT ITERATION: {} **********".format(
                repeat_time + 1))
            print("********** GCN MODEL: GCN_{}layer **********".format(
                gcn_model_layer))
            # store nodes that can be identified after the 1st layer
            identifiable_nodes_1layer = set()
            # store nodes that can be identified after the last layer
            identifiable_nodes_last_layer = set()
            for intermediate_layer in np.arange(1, gcn_model_layer + 1):
                # store nodes that can be identified after the current layer
                identifiable_nodes_current_layer = set()
                print("********** INTERMEDIATE LAYER: {} **********".format(
                    intermediate_layer))
                print(
                    "********** BUILD REGRESSION MODEL TO RECOVER INPUT FROM EMBEDDING **********"
                )
                # prepare to build the regression model
                embedding = gcn(
                    g,
                    features)[intermediate_layer].clone().detach().to(device)
                input_features = features.clone().detach().to(device)
                regression_in = embedding.shape[1]
                regression_out = input_features.shape[1]
                regression_h = config['regression_hidden_features_identity']

                regression_model = MLP(regression_in, regression_h,
                                       regression_out)  # regression model

                print("********** EXPERIMENT ITERATION: {} **********".format(
                    repeat_time + 1))
                print("********** GCN MODEL: GCN_{}layer **********".format(
                    gcn_model_layer))
                print("********** INTERMEDIATE LAYER: {} **********".format(
                    intermediate_layer))
                print(
                    "********** TRAIN REGRESSION MODEL TO RECOVER INPUT FROM EMBEDDING **********"
                )
                train_regression(regression_model, embedding, input_features,
                                 train_mask, test_mask, args)

                print("********** EXPERIMENT ITERATION: {} **********".format(
                    repeat_time + 1))
                print("********** GCN MODEL: GCN_{}layer **********".format(
                    gcn_model_layer))
                print("********** INTERMEDIATE LAYER: {} **********".format(
                    intermediate_layer))
                print(
                    "********** K-NN FINDING CORRESPONDING INPUT FEATURES **********"
                )
                num_nodes = g.number_of_nodes()
                nn_list = np.zeros([
                    num_nodes, args.knn
                ])  # indices of the nearest neighbours found for each node
                regression_output = regression_model(embedding)
                for node_ind in range(num_nodes):
                    neighbour_ind = g.adjacency_matrix(
                        transpose=True)[node_ind].to_dense() == 1
                    neighbour_feat = input_features[neighbour_ind]
                    node_regression_output = regression_output[node_ind]
                    # find the k nearest neighbours
                    node_regression_output = node_regression_output.expand_as(
                        neighbour_feat)
                    dist = torch.nn.functional.cosine_similarity(
                        neighbour_feat, node_regression_output)
                    nn = dist.topk(args.knn, largest=False)
                    # record the index of nn
                    nn_list[node_ind] = g.adjacency_matrix(
                        transpose=True)[node_ind]._indices()[0, nn.indices]
                    print('Node_id: {} | Corresponding NN Node_id: {}'.format(
                        node_ind, nn_list[node_ind]))

                print("********** EXPERIMENT ITERATION: {} **********".format(
                    repeat_time + 1))
                print("********** GCN MODEL: GCN_{}layer **********".format(
                    gcn_model_layer))
                print("********** INTERMEDIATE LAYER: {} **********".format(
                    intermediate_layer))
                print(
                    "********** COMPUTE IDENTIFIABILITY/REPEATING RATES **********"
                )
                # compute number of identifiable nodes
                nodes_indices = np.arange(num_nodes)
                nodes_indices_expansion = np.expand_dims(nodes_indices,
                                                         axis=1)
                identifiable_nodes_current_layer.update(nodes_indices[np.any(
                    nodes_indices_expansion.repeat(args.knn,
                                                   axis=1) == nn_list,
                    axis=1)])
                num_identifiable = len(identifiable_nodes_current_layer)

                if intermediate_layer == 1:
                    identifiable_nodes_1layer.update(
                        identifiable_nodes_current_layer)
                if intermediate_layer == gcn_model_layer:
                    # compute accuracy on identifiable nodes and unidentifiable nodes
                    identifiable_nodes_last_layer.update(
                        identifiable_nodes_current_layer)
                    nodes = set(nodes_indices)
                    unidentifiable_nodes_last_layer = nodes.difference(
                        identifiable_nodes_last_layer)
                    id_correct = len(
                        identifiable_nodes_last_layer.intersection(
                            correct_classified_nodes))
                    id_incorrect = len(
                        identifiable_nodes_last_layer.intersection(
                            incorrect_classified_nodes))
                    unid_correct = len(
                        unidentifiable_nodes_last_layer.intersection(
                            correct_classified_nodes))
                    unid_incorrect = len(
                        unidentifiable_nodes_last_layer.intersection(
                            incorrect_classified_nodes))
                    accuracy_id[repeat_time, gcn_model_layer -
                                1] = id_correct * 1.0 / (id_correct +
                                                         id_incorrect)
                    accuracy_unid[repeat_time, gcn_model_layer -
                                  1] = unid_correct * 1.0 / (unid_correct +
                                                             unid_incorrect)
                    print('accuracy on identifiable nodes = {} %'.format(
                        accuracy_id[repeat_time, gcn_model_layer - 1] * 100))
                    print('accuracy on unidentifiable nodes = {} %'.format(
                        accuracy_unid[repeat_time, gcn_model_layer - 1] * 100))

                # compute number of repeating nodes
                num_repeating = len(
                    identifiable_nodes_current_layer.intersection(
                        identifiable_nodes_1layer))

                identifiability_rate = num_identifiable * 1.0 / num_nodes
                if len(identifiable_nodes_1layer) != 0:
                    repeating_rate = num_repeating * 1.0 / len(
                        identifiable_nodes_1layer)
                else:
                    repeating_rate = 0.0
                print('identifiability_rate = {} %'.format(
                    identifiability_rate * 100))
                print('node_repeatability_rate = {} %'.format(repeating_rate *
                                                              100))

                identifiability_rates[repeat_time, intermediate_layer - 1,
                                      gcn_model_layer -
                                      1] = identifiability_rate
                repeating_rates[repeat_time, intermediate_layer - 1,
                                gcn_model_layer - 1] = repeating_rate

    print("********** SAVE RESULT **********")
    # final result
    # dataframe to store 'identifiability rates'/'repeating rates' for different layers/models
    # row:layer  column:model
    models = []  # dataframe column index
    for gcn_model_layer in np.arange(1, args.max_gcn_layers + 1):
        model_name = str(gcn_model_layer) + '-layers ' + 'GCN'
        models.append(model_name)
    identifiability_rates_df = pd.DataFrame(np.mean(identifiability_rates,
                                                    axis=0),
                                            index=np.arange(
                                                1, args.max_gcn_layers + 1),
                                            columns=models)
    repeating_rates_df = pd.DataFrame(np.mean(repeating_rates, axis=0),
                                      index=np.arange(1,
                                                      args.max_gcn_layers + 1),
                                      columns=models)
    accuracy_df = pd.DataFrame(np.mean(accuracy, axis=0),
                               index=models,
                               columns=['accuracy'])
    accuracy_id_unid_array = np.array(
        [np.mean(accuracy_id, axis=0),
         np.mean(accuracy_unid, axis=0)]).transpose()
    accuracy_id_unid_df = pd.DataFrame(accuracy_id_unid_array,
                                       index=models,
                                       columns=[
                                           'accuracy on identifiable nodes',
                                           'accuracy on unidentifiable nodes'
                                       ])

    # save result in csv files
    save_subpath = 'embedding_id_rates_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    identifiability_rates_df.to_csv(save_path)
    save_subpath = 'repeating_rates_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    repeating_rates_df.to_csv(save_path)
    save_subpath = 'accuracy_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    accuracy_df.to_csv(save_path)
    save_subpath = 'accuracy_id_unid_' + args.dataset + '.csv'
    save_path = os.path.join(outputs_subdir, save_subpath)
    accuracy_id_unid_df.to_csv(save_path)
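In the experiment above, a node counts as identifiable at a layer when its own index shows up among the k nearest neighbours found for its recovered input features, and the two reported quantities are then plain set ratios. A toy sketch of just those ratios (the node sets below are invented, not taken from a real run):

# Toy illustration of the two ratios computed in the loop above.
num_nodes = 10
identifiable_nodes_1layer = {0, 2, 3, 7}            # identifiable after layer 1
identifiable_nodes_current_layer = {0, 3, 5, 7, 9}  # identifiable after this layer

identifiability_rate = len(identifiable_nodes_current_layer) / num_nodes  # 0.5
repeating_rate = (len(identifiable_nodes_current_layer
                      & identifiable_nodes_1layer)
                  / len(identifiable_nodes_1layer))  # 3 / 4 = 0.75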
Example #5
0
def main(args):

    # check if the 'outputs' and 'outputs/gnn_n' directories exist; if not, create them
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'gnn_n')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, _, _, _, _, _ = load_dataset(args)
    num_nodes = g.number_of_nodes()

    print("********** READ RESULTS FROM FILE **********")
    nodes_list_100layerGCN_file = os.path.join(
        outputs_subdir, 'nodes_list_100layerGCN_' + args.dataset + '.npy')
    with open(nodes_list_100layerGCN_file, 'rb') as f:
        nodes_list_100layerGCN = np.load(f, allow_pickle=True)

    nodes_list_3layerMLP_file = os.path.join(
        outputs_subdir, 'nodes_list_3layerMLP_' + args.dataset + '.npy')
    with open(nodes_list_3layerMLP_file, 'rb') as f:
        nodes_list_3layerMLP = np.load(f, allow_pickle=True)

    print("********** COMPUTE GNN-N VALUE **********")
    features_probability = np.zeros(num_nodes)
    for i in np.arange(num_nodes):
        correctly_classified_times = 0
        for j in np.arange(args.mlp_exp_times):
            if i in nodes_list_3layerMLP[j]:
                correctly_classified_times += 1
        features_probability[i] = correctly_classified_times * 1.0 / args.mlp_exp_times

    graph_structures_probability = np.zeros(num_nodes)
    for i in np.arange(num_nodes):
        correctly_classified_times = 0
        for j in np.arange(args.gcn_exp_times):
            for k in np.arange(args.gcn_num_random_features):
                if i in nodes_list_100layerGCN[j*args.gcn_num_random_features+k]:
                    correctly_classified_times += 1
        graph_structures_probability[i] = correctly_classified_times * 1.0 / (args.gcn_exp_times * args.gcn_num_random_features)

    gnn_n = np.mean((1-graph_structures_probability)*(1-features_probability))

    # save results
    datasets = ["cora", "pubmed", "citeseer", "amazon_photo", "amazon_computers", "coauthors_physics", "coauthors_cs"]

    # check if the file exists
    outputs_file = os.path.join(outputs_subdir, 'gnn_n.csv')
    if os.path.exists(outputs_file):
        # read from file
        gnn_n_df = pd.read_csv(outputs_file, index_col=0, header=0)
    else:
        # new array to store results
        # row: dataset    column: item
        gnn_n_array = np.zeros([len(datasets), 1])
        gnn_n_df = pd.DataFrame(gnn_n_array, index=datasets, columns=['GNN-N'])

    gnn_n_df.loc[args.dataset, "GNN-N"] = gnn_n
    gnn_n_df.to_csv(outputs_file)
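The GNN-N value computed above is just the mean over nodes of the product of two failure probabilities: one minus the empirical probability of a node being classified correctly from graph structure alone, times one minus the corresponding probability from node features alone. A toy numpy illustration with invented per-node probabilities (not read from the real node lists):

# Toy illustration of the GNN-N computation above.
import numpy as np

graph_structures_probability = np.array([0.9, 0.2, 0.5, 1.0])
features_probability = np.array([0.8, 0.1, 0.5, 0.0])
gnn_n = np.mean((1 - graph_structures_probability) * (1 - features_probability))
# (0.1 * 0.2 + 0.8 * 0.9 + 0.5 * 0.5 + 0.0 * 1.0) / 4 = 0.2475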
Example #6
0
def main(args):

    # check if the 'outputs' and 'outputs/gnn_n' directories exist; if not, create them
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'gnn_n')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # read parameters from config file
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # declarations of variables to save experiment results
    acc_original_features = np.zeros(args.exp_times)
    acc_random_features = np.zeros([args.exp_times, args.num_random_features])
    correctly_classified_nodes_original_features_list = []
    correctly_classified_nodes_random_features_list = []

    for i in range(args.exp_times):
        print("********** BUILD NETWORK: {} Experiment **********".format(i +
                                                                          1))
        # build network
        gcn = GCN(100, in_feats, h_feats, out_feats).to(device)

        print("********** TRAIN NETWORK: {} Experiment **********".format(i +
                                                                          1))
        # train network
        _ = train_gcn(gcn, g, features, labels, train_mask, valid_mask, args)

        print(
            "********** TEST WITH ORIGINAL FEATURES: {} Experiment **********".
            format(i + 1))
        # test with original features
        acc_original_features[
            i], correctly_classified_nodes_original_features = evaluate_and_classify_nodes_gcn(
                gcn, g, features, labels, test_mask)
        correctly_classified_nodes_original_features_list.append(
            correctly_classified_nodes_original_features)
        print("Test accuracy with original features: {:.2f}% !".format(
            acc_original_features[i] * 100))

        print("********** TEST WITH RANDOM FEATURES: {} Experiment **********".
              format(i + 1))
        # test with random features
        for j in range(args.num_random_features):
            acc_random_features[
                i,
                j], correctly_classified_nodes_random_features = evaluate_and_classify_nodes_with_random_features_gcn(
                    gcn, g, features, labels, test_mask)
            correctly_classified_nodes_random_features_list.append(
                correctly_classified_nodes_random_features)
            print("Test accuracy with random features {}: {:.2f}% !".format(
                j + 1, acc_random_features[i, j] * 100))

    print("********** COMPUTE AVERAGE ACCURACY **********")
    acc_mean_original_features = np.mean(acc_original_features)
    acc_std_original_features = np.std(acc_original_features, ddof=1)
    acc_mean_random_features = np.mean(acc_random_features)
    acc_std_random_features = np.std(acc_random_features, ddof=1)
    print(
        "Average accuracy with original features is {:.2f}+-{:.2f}% !".format(
            acc_mean_original_features * 100, acc_std_original_features * 100))
    print("Average accuracy with random features is {:.2f}+-{:.2f}% !".format(
        acc_mean_random_features * 100, acc_std_random_features * 100))

    print("********** COMPUTE R-RR **********")
    r_rr_list = []
    for i in np.arange(args.exp_times):
        for j in np.arange(args.num_random_features):
            for k in np.arange(j + 1, args.num_random_features):
                set_1 = correctly_classified_nodes_random_features_list[
                    i * args.num_random_features + j]
                set_2 = correctly_classified_nodes_random_features_list[
                    i * args.num_random_features + k]
                r_rr_list.append(
                    len(set_1.intersection(set_2)) * 1.0 /
                    min(len(set_1), len(set_2)))
    r_rr_mean = np.mean(np.array(r_rr_list))
    r_rr_std = np.std(np.array(r_rr_list), ddof=1)
    print("R-RR is {:.2f}+-{:.2f}% !".format(r_rr_mean * 100, r_rr_std * 100))

    print("********** COMPUTE RO-RR **********")
    ro_rr_array = np.zeros(args.exp_times)
    for i in np.arange(args.exp_times):
        set_1 = correctly_classified_nodes_original_features_list[i]
        set_2 = correctly_classified_nodes_random_features_list[
            i * args.num_random_features]
        ro_rr_array[i] = len(set_1.intersection(set_2)) * 1.0 / min(
            len(set_1), len(set_2))
    ro_rr_mean = np.mean(ro_rr_array)
    ro_rr_std = np.std(ro_rr_array, ddof=1)
    print("RO-RR is {:.2f}+-{:.2f}% !".format(ro_rr_mean * 100,
                                              ro_rr_std * 100))

    print("********** COMPUTE TT-RR **********")
    tt_rr_array = np.eye(args.exp_times)
    for i in np.arange(args.exp_times):
        for j in np.arange(i + 1, args.exp_times):
            set_1 = correctly_classified_nodes_random_features_list[
                i * args.num_random_features]
            set_2 = correctly_classified_nodes_random_features_list[
                j * args.num_random_features]
            overlap = len(set_1.intersection(set_2)) * 1.0 / min(
                len(set_1), len(set_2))
            tt_rr_array[i, j] = overlap
            tt_rr_array[j, i] = overlap

    # save results
    datasets = [
        "cora", "pubmed", "citeseer", "amazon_photo", "amazon_computers",
        "coauthors_physics", "coauthors_cs"
    ]
    items = [
        "acc mean original features", "acc std original features",
        "acc mean random features", "acc std random features", "r_rr mean",
        "r_rr std", "ro_rr mean", "ro_rr std"
    ]

    # check if the file exists
    outputs_file = os.path.join(outputs_subdir, 'gnn_n_100layerGCN.csv')
    if os.path.exists(outputs_file):
        # read from file
        results_df = pd.read_csv(outputs_file, index_col=0, header=0)
    else:
        # new array to store results
        # row: dataset    column: item
        results_all_dataset = np.zeros([len(datasets), len(items)])
        results_df = pd.DataFrame(results_all_dataset,
                                  index=datasets,
                                  columns=items)

    results_df.loc[args.dataset,
                   "acc mean original features"] = acc_mean_original_features
    results_df.loc[args.dataset,
                   "acc std original features"] = acc_std_original_features
    results_df.loc[args.dataset,
                   "acc mean random features"] = acc_mean_random_features
    results_df.loc[args.dataset,
                   "acc std random features"] = acc_std_random_features
    results_df.loc[args.dataset, "r_rr mean"] = r_rr_mean
    results_df.loc[args.dataset, "r_rr std"] = r_rr_std
    results_df.loc[args.dataset, "ro_rr mean"] = ro_rr_mean
    results_df.loc[args.dataset, "ro_rr std"] = ro_rr_std
    results_df.to_csv(outputs_file)

    tt_rr_file = os.path.join(outputs_subdir, 'tt_rr_' + args.dataset + '.npy')
    with open(tt_rr_file, 'wb') as f:
        np.save(f, tt_rr_array)

    correctly_classified_nodes_random_features_list_file = os.path.join(
        outputs_subdir, 'nodes_list_100layerGCN_' + args.dataset + '.npy')
    with open(correctly_classified_nodes_random_features_list_file, 'wb') as f:
        np.save(f, correctly_classified_nodes_random_features_list)
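R-RR, RO-RR, and TT-RR above all reuse the same pairwise measure: the intersection of two sets of correctly classified nodes divided by the size of the smaller set. Factored out as a hypothetical helper (the function name is not part of the original code), it is simply:

def overlap_rate(set_1, set_2):
    """Overlap of two sets of correctly classified node ids, normalised by
    the smaller set; this is the ratio used for R-RR, RO-RR and TT-RR above."""
    return len(set_1 & set_2) / min(len(set_1), len(set_2))


# e.g. overlap_rate({1, 2, 3, 4}, {3, 4, 5}) == 2 / 3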
Example #7
0
def main(args):

    # check if the 'outputs' and 'outputs/gnn_n' directories exist; if not, create them
    outputs_dir = os.path.join(os.getcwd(), '../outputs')
    outputs_subdir = os.path.join(outputs_dir, 'gnn_n')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)
    if not os.path.exists(outputs_subdir):
        os.makedirs(outputs_subdir)

    # load dataset
    print("********** LOAD DATASET **********")
    g, features, labels, train_mask, valid_mask, test_mask = load_dataset(args)

    # read parameters from config file
    path = '../configs/' + args.dataset + '.yaml'
    config_file = os.path.join(os.getcwd(), path)
    with open(config_file, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    h_feats = config['hidden_features']
    in_feats = features.shape[1]
    out_feats = torch.max(labels).item() + 1

    # declarations of variables to save experiment results
    acc = np.zeros(args.exp_times)
    correctly_classified_nodes_list = []

    for i in range(args.exp_times):
        print("********** BUILD NETWORK: {} Experiment **********".format(i +
                                                                          1))
        # build network
        mlp = MLP(3, in_feats, h_feats, out_feats).to(device)

        print("********** TRAIN NETWORK: {} Experiment **********".format(i +
                                                                          1))
        # train network
        _ = train_mlp(mlp, features, labels, train_mask, valid_mask, args)

        print("********** TEST MLP: {} Experiment **********".format(i + 1))
        # test with original features
        acc[i], correctly_classified_nodes = evaluate_and_classify_nodes_mlp(
            mlp, features, labels, test_mask)
        correctly_classified_nodes_list.append(correctly_classified_nodes)
        print("Test accuracy: {:.2f}% !".format(acc[i] * 100))

    print("********** COMPUTE RAVERAGE ACCURACY **********")
    acc_avg = np.mean(acc)
    acc_std = np.std(acc, ddof=1)
    print("Accuracy for {} is {:.2f}+-{:.2f}% !".format(
        args.dataset, acc_avg * 100, acc_std * 100))

    print("********** COMPUTE AVERAGE REPEATING RATE **********")
    repeating_rates_list = []
    for i in np.arange(args.exp_times):
        for j in np.arange(i + 1, args.exp_times):
            set_1 = correctly_classified_nodes_list[i]
            set_2 = correctly_classified_nodes_list[j]
            repeating_rates_list.append(
                len(set_1.intersection(set_2)) * 1.0 /
                min(len(set_1), len(set_2)))
    rr_avg = np.mean(np.array(repeating_rates_list))
    rr_std = np.std(np.array(repeating_rates_list), ddof=1)
    print("Repeating rate for {} is {:.2f}+-{:.2f}% !".format(
        args.dataset, rr_avg * 100, rr_std * 100))

    # save results
    datasets = [
        "cora", "pubmed", "citeseer", "amazon_photo", "amazon_computers",
        "coauthors_physics", "coauthors_cs"
    ]
    items = ["acc_mean", "acc_std", "rr_mean", "rr_std"]
    # check if the file exists
    outputs_file = os.path.join(outputs_subdir, 'gnn_n_3layerMLP.csv')
    if os.path.exists(outputs_file):
        # read from file
        results_df = pd.read_csv(outputs_file, index_col=0, header=0)
    else:
        # new array to store results
        # row: dataset    column: item
        results_all_dataset = np.zeros([len(datasets), len(items)])
        results_df = pd.DataFrame(results_all_dataset,
                                  index=datasets,
                                  columns=items)

    results_df.loc[args.dataset, "acc_mean"] = acc_avg
    results_df.loc[args.dataset, "acc_std"] = acc_std
    results_df.loc[args.dataset, "rr_mean"] = rr_avg
    results_df.loc[args.dataset, "rr_std"] = rr_std
    results_df.to_csv(outputs_file)

    correctly_classified_nodes_list_file = os.path.join(
        outputs_subdir, 'nodes_list_3layerMLP_' + args.dataset + '.npy')
    with open(correctly_classified_nodes_list_file, 'wb') as f:
        np.save(f, correctly_classified_nodes_list)
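The node lists saved here (and by the 100-layer GCN script above) are Python lists of sets, so numpy stores them as pickled object arrays; that is why the GNN-N script reads them back with `allow_pickle=True`. A minimal round-trip sketch, with an illustrative file name:

# Minimal round-trip of a list of node-id sets through a .npy file.
import numpy as np

nodes_list = [{0, 3, 5}, {1, 3}, {2, 5, 7}]
with open('nodes_list_demo.npy', 'wb') as f:
    np.save(f, np.array(nodes_list, dtype=object))
with open('nodes_list_demo.npy', 'rb') as f:
    restored = np.load(f, allow_pickle=True)
assert set(restored[1]) == {1, 3}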
def generate(symbols_folder,
             background_folder,
             batch_per_class,
             label_binarizer,
             target_height=32,
             target_width=32):
    """
    :param symbols_folder: path to the folder containing symbols .npz files
    :param background_folder: path to the folder containing background .npz files
    :param batch_per_class: number of every symbol example in a single batch
    :param label_binarizer: sklearn.binarizer
    :param target_height: target height of the generated symbols' images
    :param target_width: target width of the generated symbols' images
    :return: (batch of symbols images, batch of corresponding labels)
    """
    symbols, background = load_dataset_by_symbol(symbols_folder), load_dataset(
        background_folder)
    for s in symbols:
        s_X, s_y = symbols[s]
        new_s_X = []
        for i in range(s_X.shape[0]):
            new_s_X.append(preprocess_symbol(s_X[i], 3))
        symbols[s] = np.array(new_s_X), s_y

    bg_X, bg_y = background['X'], background['y']
    new_bg_X = []
    for i in range(bg_X.shape[0]):
        new_bg_X.append(preprocess_background(bg_X[i]))
    bg_X = np.array(new_bg_X)

    # Image format
    # bg_X, bg_y
    # symbols[symbol_name -> symbol_X, symbol_y]

    label_binarizer.fit(np.array([s for s in symbols] + ["background"]))
    batch_nr = 0
    while True:
        batch_X, batch_y = [], []
        batch_bg = bg_X[np.random.randint(0, bg_X.shape[0],
                                          batch_per_class * len(symbols))]
        for s in symbols:
            symbol_X, symbol_y = symbols[s]
            m = symbol_X.shape[0]
            start_ind = (batch_nr * batch_per_class) % m
            end_ind = start_ind + batch_per_class
            if end_ind <= m:
                batch_X.append(symbol_X[start_ind:end_ind])
            else:
                batch_X.append(symbol_X[start_ind:])
                batch_X.append(symbol_X[0:end_ind % m])
            batch_y.append(symbol_y[0:batch_per_class])
        batch_X, batch_y = np.concatenate(batch_X), np.concatenate(batch_y)

        batch_X = merge_symbols_and_backgrounds(batch_X, batch_bg)

        # add background
        random_bg_ind = np.random.randint(
            0, bg_X.shape[0],
            batch_per_class)  # maybe batch_per_class * len(symbols) instead
        bg_batch_x, bg_batch_y = bg_X[random_bg_ind], bg_y[random_bg_ind]

        batch_X, batch_y = np.concatenate(
            [batch_X, bg_batch_x]), np.concatenate([batch_y, bg_batch_y])

        # one hot encode y vector
        batch_y = label_binarizer.transform(batch_y)

        # reshape to fit the keras format
        batch_y = batch_y.reshape(batch_y.shape[0], 1, 1, batch_y.shape[1])

        # rescale accordingly
        batch_X = rescale_dataset(batch_X, target_height, target_width)

        yield shuffle_dataset(batch_X, batch_y)

        batch_nr += 1
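Since `generate` is an endless generator yielding `(batch_X, batch_y)` pairs, it can be stepped manually or handed to Keras' generator-based training. A usage sketch under those assumptions; the folder paths and the commented-out model call are placeholders, and the `LabelBinarizer` choice follows the docstring's "sklearn.binarizer" hint:

# Hypothetical usage of generate(); paths are placeholders.
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
train_gen = generate('data/symbols_train', 'data/background_train',
                     batch_per_class=8, label_binarizer=lb)
batch_X, batch_y = next(train_gen)  # one shuffled (images, one-hot labels) batch
# model.fit(train_gen, steps_per_epoch=100, epochs=5)  # with a compiled Keras model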