Example No. 1
import numpy as np

# Assumes the repo's Planetoid-style load_data helper is in scope.
def data_partition_random(dataset_dir, dataset_name, label_n_per_class):
    # Random data partition: sample label_n_per_class labelled nodes per
    # class for training, plus fixed-size test and validation sets.
    test_set_n = 1000
    val_set_n = 500
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels = load_data(dataset_name, dataset_dir)

    n = len(y_train)     # number of nodes
    k = len(y_train[0])  # number of classes

    labels = one_hot_labels.argmax(axis=1)

    train_index_new = np.zeros(k * label_n_per_class, dtype=int)
    train_mask_new = np.zeros(n, dtype=bool)
    val_mask_new = np.zeros(n, dtype=bool)
    test_mask_new = np.zeros(n, dtype=bool)

    y_train_new = np.zeros((n, k))
    y_val_new = np.zeros((n, k))
    y_test_new = np.zeros((n, k))

    # Indices of the nodes belonging to each class.
    class_index_dict = {}
    for i in range(k):
        class_index_dict[i] = np.where(labels == i)[0]

    # Sample label_n_per_class training nodes uniformly from each class.
    for i in range(k):
        class_index = class_index_dict[i]
        train_index_one_class = np.random.choice(class_index, label_n_per_class, replace=False)
        train_index_new[i * label_n_per_class:(i + 1) * label_n_per_class] = train_index_one_class

    # Draw the test and validation sets from the remaining nodes.
    train_index_new = list(train_index_new)
    test_val_potential_index = list(set(range(n)) - set(train_index_new))
    test_index_new = np.random.choice(test_val_potential_index, test_set_n, replace=False)
    potential_val_index = list(set(test_val_potential_index) - set(test_index_new))
    val_index_new = np.random.choice(potential_val_index, val_set_n, replace=False)

    train_mask_new[train_index_new] = True
    val_mask_new[val_index_new] = True
    test_mask_new[test_index_new] = True

    # Rebuild the one-hot label matrices restricted to each split.
    for i in train_index_new:
        y_train_new[i][labels[i]] = 1

    for i in val_index_new:
        y_val_new[i][labels[i]] = 1

    for i in test_index_new:
        y_test_new[i][labels[i]] = 1

    return adj, features, y_train_new, y_val_new, y_test_new, train_mask_new, val_mask_new, test_mask_new, one_hot_labels
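
A minimal usage sketch for Example No. 1, assuming a Planetoid-style load_data (as in Kipf's GCN repo) and a local "datasets" directory; the dataset name, path, and label count are placeholders:

np.random.seed(42)  # the partition is random, so seed for reproducibility
(adj, features, y_train, y_val, y_test,
 train_mask, val_mask, test_mask, one_hot_labels) = data_partition_random(
    dataset_dir="datasets", dataset_name="cora", label_n_per_class=5)

print("labelled train nodes:", train_mask.sum())             # k * 5
print("val / test nodes:", val_mask.sum(), test_mask.sum())  # 500 / 1000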
Example No. 2
import numpy as np

# Assumes the repo's load_data and load_movie helpers are in scope.
def data_partition_fix(dataset_dir, dataset_name, label_n_per_class):
    # Data partition using the official split from Kipf's original GCN,
    # keeping only label_n_per_class labelled training nodes per class.
    if dataset_name == 'movie':
        adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels = load_movie(
            dataset_name, dataset_dir)
    else:
        adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels = load_data(
            dataset_name, dataset_dir)
    k = len(y_train[0])  # number of classes
    train_set_index = np.where(train_mask)[0]
    # print("train_set_index", train_set_index)
    train_set_labels = labels[train_set_index]
    train_node_index = {}
    for i in range(k):
        train_node_index[i] = np.where(train_set_labels[:, i] == 1)[0]

    for i in range(k):
        # train_node_index holds positions within the train set; map them
        # back to global node indices before editing the mask and labels.
        hide_index = train_set_index[train_node_index[i][label_n_per_class:]]
        print("The training set index for class {} is {}".format(
            i, train_set_index[train_node_index[i][:label_n_per_class]]))
        train_mask[hide_index] = False
        y_train[hide_index] = 0

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels
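
Note that np.where over train_set_labels yields positions relative to the train set, not global node IDs; in Kipf's official Planetoid split the training nodes happen to come first, so the two coincide, but the mapping through train_set_index above keeps the function correct for arbitrary splits. A toy sketch of the distinction (made-up numbers, not real data):

import numpy as np

train_mask = np.array([False, True, False, True, True])
train_set_index = np.where(train_mask)[0]   # global IDs: [1, 3, 4]
relative = np.array([0, 2])                 # positions within the train set
global_ids = train_set_index[relative]      # -> [1, 4]
train_mask[global_ids] = False              # safe for any split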
Example No. 3
import numpy as np

# Assumes the repo's load_data helper is in scope.
def data_partition_fix(dataset_dir, dataset_name, label_n_per_class):
    # Data partition using the official split from Kipf's original GCN;
    # this variant recovers integer labels from the one-hot matrix.
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels = load_data(
        dataset_name, dataset_dir)
    k = len(y_train[0])  # number of classes
    train_set_index = np.where(train_mask)[0]
    labels = one_hot_labels.argmax(axis=1)
    train_set_labels = labels[train_set_index]
    train_node_index = {}
    for i in range(k):
        train_node_index[i] = np.where(train_set_labels == i)[0]

    for i in range(k):
        # train_node_index holds positions within the train set; map them
        # back to global node indices before editing the mask and labels.
        hide_index = train_set_index[train_node_index[i][label_n_per_class:]]
        print("The training set index for class {} is {}".format(
            i, train_set_index[train_node_index[i][:label_n_per_class]]))
        train_mask[hide_index] = False
        y_train[hide_index] = 0

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels
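
A hedged usage example for this variant; the dataset name and directory are placeholders, and load_data is assumed to follow Kipf's loader interface:

outputs = data_partition_fix("datasets", "citeseer", label_n_per_class=10)
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, one_hot_labels = outputs

# Each class should now contribute at most label_n_per_class training labels.
assert y_train[train_mask].sum(axis=0).max() <= 10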
Example No. 4
import numpy as np
import scipy as sp
import scipy.sparse  # the snippet accesses sp.sparse.*
import tensorflow as tf  # TF 1.x: provides tf.logging
from functools import partial
from sklearn.neighbors import kneighbors_graph

# Repo helpers assumed in scope: load_polblogs, load_adjacency_from_file,
# load_data, recursive_stratified_shuffle_split, balanced_data_split,
# load_split, indices_to_mask, mask_values, add_val_to_train.
def get_data(dataset_name, random_split, split_sizes, random_split_seed,
             add_val=True, add_val_seed=1, p_val=0.5,
             adjacency_filename=None,
             use_knn_graph=False,
             knn_metric=None,
             knn_k=None,
             balanced_split=False,
             samples_per_class=20):

    if dataset_name == 'polblogs':
        X, y, _ = load_polblogs()
        A = load_adjacency_from_file("datasets/polblogs/polblogs_graph.gpickle")
        X = sp.sparse.lil_matrix(X)
    else:
        A, X, y_train, y_val, y_test, mask_train, mask_val, mask_test, y = load_data(dataset_name, "datasets")

    if use_knn_graph:
        tf.logging.info("Using KNN graph")
        A = kneighbors_graph(X, knn_k, metric=knn_metric)

        # Consistent with our implementation of only considering the
        # lower triangular part: keep it, then symmetrise.
        A = sp.sparse.tril(A, k=-1)
        A = A + A.T

    if adjacency_filename:
        A = load_adjacency_from_file(adjacency_filename)

        # Consistent with our implementation of only considering the
        # lower triangular part: keep it, then symmetrise.
        A = sp.sparse.tril(A, k=-1)
        A = A + A.T

    n, d = X.shape
    _, k = y.shape

    tf.logging.info("Dataset has {} samples, dimensionality {}".format(n, d))
    tf.logging.info("Targets belong to {} classes".format(k))

    if random_split:
        print("Using a random split")
        random_state = np.random.RandomState(random_split_seed)
        split = recursive_stratified_shuffle_split(
            sizes=split_sizes, random_state=random_state
        )
        indices = list(split(X, y))
    elif balanced_split:
        indices = balanced_data_split(X, y, samples_per_class, random_state=random_split_seed)
    else:  # fixed split
        indices = load_split(dataset_name)

    tf.logging.info(
        "Split resulted in "
        "{} training, "
        "{} validation, "
        "{} test samples.".format(*map(len, indices))
    )

    [mask_train, mask_val, mask_test] = masks = list(
        map(partial(indices_to_mask, size=n), indices)
    )

    y_train, y_val, y_test = map(partial(mask_values, y), masks)


    if add_val:
        mask_train, mask_val = add_val_to_train(mask_train, mask_val, add_val_seed, p_val)
        masks = [mask_train, mask_val, mask_test]
        y_train, y_val, y_test = map(partial(mask_values, y), masks)


    print("**********************************************************************************************")
    print("train size: {} val size: {} test size: {}".format(np.sum(mask_train), np.sum(mask_val), np.sum(mask_test)))
    print("**********************************************************************************************")

    return X, y, A, mask_train, mask_val, mask_test, y_train, y_val, y_test
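
A sketch of how get_data might be invoked for a random split; every argument value below is illustrative rather than taken from the repo's configs, and the split_sizes format is an assumption about what recursive_stratified_shuffle_split consumes:

X, y, A, mask_train, mask_val, mask_test, y_train, y_val, y_test = get_data(
    dataset_name="cora",
    random_split=True,
    split_sizes=(0.1, 0.1),   # assumed: fractions for train and validation
    random_split_seed=0,
    add_val=False,            # keep validation nodes out of the training set
)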



# def get_data_incompatible(dataset_name, random_split, split_sizes, random_split_seed,
#              add_val=True, add_val_seed=1, p_val=0.5,
#              adjacency_filename=None,
#              balanced_split=False, samples_per_class=20):
#
#     tf.logging.info("Loading '{}' dataset...".format(dataset_name))
#     loader = DATASET_LOADERS[dataset_name]
#     X, y, A = loader()
#
#     if adjacency_filename:
#         A = load_adjacency_from_file(adjacency_filename)
#
#     X = normalize(X, norm="l1", axis=1)
#
#     n, d = X.shape
#     _, k = y.shape
#
#     tf.logging.info("Dataset has {} samples, dimensionality {}".format(n, d))
#     tf.logging.info("Targets belong to {} classes".format(k))
#
#     if random_split:
#         random_state = np.random.RandomState(random_split_seed)
#         split = recursive_stratified_shuffle_split(
#             sizes=split_sizes, random_state=random_state
#         )
#         indices = list(split(X, y))
#     elif balanced_split:
#         indices = balanced_data_split(X, y, samples_per_class, random_state=random_split_seed)
#     else:  # fixed split
#         indices = load_split(dataset_name)
#
#     tf.logging.info(
#         "Split resulted in "
#         "{} training, "
#         "{} validation, "
#         "{} test samples.".format(*map(len, indices))
#     )
#
#     [mask_train, mask_val, mask_test] = masks = list(
#         map(partial(indices_to_mask, size=n), indices)
#     )
#
#     y_train, y_val, y_test = map(partial(mask_values, y), masks)
#
#     # A = A.toarray()
#
#     if (add_val):
#         mask_train, mask_val = add_val_to_train(mask_train, mask_val, add_val_seed, p_val)




    print("**********************************************************************************************")
    print("train size: {} val size: {} test size: {}".format(np.sum(mask_train), np.sum(mask_val), np.sum(mask_test)))
    print("**********************************************************************************************")

    return X, y, A, mask_train, mask_val, mask_test, y_train, y_val, y_test