Example #1
def load_semi_supervised(n_labeled=100, max_parse_size=1012, seed=123456):
    """
    Load the AG News dataset where only a fraction of the data points are labeled. The
    labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param max_parse_size: The fixed length to which each string is padded; the longest string requires 1012.
    :param seed: The seed for the pseudo random shuffle of data points.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    train_set, test_set, valid_set = _load(max_parse_size)

    rng = np.random.RandomState(seed=seed)
    n_classes = train_set[1].max() + 1
    print(n_classes)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # shuffle data
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-n_classes], train_collect[:, -n_classes:])

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
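
Every example on this page leans on the same helpers, create_semi_supervised and pad_targets, which the listings do not include. The sketch below is only an inference from how they are called (per-class sampling, one-hot targets); the actual implementations in the source projects may differ.

import numpy as np

def create_semi_supervised(dataset, n_labeled, rng):
    # Assumed behaviour: draw n_labeled / n_classes points from each class for
    # the labeled split, one-hot encode the targets, and return the remainder
    # as the unlabeled split.
    x, y = dataset
    n_classes = int(y.max()) + 1
    per_class = n_labeled // n_classes
    labeled_idx = []
    for c in range(n_classes):
        idx = np.where(y == c)[0]
        rng.shuffle(idx)
        labeled_idx.extend(idx[:per_class])
    labeled_idx = np.asarray(labeled_idx)
    unlabeled_idx = np.setdiff1d(np.arange(len(y)), labeled_idx)
    y_onehot = np.eye(n_classes)[y.astype(int)]
    return (x[labeled_idx], y_onehot[labeled_idx],
            x[unlabeled_idx], y_onehot[unlabeled_idx])

def pad_targets(dataset):
    # Assumed behaviour: expand integer targets to one-hot rows so every split
    # shares the (features, one-hot targets) layout used for the train set.
    x, y = dataset
    n_classes = int(y.max()) + 1
    return x, np.eye(n_classes)[y.astype(int)]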
Example #2
File: mnist.py Project: ylfzr/ADGM
def load_semi_supervised(n_labeled=100,
                         filter_std=0.1,
                         seed=123456,
                         train_valid_combine=False):
    """
    Load the MNIST dataset where only a fraction of the data points are labeled. The
    labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param filter_std: The standard deviation threshold for keeping features.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param train_valid_combine: Whether the train set and validation set should be combined.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    train_set, valid_set, test_set = data_generator()
    print('data ready')

    # Combine the train set and validation set so the documented
    # train_valid_combine flag is honoured.
    if train_valid_combine:
        train_set = (np.append(train_set[0], valid_set[0], axis=0),
                     np.append(train_set[1], valid_set[1], axis=0))
    rng = np.random.RandomState(seed=seed)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    # Filter out the features with a low standard deviation.
    if filter_std > 0.0:
        idx_keep = np.std(x_u, axis=0) > filter_std
        x_l, x_u = x_l[:, idx_keep], x_u[:, idx_keep]
        valid_set = (valid_set[0][:, idx_keep], valid_set[1])
        test_set = (test_set[0][:, idx_keep], test_set[1])

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # shuffle data
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
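
A minimal usage sketch for this MNIST loader, assuming data_generator() is importable from the same module; the shape comments reflect the default 10-class setup with 10 labeled points per class:

train_u, train_l, test, valid = load_semi_supervised(n_labeled=100, filter_std=0.1)
x_l, y_l = train_l            # 100 labeled points, evenly spread over 10 classes
x_u, y_u = train_u            # remaining points, shuffled, targets one-hot
print(x_l.shape, y_l.shape)   # e.g. (100, n_kept_features) (100, 10)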
Example #3
def load_semi_supervised(n_labeled=100,
                         cut_off=1000,
                         seed=123456,
                         conv=False,
                         extra=False):
    """
    Load the SVHN dataset where only a fraction of the data points are labeled. The
    labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant so that the dataset size is divisible by it.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param conv: If True, reshape the images for convolutional input; otherwise keep them vectorized.
    :param extra: Whether to include the extra SVHN set.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download(extra)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)
    train_x, train_t = train_set

    # shuffle data
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        train_set = _gen_conv(train_set)
        test_set = _gen_conv(test_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
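
cut_off_dataset is another helper that is not shown on this page. Judging by its call site, it trims the shuffled training set so its size becomes a multiple of cut_off (for example, so every mini-batch is full). A plausible sketch, assumed rather than taken from the source:

def cut_off_dataset(dataset, cut_off, rng):
    # Keep a random subset whose size is the largest multiple of cut_off.
    x, y = dataset
    n = (x.shape[0] // cut_off) * cut_off
    keep = rng.permutation(x.shape[0])[:n]
    return x[keep], y[keep]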
Example #4
def load_semi_supervised(n_labeled=100, filter_std=0.1, seed=123456, train_valid_combine=False):
    """
    Load the MNIST dataset where only a fraction of the data points are labeled. The
    labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param filter_std: The standard deviation threshold for keeping features.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param train_valid_combine: Whether the train set and validation set should be combined.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    train_set, test_set, valid_set = _download()

    # Combine the train set and validation set.
    if train_valid_combine:
        train_set = (np.append(train_set[0], valid_set[0], axis=0),
                     np.append(train_set[1], valid_set[1], axis=0))

    rng = np.random.RandomState(seed=seed)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    # Filter out the features with a low standard deviation.
    if filter_std > 0.0:
        idx_keep = np.std(x_u, axis=0) > filter_std
        x_l, x_u = x_l[:, idx_keep], x_u[:, idx_keep]
        valid_set = (valid_set[0][:, idx_keep], valid_set[1])
        test_set = (test_set[0][:, idx_keep], test_set[1])

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # shuffle data
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
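
Note that with train_valid_combine=True the validation images are folded into the training pool before the labeled/unlabeled split, which is useful when no separate validation set is needed for early stopping. A hypothetical call:

train_u, train_l, test, valid = load_semi_supervised(n_labeled=100,
                                                     train_valid_combine=True)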
Example #5
def load_semi_supervised(n_labeled=1000, cut_off=1000, seed=123456, conv=False, extra=False):
    """
    Load the SVHN dataset where only a fraction of the data points are labeled. The
    labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant so that the dataset size is divisible by it.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param conv: If True, reshape the images for convolutional input; otherwise keep them vectorized.
    :param extra: Whether to include the extra SVHN set.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download(extra)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)
    train_x, train_t = train_set

    # shuffle data
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        train_set = _gen_conv(train_set)
        test_set = _gen_conv(test_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
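
A usage sketch for the SVHN loader; with conv=True the splits are presumably returned in image layout by _gen_conv, whose exact output shape is not shown here:

# 1000 labeled points (100 per digit class), unlabeled set trimmed to a
# multiple of cut_off.
train_u, train_l, test, valid = load_semi_supervised(n_labeled=1000,
                                                     cut_off=1000,
                                                     conv=True)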
Example #6
def load_semi_supervised(n_labeled=100, cut_off=100, seed=123456, expand_channels=False, remove_channels=False):
    """
    Load the NORB dataset where only a fraction of the data points are labeled. The
    labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant so that the dataset size is divisible by it.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param expand_channels: If True, unroll each stereo image pair into two separate data points.
    :param remove_channels: If True, keep only the first image of each pair.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download()

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    if expand_channels:
        x_l = x_l.reshape((-1, 2, x_l.shape[1] // 2))
        x_l_1 = x_l[:, 0, :]
        x_l_2 = x_l[:, 1, :]
        x_l = np.append(x_l_1, x_l_2, axis=0)
        y_l = np.append(y_l, y_l, axis=0)
        x_u = x_u.reshape((-1, 2, x_u.shape[1] // 2))
        x_u_1 = x_u[:, 0, :]
        x_u_2 = x_u[:, 1, :]
        x_u = np.append(x_u_1, x_u_2, axis=0)
        y_u = np.append(y_u, y_u, axis=0)

        test_x, test_t = test_set
        test_x = test_x.reshape((-1, 2, test_x.shape[1] // 2))
        test_x = np.append(test_x[:, 0, :], test_x[:, 1, :], axis=0)
        test_t = np.append(test_t, test_t, axis=0)
        test_set = (test_x, test_t)

        valid_x, valid_t = valid_set
        valid_x = valid_x.reshape((-1, 2, valid_x.shape[1] // 2))
        valid_x = np.append(valid_x[:, 0, :], valid_x[:, 1, :], axis=0)
        valid_t = np.append(valid_t, valid_t, axis=0)
        valid_set = (valid_x, valid_t)

    elif remove_channels:
        x_l = x_l.reshape((-1, 2, x_l.shape[1] // 2))
        x_l = x_l[:, 0, :].reshape((-1, x_l.shape[-1]))
        x_u = x_u.reshape((-1, 2, x_u.shape[1] // 2))
        x_u = x_u[:, 0, :].reshape((-1, x_u.shape[-1]))

        test_x, test_t = test_set
        test_x = test_x.reshape((-1, 2, test_x.shape[1] // 2))
        test_x = test_x[:, 0, :].reshape((-1, test_x.shape[-1]))
        test_set = (test_x, test_t)

        valid_x, valid_t = valid_set
        valid_x = valid_x.reshape((-1, 2, valid_x.shape[1] // 2))
        valid_x = valid_x[:, 0, :].reshape((-1, valid_x.shape[-1]))
        valid_set = (valid_x, valid_t)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)
    train_x, train_t = train_set

    # shuffle data
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-5], train_collect[:, -5:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
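
Since small NORB images come as stereo pairs stored side by side in each row, expand_channels doubles the number of data points (duplicating each label), while remove_channels halves the feature dimension instead. A quick sanity check (hypothetical; exact sizes depend on the downloaded NORB variant):

train_u, train_l, test, valid = load_semi_supervised(n_labeled=100,
                                                     expand_channels=True)
x_u, y_u = train_u
assert y_u.shape[1] == 5               # small NORB has 5 object categories
assert x_u.shape[0] == y_u.shape[0]    # images and one-hot targets aligned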