class SemiDataSet(object):
    """Data set for semi-supervised training.

    Wraps two DataSet objects: one holding every example (its labels are
    ignored when batching) and one holding a small, class-balanced subset
    whose labels are actually used.
    """

    def __init__(self, features, labels, unlabeled_features):
        """Build the full data set and the class-balanced labeled subset.

        Args:
            features (numpy.ndarray): All example features, indexed along
                axis 0.
            labels (numpy.ndarray): One-hot encoded labels, one row per
                example.
            unlabeled_features (int): Despite the name, this is the total
                number of *labeled* examples to keep; it is split evenly
                across the classes (any remainder is dropped).
        """
        self.unlabeled_features = unlabeled_features

        # "Unlabeled" DataSet: every example; next_batch discards its labels.
        self.unlabeled_ds = DataSet(features, labels)

        # Labeled DataSet: shuffle, then take the same number of examples
        # from each class.
        self.num_examples = self.unlabeled_ds.num_examples
        indices = numpy.arange(self.num_examples)
        shuffled_indices = numpy.random.permutation(indices)
        features = features[shuffled_indices]
        labels = labels[shuffled_indices]

        # Convert one-hot rows to class ids without assuming 10 classes.
        y = numpy.argmax(labels, axis=1)
        n_classes = y.max() + 1

        # Floor division: the result is used as a slice bound, and true
        # division would produce a float (TypeError on Python 3).
        n_from_each_class = unlabeled_features // n_classes

        i_labeled = []
        for c in range(n_classes):
            i_labeled.extend(indices[y == c][:n_from_each_class])

        l_images = features[i_labeled]
        l_labels = labels[i_labeled]
        self.labeled_ds = DataSet(l_images, l_labels)

    def next_batch(self, batch_size):
        """Return a combined batch of labeled and unlabeled examples.

        Args:
            batch_size (int): Number of unlabeled examples to draw; the
                labeled part is capped at the configured number of labeled
                examples.

        Returns:
            tuple: (images, labels) where images stacks the labeled examples
            on top of the unlabeled ones, and labels belong to the labeled
            examples only.
        """
        unlabeled_images, _ = self.unlabeled_ds.next_batch(batch_size)
        labeled_batch_size = min(batch_size, self.unlabeled_features)
        labeled_images, labels = self.labeled_ds.next_batch(labeled_batch_size)
        images = numpy.vstack([labeled_images, unlabeled_images])
        return images, labels
def get_two_spirals_data_set_collection():
    """Build the "two spirals" collection.

    Generates 2000 training points and 1000 test points with two_spirals
    and wraps them in an un-normalized DataSetCollection.

    Returns:
        DataSetCollection
    """
    spiral_train_x, spiral_train_y = two_spirals(2000)
    spiral_test_x, spiral_test_y = two_spirals(1000)
    training_set = DataSet(spiral_train_x, spiral_train_y)
    testing_set = DataSet(spiral_test_x, spiral_test_y)
    return DataSetCollection("two spirals", training_set, testing_set,
                             normalize=False)
def get_cifar_100_data_set_collection(root_path=CIFAR_DATA_DIR, one_hot=True,
                                      use_fine_labels=True,
                                      validation_size=0, validation_ratio=None):
    """Get the CIFAR-100 data set.

    Requires the files to be downloaded and extracted into a
    cifar-100-python directory within root_path.

    Args:
        root_path (str): Directory containing the cifar-100-python directory
        one_hot (bool): If True converts sparse labels to one hot encoding
        use_fine_labels (bool): If True use the full 100 labels, if False use
            the 20 coarse superclasses
        validation_size (int): Number of leading training items to move to
            the validation set
        validation_ratio (float): Used only when validation_size is not set;
            the validation size is then this fraction of the combined
            train + test item count

    Returns:
        DataSetCollection
    """
    root_path = root_path + "/cifar-100-python"
    features_train, labels_train = _load_cifar_100_set(root_path + "/train",
                                                       use_fine_labels)
    features_test, labels_test = _load_cifar_100_set(root_path + "/test",
                                                     use_fine_labels)

    if one_hot:
        # CIFAR-100 groups its 100 fine classes into 20 coarse superclasses,
        # so the coarse one-hot encoding needs 20 columns, not 10.
        # NOTE(review): assumes _load_cifar_100_set returns the dataset's
        # standard coarse labels (0-19) - confirm against that helper.
        num_classes = 100 if use_fine_labels else 20
        labels_train = dense_to_one_hot(labels_train, num_classes)
        labels_test = dense_to_one_hot(labels_test, num_classes)

    if not validation_size and validation_ratio:
        validation_size = int(
            (len(labels_train) + len(labels_test)) * validation_ratio)

    if validation_size:
        # The first validation_size items become the validation set; the
        # remainder stays in the training set.
        features_validation = features_train[:validation_size]
        labels_validation = labels_train[:validation_size]
        features_train = features_train[validation_size:]
        labels_train = labels_train[validation_size:]
        validation = DataSet(features_validation, labels_validation,
                             to_binary=True)
    else:
        validation = None

    train = DataSet(features_train, labels_train, to_binary=True)
    test = DataSet(features_test, labels_test, to_binary=True)

    collection_name = 'CIFAR-100' + ('-fine' if use_fine_labels else '-coarse')
    return DataSetCollection(collection_name, train, test,
                             validation=validation, normalize=True)
def test_one_batch_iteration_exact_partial_batch(self):
    """Iterating 25 examples in batches of 10 yields two full-size batches."""
    batch_size = 10
    features = np.random.normal(size=(25, 10))
    labels = np.random.normal(size=(25, 1))
    data_set = DataSet(features, labels)

    batches = list(data_set.one_iteration_in_batches(batch_size))

    self.assertEqual(len(batches), 2)
    # Check the features (position 0) and labels (position 1) of the first
    # and the last batch.
    for batch in (batches[0], batches[-1]):
        for position in (0, 1):
            self.assertEqual(len(batch[position]), batch_size)
def get_cifar_10_data_set_collection(root_path=CIFAR_DATA_DIR, one_hot=True,
                                     validation_size=0, validation_ratio=None):
    """Get the CIFAR-10 data set.

    Requires the files to be downloaded and extracted into a
    cifar-10-batches-py directory within root_path.

    Args:
        root_path (str): Directory containing the cifar-10-batches-py
            directory
        one_hot (bool): If True converts sparse labels to one hot encoding
        validation_size (int): Number of leading training items to move to
            the validation set
        validation_ratio (float): Used only when validation_size is not set;
            the validation size is then this fraction of the combined
            train + test item count

    Returns:
        DataSetCollection
    """
    root_path += "/cifar-10-batches-py"
    features_train, labels_train, features_test, labels_test = _load(root_path)

    if one_hot:
        labels_train = dense_to_one_hot(labels_train)
        labels_test = dense_to_one_hot(labels_test)

    if not validation_size and validation_ratio:
        validation_size = int(
            (len(labels_train) + len(labels_test)) * validation_ratio)

    if validation_size:
        # Validation takes the FIRST validation_size items and training keeps
        # the rest, so the two sets are disjoint. (Slicing both with
        # [validation_size:] would make validation a copy of the training
        # set and throw the held-out items away.)
        features_validation = features_train[:validation_size]
        labels_validation = labels_train[:validation_size]
        features_train = features_train[validation_size:]
        labels_train = labels_train[validation_size:]
        validation = DataSet(features_validation, labels_validation,
                             to_binary=True)
    else:
        validation = None

    train = DataSet(features_train, labels_train, to_binary=True)
    test = DataSet(features_test, labels_test, to_binary=True)

    collection = DataSetCollection('CIFAR-10', train, test,
                                   validation=validation, normalize=True)
    return collection
def __init__(self, features, labels, unlabeled_features):
    """Build the full data set and the class-balanced labeled subset.

    Args:
        features (numpy.ndarray): All example features, indexed along axis 0.
        labels (numpy.ndarray): One-hot encoded labels, one row per example.
        unlabeled_features (int): Despite the name, this is the total number
            of *labeled* examples to keep; it is split evenly across the
            classes (any remainder is dropped).
    """
    self.unlabeled_features = unlabeled_features

    # "Unlabeled" DataSet: built from every example.
    self.unlabeled_ds = DataSet(features, labels)

    # Labeled DataSet: shuffle, then take the same number of examples from
    # each class.
    self.num_examples = self.unlabeled_ds.num_examples
    indices = numpy.arange(self.num_examples)
    shuffled_indices = numpy.random.permutation(indices)
    features = features[shuffled_indices]
    labels = labels[shuffled_indices]

    # Convert one-hot rows to class ids without assuming 10 classes.
    y = numpy.argmax(labels, axis=1)
    n_classes = y.max() + 1

    # Floor division: the result is used as a slice bound, and true division
    # would produce a float (TypeError on Python 3).
    n_from_each_class = unlabeled_features // n_classes

    i_labeled = []
    for c in range(n_classes):
        i_labeled.extend(indices[y == c][:n_from_each_class])

    l_images = features[i_labeled]
    l_labels = labels[i_labeled]
    self.labeled_ds = DataSet(l_images, l_labels)
def test_num_examples(self):
    """num_examples reports the number of rows the DataSet was built from."""
    example_count = 100
    features = np.random.normal(size=(example_count, 10))
    labels = np.random.normal(size=(example_count, 1))
    data_set = DataSet(features, labels)
    self.assertEqual(data_set.num_examples, example_count)
def get_xor_data_set_collection():
    """Build the "xor" collection.

    The same features and labels from xor() back both the train and the
    test set; the collection is created without normalization.

    Returns:
        DataSetCollection
    """
    xor_features, xor_labels = xor()
    training_set = DataSet(xor_features, xor_labels)
    testing_set = DataSet(xor_features, xor_labels)
    return DataSetCollection("xor", training_set, testing_set,
                             normalize=False)
def get_mnist_data_set_collection(train_dir=os.path.dirname(__file__) + "/MNIST_data",
                                  number_labeled_examples=None,
                                  one_hot=True,
                                  validation_size=0,
                                  validation_ratio=None,
                                  limit_train_size=None,
                                  flatten=True):
    """Load the MNIST data.

    Args:
        train_dir (str): directory to store the downloaded data, or to where
            it has previously been downloaded
        number_labeled_examples (int): For semi supervised learning, how many
            labels to use; if None we use supervised learning
        one_hot (bool): If True labels will be one hot vectors, not ints
        validation_size (int): Number of items to move from the start of the
            training data to the validation set
        validation_ratio (float): Used only when validation_size is not set;
            the validation size is then this fraction of the combined
            train + test item count
        limit_train_size (int): If set limit number of training items to this
        flatten (bool): If True data set is flattened to simply be array of
            image values, not 3d array of [width, height, depth]

    Returns:
        DataSetCollection
    """
    def fetch(file_name):
        # Resolve an archive name to a local file via _maybe_download.
        return _maybe_download(file_name, train_dir)

    train_images = _extract_images(fetch('train-images-idx3-ubyte.gz'))
    train_labels = _extract_labels(fetch('train-labels-idx1-ubyte.gz'),
                                   one_hot=one_hot)
    test_images = _extract_images(fetch('t10k-images-idx3-ubyte.gz'))
    test_labels = _extract_labels(fetch('t10k-labels-idx1-ubyte.gz'),
                                  one_hot=one_hot)

    if not validation_size and validation_ratio:
        total_items = len(train_labels) + len(test_labels)
        validation_size = int(total_items * validation_ratio)

    # Head of the training data goes to validation, the tail stays in train
    # (with validation_size == 0 the validation slices are simply empty).
    validation_images, train_images = (train_images[:validation_size],
                                       train_images[validation_size:])
    validation_labels, train_labels = (train_labels[:validation_size],
                                       train_labels[validation_size:])

    if limit_train_size:
        train_images = train_images[:limit_train_size]
        train_labels = train_labels[:limit_train_size]

    if number_labeled_examples is not None:
        train = SemiDataSet(train_images, train_labels,
                            number_labeled_examples)
    else:
        train = DataSet(train_images, train_labels, flatten=flatten,
                        to_binary=True)

    test = DataSet(test_images, test_labels, flatten=flatten, to_binary=True)

    validation = None
    if validation_size:
        validation = DataSet(validation_images, validation_labels,
                             flatten=flatten, to_binary=True)

    return DataSetCollection('MNIST', train, test, validation)