def read_data_sets(train_dir, one_hot=False):
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(local_file)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(local_file)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    # Flatten each image into a row vector and scale pixels to [0, 1].
    train_images = np.reshape(
        train_images,
        newshape=(train_images.shape[0],
                  train_images.shape[1] * train_images.shape[2] *
                  train_images.shape[3])) / 255.0
    test_images = np.reshape(
        test_images,
        newshape=(test_images.shape[0],
                  test_images.shape[1] * test_images.shape[2] *
                  test_images.shape[3])) / 255.0

    # Cast labels to float.
    train_labels = train_labels / 1.0
    test_labels = test_labels / 1.0
    return train_images, train_labels, test_images, test_labels
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
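# A minimal usage sketch for the function above, assuming SOURCE_URL, DataSet,
# and the extract_* helpers are the standard contrib.learn MNIST ones defined
# in this module; the cache directory below is a hypothetical example.
datasets = read_data_sets('/tmp/mnist_data', one_hot=True)
batch_images, batch_labels = datasets.train.next_batch(100)
print(datasets.train.num_examples,        # 55000 with the default split
      datasets.validation.num_examples,   # 5000
      datasets.test.num_examples)         # 10000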
def train_and_eval(job_dir=None, model_type='WIDE_AND_DEEP'):
    print("Begin training and evaluation")

    # If evaluating locally and no args were passed, fall back to a default.
    if job_dir is None:
        job_dir = 'models/'
    # Ensure the path ends with a '/'.
    if job_dir[-1] != '/':
        job_dir += '/'

    gcs_base = 'https://storage.googleapis.com/'
    gcs_path = 'cloudml-public/census/data/'
    trainfile = 'adult.data.csv'
    testfile = 'adult.test.csv'
    local_path = 'dataset_files'
    train_file = base.maybe_download(
        trainfile, local_path, gcs_base + gcs_path + trainfile)
    test_file = base.maybe_download(
        testfile, local_path, gcs_base + gcs_path + testfile)

    training_mode = 'learn_runner'
    train_steps = 1000
    test_steps = 100
    model_dir = job_dir + 'model_' + model_type + '_' + str(int(time.time()))
    print("Saving model checkpoints to " + model_dir)
    export_dir = model_dir + '/exports'

    if training_mode == 'manual':
        # Manually train and export the model.
        # In this function, editing below here is unlikely to be needed.
        m = build_estimator(model_type, model_dir)
        m.fit(input_fn=generate_input_fn(train_file), steps=train_steps)
        print('fit done')
        results = m.evaluate(input_fn=generate_input_fn(test_file),
                             steps=test_steps)
        print('evaluate done')
        print('Accuracy: %s' % results['accuracy'])
        export_folder = m.export_savedmodel(
            export_dir_base=export_dir,
            input_fn=serving_input_fn)
        print('Model exported to ' + export_dir)
    elif training_mode == 'learn_runner':
        # Use learn_runner.
        experiment_fn = generate_experiment(
            model_dir, train_file, test_file, model_type)
        metrics, output_folder = learn_runner.run(experiment_fn, model_dir)
        print('Accuracy: {}'.format(metrics['accuracy']))
        print('Model exported to {}'.format(output_folder))
def download(d):
    """Downloads the given file.

    Binds voxforge_url, archive_dir, total, and counter into this scope.

    :param d: a tuple consisting of (index, file), where index is the index
        of the file to download and file is the name of the file to download
    """
    (i, file) = d
    download_url = voxforge_url + '/' + file
    c = counter.increment()
    print('Downloading file {} ({}/{})...'.format(i + 1, c, total))
    base.maybe_download(filename_of(download_url), archive_dir, download_url)
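# A hedged sketch of how download() might be driven in parallel; voxforge_url,
# archive_dir, total, counter, and filename_of are assumed to be bound in the
# enclosing scope as the docstring says, and `files` is a hypothetical list of
# archive names.
from multiprocessing.dummy import Pool  # thread pool; downloads are I/O-bound

pool = Pool(processes=8)
pool.map(download, enumerate(files))  # yields the (index, file) tuples download() expects
pool.close()
pool.join()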
def _download_and_preprocess_data(data_dir):
    # Conditionally download data
    LDC93S1_BASE = "LDC93S1"
    LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
    local_file = base.maybe_download(LDC93S1_BASE + ".wav", data_dir,
                                     LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
    trans_file = base.maybe_download(LDC93S1_BASE + ".txt", data_dir,
                                     LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")

    with open(trans_file, "r") as fin:
        # Drop the first two tokens, lowercase the text, and strip periods.
        transcript = ' '.join(fin.read().strip().lower().split(' ')[2:])
        transcript = transcript.replace('.', '')

    df = pandas.DataFrame(data=[(os.path.abspath(local_file),
                                 os.path.getsize(local_file),
                                 transcript)],
                          columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
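# Quick sanity check of the one-row CSV the function writes (pandas and os as
# above); the data_dir value is a hypothetical example.
_download_and_preprocess_data('/tmp/ldc93s1')
df = pandas.read_csv(os.path.join('/tmp/ldc93s1', 'ldc93s1.csv'))
print(df.loc[0, 'transcript'])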
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'trainImage.txt.gz'
    TRAIN_LABELS = 'trainImageLabel.txt.gz'
    TEST_IMAGES = 'testImage.txt.gz'
    TEST_LABELS = 'testImageLabel.txt.gz'
    VALIDATION_SIZE = 36

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(local_file)
    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, one_hot=one_hot)
    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(local_file)
    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    # train_images = train_images[VALIDATION_SIZE:]
    # train_labels = train_labels[VALIDATION_SIZE:]
    # Note: the training set is deliberately aliased to the validation slice.
    train_images = validation_images
    train_labels = validation_labels

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    VALIDATION_SIZE = 5000

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(local_file)
    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, one_hot=one_hot)
    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(local_file)
    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    # Each DataSet gets a disjoint range of example ids.
    train = DataSet(train_images, train_labels, start_id=0, dtype=dtype)
    validation = DataSet(validation_images, validation_labels,
                         start_id=len(train_images), dtype=dtype)
    test = DataSet(test_images, test_labels,
                   start_id=(len(train_images) + len(validation_images)),
                   dtype=dtype)
    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(data_dir): filename = "cifar-100-python.tar.gz" print("getting data") SOURCE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz' local_file = base.maybe_download(filename, data_dir, SOURCE_URL) print('Extracting', filename) train_images,train_labels =[],[] test_images,test_labels =[],[] with gfile.Open(data_dir+"/"+filename, 'rb') as f, tarfile.open(fileobj=f) as tar: for x in tar.getnames(): if "data_batch" in x: i,l = _get_data(tar.extractfile(x)) train_images.extend(i.reshape((i.shape[0],32,32,3))) train_labels.extend(l) if "test_batch" in x: i,l = _get_data(tar.extractfile(x)) test_images.extend(i.reshape((i.shape[0],32,32,3))) test_labels.extend(l) train_images = np.array(train_images) test_images = np.array(test_images) train_labels = np.array(train_labels) test_labels = np.array(test_labels) train = DataSet(train_images, train_labels,dtype=dtypes.uint8,depth=100) test = DataSet(test_images, test_labels,dtype=dtypes.uint8,depth=100) return base.Datasets(train=train, validation=None, test=test)
def maybe_download_dbpedia(data_dir):
    """Download if DBpedia data is not present."""
    train_path = os.path.join(data_dir, 'dbpedia_csv/train.csv')
    test_path = os.path.join(data_dir, 'dbpedia_csv/test.csv')
    if not (gfile.Exists(train_path) and gfile.Exists(test_path)):
        archive_path = base.maybe_download(
            'dbpedia_csv.tar.gz', data_dir, DBPEDIA_URL)
        tfile = tarfile.open(archive_path, 'r:*')
        tfile.extractall(data_dir)
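# Usage sketch (hypothetical directory): after this call the archive has been
# fetched and unpacked, and the two CSVs exist under data_dir/dbpedia_csv/.
# The headerless three-column layout named here is the usual DBpedia CSV
# convention, not something the function above enforces.
import pandas as pd

maybe_download_dbpedia('/tmp/dbpedia')
train_df = pd.read_csv('/tmp/dbpedia/dbpedia_csv/train.csv', header=None,
                       names=['class', 'title', 'content'])
print(train_df['class'].nunique())  # 14 DBpedia ontology classes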
def read_data_sets(train_dir, fake_data=False, one_hot=False, dtype=tf.float32):
    class DataSets(object):
        pass

    data_sets = DataSets()

    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        data_sets.train = fake()
        data_sets.validation = fake()
        data_sets.test = fake()
        return data_sets

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    VALIDATION_SIZE = 5000

    local_file = maybe_download(TRAIN_IMAGES, train_dir,
                                SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(local_file)
    local_file = maybe_download(TRAIN_LABELS, train_dir,
                                SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, one_hot=one_hot)
    local_file = maybe_download(TEST_IMAGES, train_dir,
                                SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(local_file)
    local_file = maybe_download(TEST_LABELS, train_dir,
                                SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    data_sets.train = DataSet(train_images, train_labels, dtype=dtype)
    data_sets.validation = DataSet(validation_images, validation_labels,
                                   dtype=dtype)
    data_sets.test = DataSet(test_images, test_labels, dtype=dtype)
    return data_sets
def get_dbpedia(data_dir):
    train_path = os.path.join(data_dir, 'dbpedia_csv/train.csv')
    test_path = os.path.join(data_dir, 'dbpedia_csv/test.csv')
    if not (gfile.Exists(train_path) and gfile.Exists(test_path)):
        archive_path = base.maybe_download('dbpedia_csv.tar.gz', data_dir,
                                           DBPEDIA_URL)
        tfile = tarfile.open(archive_path, 'r:*')
        tfile.extractall(data_dir)

    train = base.load_csv(train_path, np.int32, 0, has_header=False)
    test = base.load_csv(test_path, np.int32, 0, has_header=False)
    datasets = base.Datasets(train=train, validation=None, test=test)
    return datasets
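# Usage sketch, assuming base.load_csv returns the usual Dataset(data, target)
# namedtuple; the path is a hypothetical example.
dbpedia = get_dbpedia('/tmp/dbpedia')
print(dbpedia.train.data.shape, dbpedia.train.target.shape)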
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
    TRAIN_TEST_IMAGES = 'cifar-10-python.tar.gz'
    SOURCE_TRAIN_TEST = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'

    # A single archive holds both splits, so one download and one extract call
    # yield all four arrays.
    local_file = base.maybe_download(TRAIN_TEST_IMAGES, train_dir,
                                     SOURCE_TRAIN_TEST)
    with open(local_file, 'rb') as f:
        train_images, train_labels, test_images, test_labels = extract_images(f)

    # local_file = base.maybe_download(TRAIN_LABELS, train_dir,
    #                                  SOURCE_URL + TRAIN_LABELS)
    # with open(local_file, 'rb') as f:
    #     train_labels = extract_labels(f, one_hot=one_hot)
    # local_file = base.maybe_download(TEST_IMAGES, train_dir,
    #                                  SOURCE_URL + TEST_IMAGES)
    # with open(local_file, 'rb') as f:
    #     test_images = extract_images(f)
    # local_file = base.maybe_download(TEST_LABELS, train_dir,
    #                                  SOURCE_URL + TEST_LABELS)
    # with open(local_file, 'rb') as f:
    #     test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
def _download_and_preprocess_data(data_dir):
    # Conditionally download data
    TED_DATA = "TEDLIUM_release2.tar.gz"
    TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
    local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)

    # Conditionally extract TED data
    TED_DIR = "TEDLIUM_release2"
    _maybe_extract(data_dir, TED_DIR, local_file)

    # Conditionally convert TED sph data to wav
    _maybe_convert_wav(data_dir, TED_DIR)

    # Conditionally split TED wav and text data into sentences
    train_files, dev_files, test_files = _maybe_split_sentences(data_dir, TED_DIR)

    # Write sets to disk as CSV files
    train_files.to_csv(path.join(data_dir, "ted-train.csv"), index=False)
    dev_files.to_csv(path.join(data_dir, "ted-dev.csv"), index=False)
    test_files.to_csv(path.join(data_dir, "ted-test.csv"), index=False)
def get_mnist_images():
    import gzip
    import numpy
    from tensorflow.contrib.learn.python.learn.datasets import base

    def extract_images(f):
        """Extract the images into a 4D uint8 numpy array [index, y, x, depth].

        Args:
            f: A file object that can be passed into a gzip reader.

        Returns:
            data: A 4D uint8 numpy array [index, y, x, depth].

        Raises:
            ValueError: If the bytestream does not start with 2051.
        """
        print('Extracting', f.name)
        with gzip.GzipFile(fileobj=f) as bytestream:
            magic = _read32(bytestream)
            if magic != 2051:
                raise ValueError('Invalid magic number %d in MNIST image file: %s'
                                 % (magic, f.name))
            num_images = _read32(bytestream)
            rows = _read32(bytestream)
            cols = _read32(bytestream)
            buf = bytestream.read(rows * cols * num_images)
            data = numpy.frombuffer(buf, dtype=numpy.uint8)
            data = data.reshape(num_images, rows, cols, 1)
            return data

    def _read32(bytestream):
        # MNIST headers are big-endian 32-bit unsigned integers.
        dt = numpy.dtype(numpy.uint32).newbyteorder('>')
        return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    source_url = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
    local_file = base.maybe_download(TRAIN_IMAGES, '/tmp',
                                     source_url + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    # One flattened 28x28 image per column, scaled to [0, 1].
    train_images = train_images.reshape(60000, 28**2).T.astype(numpy.float64) / 255
    return train_images
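# Usage sketch: columns are flattened images, so the result is (784, 60000)
# with values in [0, 1].
images = get_mnist_images()
print(images.shape, images.min(), images.max())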
def read_data_sets(graph, data_dir, batch_size, numcep, numcontext,
                   thread_count=8):
    # Conditionally download data
    TED_DATA = "TEDLIUM_release2.tar.gz"
    TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
    local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)

    # Conditionally extract TED data
    TED_DIR = "TEDLIUM_release2"
    _maybe_extract(data_dir, TED_DIR, local_file)

    # Conditionally convert TED sph data to wav
    _maybe_convert_wav(data_dir, TED_DIR)

    # Conditionally split TED wav data
    _maybe_split_wav(data_dir, TED_DIR)

    # Conditionally split TED stm data
    _maybe_split_stm(data_dir, TED_DIR)

    # Create dev DataSet
    dev = _read_data_set(graph, data_dir, TED_DIR, "dev", thread_count,
                         batch_size, numcep, numcontext)

    # Create test DataSet
    test = _read_data_set(graph, data_dir, TED_DIR, "test", thread_count,
                          batch_size, numcep, numcontext)

    # Create train DataSet
    train = _read_data_set(graph, data_dir, TED_DIR, "train", thread_count,
                           batch_size, numcep, numcontext)

    # Return DataSets
    return DataSets(train, dev, test)
def read_data_set(name):
    if name == 'mnist':
        return input_data.read_data_sets(FLAGS.data_dir_mnist), 28, 28, 1
    elif name == 'frey_faces':
        maybe_download('frey_rawface.mat', FLAGS.data_dir_frey,
                       'http://www.cs.nyu.edu/~roweis/data/frey_rawface.mat')
        images = sio.loadmat(FLAGS.data_dir_frey + '/frey_rawface.mat',
                             squeeze_me=True)
        img_rows, img_cols = 28, 20
        n_pixels = img_rows * img_cols
        images = images["ff"].T.reshape((-1, img_rows, img_cols))
        train_images, test_images = train_test_split(images, test_size=0.185)
        train_images = train_images.reshape((-1, n_pixels))
        test_images = test_images.reshape((-1, n_pixels))
        train = DataSet(train_images, dtype=dtypes.float32, seed=None)
        test = DataSet(test_images, dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 20, 28, 1
    elif name == 'svhn':
        maybe_download('train_32x32.mat', FLAGS.data_dir_svhn,
                       'http://ufldl.stanford.edu/housenumbers/train_32x32.mat')
        train_images = sio.loadmat(FLAGS.data_dir_svhn + '/train_32x32.mat')['X']
        # SVHN stores images as (H, W, C, N); move the example axis first.
        train_images = np.transpose(train_images, [3, 0, 1, 2])
        train_images = np.reshape(train_images, [-1, 32 * 32 * 3])
        maybe_download('test_32x32.mat', FLAGS.data_dir_svhn,
                       'http://ufldl.stanford.edu/housenumbers/test_32x32.mat')
        test_images = sio.loadmat(FLAGS.data_dir_svhn + '/test_32x32.mat')['X']
        test_images = np.transpose(test_images, [3, 0, 1, 2])
        test_images = np.reshape(test_images, [-1, 32 * 32 * 3])
        train = DataSet(train_images, dtype=dtypes.float32, seed=None)
        test = DataSet(test_images, dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 32, 32, 3
    elif name == 'cifar10':
        ds = CIFAR10.loadCIFAR10(8)
        train = DataSet(ds['train_set'], dtype=dtypes.float32, seed=None)
        test = DataSet(ds['test_set'], dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 8, 8, 3
    elif name == 'cifar10_full':
        ds = CIFAR10.loadCIFAR10(32)
        train = DataSet(ds['train_set'], dtype=dtypes.float32, seed=None)
        test = DataSet(ds['test_set'], dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 32, 32, 3
    else:
        # Fail loudly instead of silently returning None.
        raise ValueError('No such data set: {}'.format(name))
def read_data_sets(data_dir): filename = "cifar-10-python.tar.gz" print("getting data") SOURCE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' if rank == 0: local_file = base.maybe_download(filename, data_dir, SOURCE_URL) else: while not os.path.isfile(data_dir + "/" + filename): pass print('Extracting', filename) train_images, train_labels = [], [] test_images, test_labels = [], [] with gfile.Open(data_dir + "/" + filename, 'rb') as f, tarfile.open(fileobj=f) as tar: for x in tar.getnames(): if "data_batch" in x: i, l = _get_data(tar.extractfile(x)) train_images.extend( i.reshape((i.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1)) train_labels.extend(l) if "test_batch" in x: i, l = _get_data(tar.extractfile(x)) test_images.extend( i.reshape((i.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1)) test_labels.extend(l) train_images = np.array(train_images) test_images = np.array(test_images) train_labels = np.array(train_labels) test_labels = np.array(test_labels) train = DataSet(train_images, train_labels, dtype=dtypes.uint8, depth=10) test = DataSet(test_images, test_labels, dtype=dtypes.uint8, depth=10) return base.Datasets(train=train, validation=None, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=1000,
                   seed=None):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        # Keep labels dense here; one-hot encoding happens after filtering.
        test_labels = extract_labels(f)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # Now we filter the data.
    index = numpy.where(test_labels < 10)
    test_images = test_images[index]
    test_labels = test_labels[index]
    test_labels = dense_to_one_hot(test_labels, 10)

    index = numpy.where(train_labels < 10)
    train_images = train_images[index]
    train_labels = train_labels[index]
    train_labels = dense_to_one_hot(train_labels, 10)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def get_mnist_images(max_images=0, fold='train'):
    """Returns mnist images, batch dimension last."""
    import gzip
    import numpy
    from tensorflow.contrib.learn.python.learn.datasets import base

    def extract_images(f):
        """Extract the images into a 4D uint8 numpy array [index, y, x, depth].

        Args:
            f: A file object that can be passed into a gzip reader.

        Returns:
            data: A 4D uint8 numpy array [index, y, x, depth].

        Raises:
            ValueError: If the bytestream does not start with 2051.
        """
        with gzip.GzipFile(fileobj=f) as bytestream:
            magic = _read32(bytestream)
            if magic != 2051:
                raise ValueError('Invalid magic number %d in MNIST image file: %s'
                                 % (magic, f.name))
            num_images = _read32(bytestream)
            if max_images:
                num_images = max_images
            rows = _read32(bytestream)
            cols = _read32(bytestream)
            buf = bytestream.read(rows * cols * num_images)
            data = numpy.frombuffer(buf, dtype=numpy.uint8)
            data = data.reshape(num_images, rows, cols, 1)
            return data

    def _read32(bytestream):
        # MNIST headers are big-endian 32-bit unsigned integers.
        dt = numpy.dtype(numpy.uint32).newbyteorder('>')
        return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

    if fold == 'train':  # todo: rename
        TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    elif fold == 'test':
        TRAIN_IMAGES = 't10k-images-idx3-ubyte.gz'
    else:
        assert False, 'unknown fold %s' % (fold)

    source_url = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
    local_file = base.maybe_download(TRAIN_IMAGES, '/tmp',
                                     source_url + TRAIN_IMAGES)
    train_images = extract_images(open(local_file, 'rb'))

    dsize = train_images.shape[0]
    if fold == 'train':
        if not max_images:
            dsize = 60000
        else:
            dsize = max_images
        assert dsize <= 60000
    else:
        if not max_images:
            dsize = 10000
        else:
            dsize = max_images
        assert dsize <= 10000

    # One flattened 28x28 image per column, scaled to [0, 1].
    train_images = train_images.reshape(dsize, 28**2).T.astype(np.float64) / 255
    train_images = np.ascontiguousarray(train_images)
    return train_images.astype(default_np_dtype)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   train_size=50000,
                   valid_size=10000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    # train_num_examples = train_images.shape[0]
    # ikeys = set()
    # for i in range(train_num_examples):
    #     inonzero_m, inonzero_n, inonzero_l = train_images[i].nonzero()
    #     ikey = []
    #     for m, n, l in zip(inonzero_m, inonzero_n, inonzero_l):
    #         ikey.append(str(train_images[i, m, n, l]))
    #     ikey = '_'.join(ikey)
    #     ikey = hashlib.sha224(ikey)
    #     ikey = ikey.hexdigest()
    #     # print('%d %s' % (i, ikey))
    #     ikeys.add(ikey)
    # print('#ikey=%d' % (len(ikeys)))

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= train_size <= len(train_images):
        raise ValueError(
            'train size should be between 0 and {}. Received: {}.'.format(
                len(train_images), train_size))
    if not 0 <= valid_size <= len(train_images):
        raise ValueError(
            'valid size should be between 0 and {}. Received: {}.'.format(
                len(train_images), valid_size))

    valid_images = train_images[:valid_size]
    valid_labels = train_labels[:valid_size]
    # train_images = train_images[valid_size:]
    # train_labels = train_labels[valid_size:]
    # The training set is taken from the tail, so it can overlap the
    # validation slice when train_size + valid_size exceeds the total.
    train_images = train_images[len(train_images) - train_size:]
    train_labels = train_labels[len(train_labels) - train_size:]

    # print('train image={} label={}'.format(train_images.shape, train_labels.shape))
    # train_label_cn = {}
    # for train_label in train_labels:
    #     train_label = train_label.nonzero()[0][0]
    #     train_label_cn[train_label] = train_label_cn.get(train_label, 0) + 1
    # for train_label, count in train_label_cn.items():
    #     print('train label=%d count=%d' % (train_label, count))

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(valid_images, valid_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   num_classes=75,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL,
                   train_images='train-swallowsound-images-idx3-float.gz',
                   train_labels='train-swallowsound-labels-idx1-ubyte.gz',
                   test_images='t10k-swallowsound-images-idx3-float.gz',
                   test_labels='t10k-swallowsound-labels-idx1-ubyte.gz',
                   gzip_compress=True,
                   MSB=True):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    # Capture the filename arguments before those names are reused for the
    # extracted arrays below.
    TRAIN_IMAGES = train_images
    TRAIN_LABELS = train_labels
    TEST_IMAGES = test_images
    TEST_LABELS = test_labels

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f, gzip_compress=gzip_compress, MSB=MSB)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot,
                                      num_classes=num_classes,
                                      gzip_compress=gzip_compress, MSB=MSB)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f, gzip_compress=gzip_compress, MSB=MSB)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot,
                                     num_classes=num_classes,
                                     gzip_compress=gzip_compress, MSB=MSB)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   splits=[-1, 5000, -1]):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(local_file)
    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, one_hot=one_hot)
    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(local_file)
    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    validation_images = train_images[:5000]
    validation_labels = train_labels[:5000]
    train_images = train_images[5000:]
    train_labels = train_labels[5000:]

    [TRAIN_SIZE, VALIDATION_SIZE, TEST_SIZE] = splits

    # Subsample the dataset if necessary (-1 keeps the full split).
    if TRAIN_SIZE != -1:
        train_images, train_labels = stratified_subsampling(
            train_images, train_labels, TRAIN_SIZE)
    if VALIDATION_SIZE != -1:
        validation_images, validation_labels = stratified_subsampling(
            validation_images, validation_labels, VALIDATION_SIZE)
    if TEST_SIZE != -1:
        test_images, test_labels = stratified_subsampling(
            test_images, test_labels, TEST_SIZE)

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Train (10, 100, 1000)
for index in range(5000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={X: batch_xs, Y_: batch_ys})
    print(sess.run(accuracy, feed_dict={X: mnist.validation.images,
                                        Y_: mnist.validation.labels}))

# Test trained model before submission
print(sess.run(accuracy, feed_dict={X: mnist.test.images,
                                    Y_: mnist.test.labels}))

# Kaggle test data
if km.DOWNLOAD_DATASETS:
    base.maybe_download(km.KAGGLE_TEST_CSV, km.DATA_DIR,
                        km.SOURCE_URL + km.KAGGLE_TEST_CSV)
kaggle_test_images = pd.read_csv(km.DATA_DIR + km.KAGGLE_TEST_CSV).values.astype('float32')
kaggle_test_images = np.reshape(kaggle_test_images,
                                (kaggle_test_images.shape[0], 28, 28, 1))
# convert from [0:255] => [0.0:1.0]
kaggle_test_images = np.multiply(kaggle_test_images, 1.0 / 255.0)

predictions_kaggle = sess.run(tf.argmax(tf.nn.softmax(Y), 1),
                              feed_dict={X: kaggle_test_images})

with open(km.SUBMISSION_FILE, 'w') as submission:
    submission.write('ImageId,Label\n')
    for index, prediction in enumerate(predictions_kaggle):
        submission.write('{0},{1}\n'.format(index + 1, prediction))
    print("prediction submission written to {0}".format(km.SUBMISSION_FILE))
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=700,
                   seed=None):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train_prime_images.gz'
    TRAIN_LABELS = 'train_prime_labels.gz'
    TEST_IMAGES = 'test_prime_images.gz'
    TEST_LABELS = 'test_prime_labels.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    # TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    # TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    # TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    # TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    TRAIN_IMAGES = 'emnist-balanced-train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'emnist-balanced-train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 'emnist-balanced-test-images-idx3-ubyte.gz'
    TEST_LABELS = 'emnist-balanced-test-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    # local_file = base.maybe_download(TRAIN_LABELS, train_dir,
    #                                  SOURCE_URL + TRAIN_LABELS)
    # with gfile.Open(local_file, 'rb') as f:
    #     train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    # local_file = base.maybe_download(TEST_LABELS, train_dir,
    #                                  SOURCE_URL + TEST_LABELS)
    # with gfile.Open(local_file, 'rb') as f:
    #     test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    # validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    # train_labels = train_labels[validation_size:]

    # Labels are skipped entirely; these DataSets are image-only.
    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, **options)
    validation = DataSet(validation_images, **options)
    test = DataSet(test_images, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(data_path,
                   fake_data=False,
                   one_hot=True,
                   percentage_train=1.,
                   validation_size=5000,
                   source_url=DEFAULT_SOURCE_URL):
    train_dir = data_path

    class DataSets(object):
        pass

    data_sets = DataSets()

    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=True)
        data_sets.val = DataSet([], [], fake_data=True, one_hot=True)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=True)
        return data_sets

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    val_images = train_images[:validation_size]
    val_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    # add random permutation
    np.random.seed(42)
    n_train = train_images.shape[0]
    perm = np.random.permutation(n_train)
    train_images = train_images[perm]
    train_labels = train_labels[perm]

    n_val = val_images.shape[0]
    perm = np.random.permutation(n_val)
    val_images = val_images[perm]
    val_labels = val_labels[perm]

    n_test = test_images.shape[0]

    if percentage_train != 1.:
        train_size = int(percentage_train * train_images.shape[0])
        Xtrain_images, Xval_images, ytrain, yval = train_test_split(
            train_images, train_labels, train_size=train_size)
        train_images = Xtrain_images
        train_labels = ytrain

    data_sets.train = DataSet(train_images, train_labels,
                              fake_data=True, one_hot=True)
    data_sets.val = DataSet(val_images, val_labels,
                            fake_data=True, one_hot=True)
    data_sets.test = DataSet(test_images, test_labels,
                             fake_data=True, one_hot=True)
    return data_sets
def custom_kaggle_mnist():
    """
    downloads and parses mnist train dataset for kaggle digit recognizer
    parsing and one_hot copied https://www.kaggle.com/kakauandme/tensorflow-deep-nn
    """
    if DOWNLOAD_DATASETS:
        base.maybe_download(KAGGLE_TRAIN_CSV, DATA_DIR,
                            SOURCE_URL + KAGGLE_TRAIN_CSV)

    # Import data from datasource, see https://www.kaggle.com/kakauandme/tensorflow-deep-nn
    # read training data from CSV file
    data = pd.read_csv(DATA_DIR + KAGGLE_TRAIN_CSV)

    from sklearn.utils import shuffle
    ## data = shuffle(data, random_state=42)

    images = data.iloc[:, 1:].values
    images = images.astype(np.float64)
    images = np.reshape(images, (images.shape[0], 28, 28, 1))
    # convert from [0:255] => [0.0:1.0]
    ## images = np.multiply(images, 1.0 / 255.0)
    print('number of images in downloaded train dataset: {0[0]}'.format(
        images.shape))

    labels_flat = data.iloc[:, 0].values
    labels_count = np.unique(labels_flat).shape[0]

    def dense_to_one_hot(labels_dense, num_classes):
        num_labels = labels_dense.shape[0]
        index_offset = np.arange(num_labels) * num_classes
        labels_one_hot = np.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
        return labels_one_hot

    labels = dense_to_one_hot(labels_flat, labels_count)
    ## labels = labels.astype(np.uint8)

    # split data into training & validation
    mnist_train_images = images[:TRAIN_SIZE]
    mnist_train_labels = labels[:TRAIN_SIZE]
    print('number of train images: {0[0]}'.format(mnist_train_images.shape))

    mnist_valid_images = images[TRAIN_SIZE:TRAIN_SIZE + VALID_SIZE]
    mnist_valid_labels = labels[TRAIN_SIZE:TRAIN_SIZE + VALID_SIZE]
    print('number of valid images: {0[0]}'.format(mnist_valid_images.shape))

    mnist_test_images = images[TRAIN_SIZE + VALID_SIZE:images.shape[0]]
    mnist_test_labels = labels[TRAIN_SIZE + VALID_SIZE:images.shape[0]]
    print('number of test images: {0[0]}'.format(mnist_test_images.shape))

    train = DataSet(mnist_train_images, mnist_train_labels,
                    dtype=np.float32, reshape=False)
    valid = DataSet(mnist_valid_images, mnist_valid_labels,
                    dtype=np.float32, reshape=False)
    test = DataSet(mnist_test_images, mnist_test_labels,
                   dtype=np.float32, reshape=False)
    return base.Datasets(train=train, validation=valid, test=test)
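# Usage sketch: the returned object follows the tf.contrib.learn Datasets
# convention, so the usual next_batch API applies (DOWNLOAD_DATASETS,
# DATA_DIR, TRAIN_SIZE, and VALID_SIZE are module-level settings assumed to
# be defined elsewhere).
mnist = custom_kaggle_mnist()
batch_xs, batch_ys = mnist.train.next_batch(100)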
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=0):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    gz_file_name = 'cifar-10-python.tar.gz'
    local_file = base.maybe_download(gz_file_name, train_dir,
                                     SOURCE_URL + gz_file_name)

    # Extract the archive once so the per-batch pickle files exist.
    if not os.path.exists(os.path.join(train_dir, 'cifar-10-batches-py')):
        with tarfile.open(local_file, 'r:gz') as tar:
            tar.extractall(train_dir)

    train_images = []
    train_labels = []
    for i in range(1, 6):
        with open(os.path.join(train_dir, 'cifar-10-batches-py',
                               'data_batch_%d' % i), 'rb') as f:
            batch = pickle.load(f, encoding='latin1')
        tmp_images = batch['data'].reshape([-1, 3, 32, 32])
        train_images.append(tmp_images.transpose([0, 2, 3, 1]))
        train_labels += batch['labels']

    train_images = numpy.concatenate(train_images)
    train_labels = numpy.array(train_labels)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    # test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    test = None

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL
    # print("using %s" % source_url)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
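# This variant's source_url parameter makes the download mirror swappable; a
# sketch using the CVDF MNIST mirror that other snippets in this collection
# point at (the cache directory is a hypothetical example):
datasets = read_data_sets(
    '/tmp/mnist_data',
    one_hot=True,
    source_url='https://storage.googleapis.com/cvdf-datasets/mnist/')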
tf.logging.set_verbosity(tf.logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument('--job-dir',
                    help='GCS location to write checkpoints and export models',
                    required=False)
args = parser.parse_args()
job_dir = args.job_dir

# Data sets
IRIS_TRAINING_FILE = "iris_training.csv"
IRIS_TEST_FILE = "iris_test.csv"
gcs_folder = 'https://storage.googleapis.com/dataset-uploader/iris/'
IRIS_TRAINING = base.maybe_download(IRIS_TRAINING_FILE, '.',
                                    gcs_folder + IRIS_TRAINING_FILE)
IRIS_TEST = base.maybe_download(IRIS_TEST_FILE, '.',
                                gcs_folder + IRIS_TEST_FILE)

# Load datasets.
training_set = base.load_csv_with_header(filename=IRIS_TRAINING,
                                         features_dtype=np.float64,
                                         target_dtype=np.int)
test_set = base.load_csv_with_header(filename=IRIS_TEST,
                                     features_dtype=np.float64,
                                     target_dtype=np.int)

# Specify that all features have real-value data
feature_columns = [
    tf.contrib.layers.real_valued_column("flower_features", dimension=4)
]
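# A hedged continuation: one estimator these feature columns were commonly fed
# into; hidden_units, n_classes=3 (the three iris species), and the choice of
# model_dir are illustrative, not part of the snippet above.
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir=job_dir)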
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=False,
                   validation_size=5000,
                   worker_id=-1,
                   n_workers=-1):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_data(local_file, 60000)
    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, 60000)
    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_data(local_file, 10000)
    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, 10000)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # The full test split doubles as the validation set here.
    validation_images = test_images
    validation_labels = test_labels

    # Reduce the task to a binary problem, down-sample it, and augment it.
    (train_images_binary, train_labels_binary,
     test_images_binary, test_labels_binary) = extract_for_binary(
        train_set=train_images, train_labels=train_labels,
        test_set=test_images, test_labels=test_labels)
    sampled_train_images, sampled_train_labels = down_sample(
        train_images_binary, train_labels_binary, down_sample_num=1024)
    new_data, new_labels = aug_data_set(sampled_train_images,
                                        sampled_train_labels,
                                        times_expand=1, aug_type='noise')

    # train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    # train = DataSet(sampled_train_images, sampled_train_labels, dtype=dtype, reshape=reshape)
    train = DataSet(new_data, new_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    print(new_data.shape, new_labels.shape)
    print("=================================================================")
    return base.Datasets(train=train, validation=validation, test=None)
def _download_and_preprocess_data(data_dir):
    # Conditionally download data to data_dir
    print("Downloading Librivox data set (55GB) into {} if not already present..."
          .format(data_dir))

    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
        TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz"
        TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz"
        DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
        DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz"
        TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz"
        TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"

        def filename_of(x):
            return os.path.split(x)[1]

        train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL),
                                              data_dir, TRAIN_CLEAN_100_URL)
        bar.update(0)
        train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL),
                                              data_dir, TRAIN_CLEAN_360_URL)
        bar.update(1)
        train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL),
                                              data_dir, TRAIN_OTHER_500_URL)
        bar.update(2)
        dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL),
                                        data_dir, DEV_CLEAN_URL)
        bar.update(3)
        dev_other = base.maybe_download(filename_of(DEV_OTHER_URL),
                                        data_dir, DEV_OTHER_URL)
        bar.update(4)
        test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL),
                                         data_dir, TEST_CLEAN_URL)
        bar.update(5)
        test_other = base.maybe_download(filename_of(TEST_OTHER_URL),
                                         data_dir, TEST_OTHER_URL)
        bar.update(6)

    # Conditionally extract LibriSpeech data.
    # We extract each archive into data_dir, but test for existence in
    # data_dir/LibriSpeech because the archives share that root.
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        LIBRIVOX_DIR = "LibriSpeech"
        work_dir = os.path.join(data_dir, LIBRIVOX_DIR)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"),
                       train_clean_100)
        bar.update(0)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"),
                       train_clean_360)
        bar.update(1)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"),
                       train_other_500)
        bar.update(2)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"),
                       dev_clean)
        bar.update(3)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"),
                       dev_other)
        bar.update(4)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"),
                       test_clean)
        bar.update(5)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"),
                       test_other)
        bar.update(6)

    # Convert FLAC data to wav, from:
    #  data_dir/LibriSpeech/split/1/2/1-2-3.flac
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-3.wav
    #
    # And split LibriSpeech transcriptions, from:
    #  data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-0.txt
    #  data_dir/LibriSpeech/split-wav/1-2-1.txt
    #  data_dir/LibriSpeech/split-wav/1-2-2.txt
    #  ...
print("Converting FLAC to WAV and splitting transcriptions...") with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar: train_100 = _convert_audio_and_split_sentences(work_dir, "train-clean-100", "train-clean-100-wav") bar.update(0) train_360 = _convert_audio_and_split_sentences(work_dir, "train-clean-360", "train-clean-360-wav") bar.update(1) train_500 = _convert_audio_and_split_sentences(work_dir, "train-other-500", "train-other-500-wav") bar.update(2) dev_clean = _convert_audio_and_split_sentences(work_dir, "dev-clean", "dev-clean-wav") bar.update(3) dev_other = _convert_audio_and_split_sentences(work_dir, "dev-other", "dev-other-wav") bar.update(4) test_clean = _convert_audio_and_split_sentences( work_dir, "test-clean", "test-clean-wav") bar.update(5) test_other = _convert_audio_and_split_sentences( work_dir, "test-other", "test-other-wav") bar.update(6) # Write sets to disk as CSV files train_100.to_csv(os.path.join(data_dir, "librivox-train-clean-100.csv"), index=False) train_360.to_csv(os.path.join(data_dir, "librivox-train-clean-360.csv"), index=False) train_500.to_csv(os.path.join(data_dir, "librivox-train-other-500.csv"), index=False) dev_clean.to_csv(os.path.join(data_dir, "librivox-dev-clean.csv"), index=False) dev_other.to_csv(os.path.join(data_dir, "librivox-dev-other.csv"), index=False) test_clean.to_csv(os.path.join(data_dir, "librivox-test-clean.csv"), index=False) test_other.to_csv(os.path.join(data_dir, "librivox-test-other.csv"), index=False)
def read_data_sets(data_dir, train_batch_size, dev_batch_size, test_batch_size,
                   numcep, numcontext, thread_count=8,
                   limit_dev=0, limit_test=0, limit_train=0, sets=[]):
    # Conditionally download data
    TED_DATA = "TEDLIUM_release1.tar.gz"
    TED_DATA_URL = "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz"
    local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)

    # Conditionally extract TED data
    TED_DIR = "TEDLIUM_release1"
    _maybe_extract(data_dir, TED_DIR, local_file)

    # Conditionally convert TED sph data to wav
    _maybe_convert_wav(data_dir, TED_DIR)

    # Conditionally split TED wav data
    _maybe_split_wav(data_dir, TED_DIR)

    # Conditionally split TED stm data
    _maybe_split_stm(data_dir, TED_DIR)

    # Create dev DataSet
    dev = None
    if "dev" in sets:
        dev = _read_data_set(data_dir, TED_DIR, "dev", thread_count,
                             dev_batch_size, numcep, numcontext,
                             limit=limit_dev)

    # Create test DataSet
    test = None
    if "test" in sets:
        test = _read_data_set(data_dir, TED_DIR, "test", thread_count,
                              test_batch_size, numcep, numcontext,
                              limit=limit_test)

    # Create train DataSet
    train = None
    if "train" in sets:
        train = _read_data_set(data_dir, TED_DIR, "train", thread_count,
                               train_batch_size, numcep, numcontext,
                               limit=limit_train)

    # Return DataSets
    return DataSets(train, dev, test)
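# Usage sketch with illustrative hyperparameters (the batch sizes and the
# numcep=26 / numcontext=9 MFCC settings are examples, not values fixed by the
# function above); only the splits named in `sets` are materialized.
data_sets = read_data_sets('/tmp/ted', train_batch_size=32, dev_batch_size=32,
                           test_batch_size=32, numcep=26, numcontext=9,
                           sets=["train", "dev", "test"])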
def read_data_sets(train_dir, fake_data=False, one_hot=False, dtype=dtypes.float32, reshape=False, validation_size=5000, worker_id=-1, n_workers=-1): if fake_data: def fake(): return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype) train = fake() validation = fake() test = fake() return base.Datasets(train=train, validation=validation, test=test) TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' TEST_IMAGES = 't10k-images-idx3-ubyte.gz' TEST_LABELS = 't10k-labels-idx1-ubyte.gz' local_file = base.maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES) train_images = extract_data(local_file, 60000) local_file = base.maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS) train_labels = extract_labels(local_file, 60000) local_file = base.maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES) test_images = extract_data(local_file, 10000) local_file = base.maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS) test_labels = extract_labels(local_file, 10000) if not 0 <= validation_size <= len(train_images): raise ValueError( 'Validation size should be between 0 and {}. Received: {}.'.format( len(train_images), validation_size)) # Note: the test split doubles as the validation set here, and no separate test DataSet is returned. validation_images = test_images validation_labels = test_labels # Convert dense labels to one-hot labels here: train_labels_tmp = numpy.zeros((train_labels.shape[0], NUM_LABELS)) train_labels_tmp[numpy.arange(len(train_labels_tmp)), train_labels] += 1 valid_labels_tmp = numpy.zeros((validation_labels.shape[0], NUM_LABELS)) valid_labels_tmp[numpy.arange(len(valid_labels_tmp)), validation_labels] += 1 train = DataSet(train_images, train_labels_tmp, dtype=dtype, reshape=reshape) validation = DataSet(validation_images, valid_labels_tmp, dtype=dtype, reshape=reshape) return base.Datasets(train=train, validation=validation, test=None)
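The dense-to-one-hot conversion above is plain NumPy fancy indexing; shown in isolation (NUM_LABELS replaced by a literal for the demo):

    import numpy as np

    labels = np.array([2, 0, 1])  # dense class indices
    num_labels = 3
    one_hot = np.zeros((labels.shape[0], num_labels))
    one_hot[np.arange(labels.shape[0]), labels] += 1
    # one_hot is now [[0, 0, 1], [1, 0, 0], [0, 1, 0]]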
def read_data_sets(data_dir, train_batch_size, dev_batch_size, test_batch_size, numcep, numcontext, thread_count=8, limit_dev=0, limit_test=0, limit_train=0, sets=[]): # Check if we can convert FLAC with SoX before we start # (check_output returns bytes under Python 3, so decode before searching) sox_help_out = subprocess.check_output(["sox", "-h"]).decode("utf-8", "replace") if sox_help_out.find("flac") == -1: print("Error: SoX doesn't support FLAC. Please install SoX with FLAC support and try again.") exit(1) # Conditionally download data to data_dir print("Downloading Librivox data sets if not already present...") with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar: TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz" TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz" TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz" DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz" DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz" TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz" TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz" def filename_of(x): return path.split(x)[1] train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL) bar.update(0) train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL) bar.update(1) train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL) bar.update(2) dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL) bar.update(3) dev_other = base.maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL) bar.update(4) test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL) bar.update(5) test_other = base.maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL) bar.update(6) # Conditionally extract LibriSpeech data # We extract each archive into data_dir, but test for existence in # data_dir/LibriSpeech because the archives share that root.
print("Extracting librivox data if not already extracted...") with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar: LIBRIVOX_DIR = "LibriSpeech" work_dir = os.path.join(data_dir, LIBRIVOX_DIR) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100) bar.update(0) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360) bar.update(1) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500) bar.update(2) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"), dev_clean) bar.update(3) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"), dev_other) bar.update(4) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"), test_clean) bar.update(5) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"), test_other) bar.update(6) # Conditionally convert FLAC data to wav, from: # data_dir/LibriSpeech/split/1/2/1-2-3.flac # to: # data_dir/LibriSpeech/split-wav/1-2-3.wav print("Converting Librivox data from flac to wav if not already converted...") with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar: _maybe_convert_wav(work_dir, "train-clean-100", "train-clean-100-wav") bar.update(0) _maybe_convert_wav(work_dir, "train-clean-360", "train-clean-360-wav") bar.update(1) _maybe_convert_wav(work_dir, "train-other-500", "train-other-500-wav") bar.update(2) _maybe_convert_wav(work_dir, "dev-clean", "dev-clean-wav") bar.update(3) _maybe_convert_wav(work_dir, "dev-other", "dev-other-wav") bar.update(4) _maybe_convert_wav(work_dir, "test-clean", "test-clean-wav") bar.update(5) _maybe_convert_wav(work_dir, "test-other", "test-other-wav") bar.update(6) # Conditionally split LibriSpeech transcriptions, from: # data_dir/LibriSpeech/split/1/2/1-2.trans.txt # to: # data_dir/LibriSpeech/split-wav/1-2-0.txt # data_dir/LibriSpeech/split-wav/1-2-1.txt # data_dir/LibriSpeech/split-wav/1-2-2.txt # ... print("Splitting transcriptions if not already split ...") with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar: _maybe_split_transcriptions(work_dir, "train-clean-100", "train-clean-100-wav") bar.update(0) _maybe_split_transcriptions(work_dir, "train-clean-360", "train-clean-360-wav") bar.update(1) _maybe_split_transcriptions(work_dir, "train-other-500", "train-other-500-wav") bar.update(2) _maybe_split_transcriptions(work_dir, "dev-clean", "dev-clean-wav") bar.update(3) _maybe_split_transcriptions(work_dir, "dev-other", "dev-other-wav") bar.update(4) _maybe_split_transcriptions(work_dir, "test-clean", "test-clean-wav") bar.update(5) _maybe_split_transcriptions(work_dir, "test-other", "test-other-wav") bar.update(6) print("Finished pre-processing librivox. Initializing dataset...") # Create train DataSet from all the train archives train = None if "train" in sets: train = _read_data_set(work_dir, "train-*-wav", thread_count, train_batch_size, numcep, numcontext, limit=limit_train) # Create dev DataSet from all the dev archives dev = None if "dev" in sets: dev = _read_data_set(work_dir, "dev-*-wav", thread_count, dev_batch_size, numcep, numcontext, limit=limit_dev) # Create test DataSet from all the test archives test = None if "test" in sets: test = _read_data_set(work_dir, "test-*-wav", thread_count, test_batch_size, numcep, numcontext, limit=limit_test) # Return DataSets return DataSets(train, dev, test)
def read_data_sets( train_dir, shard_index, fake_data=False, one_hot=False, dtype=dtypes.float32, reshape=True, validation_size=5000, ): if fake_data: def fake(): return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype) train = fake() validation = fake() test = fake() return base.Datasets(train=train, validation=validation, test=test) TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' TEST_IMAGES = 't10k-images-idx3-ubyte.gz' TEST_LABELS = 't10k-labels-idx1-ubyte.gz' local_file = base.maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES) with open(local_file, 'rb') as f: train_images = extract_images(f) local_file = base.maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS) with open(local_file, 'rb') as f: train_labels = extract_labels(f, one_hot=one_hot) local_file = base.maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES) with open(local_file, 'rb') as f: test_images = extract_images(f) local_file = base.maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS) with open(local_file, 'rb') as f: test_labels = extract_labels(f, one_hot=one_hot) if not 0 <= validation_size <= len(train_images): raise ValueError( 'Validation size should be between 0 and {}. Received: {}.'.format( len(train_images), validation_size)) validation_images = train_images[:validation_size] validation_labels = train_labels[:validation_size] train_images = train_images[validation_size:] train_labels = train_labels[validation_size:] # Fancy-index this worker's shard out of the remaining training data. sharded_train_images = train_images[shard_index] sharded_train_labels = train_labels[shard_index] train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape) validation = DataSet(validation_images, validation_labels, dtype=dtype, reshape=reshape) test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape) sharded_train = DataSet(sharded_train_images, sharded_train_labels, dtype=dtype, reshape=reshape) # The sharded subset replaces the full training set in the returned Datasets. train = sharded_train return base.Datasets(train=train, validation=validation, test=test)
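The shard_index argument above is consumed by NumPy fancy indexing, so any integer index array works. A small sketch of deriving such an index for one worker out of n, using round-robin striding (parameter names are illustrative):

    import numpy as np

    def shard_indices(n_examples, worker_id, n_workers):
        # Every n_workers-th example, starting at this worker's offset.
        return np.arange(n_examples)[worker_id::n_workers]

    print(shard_indices(10, worker_id=1, n_workers=4))  # [1 5 9]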
def _load_data(self): work_directory = '.faces_data' images_path = maybe_download('img_align_celeba.zip', work_directory, FACES_IMAGES_URL) labels_path = maybe_download('list_attr_celeba.txt', work_directory, FACES_LABELS_URL) # Load labels. image_count = 0 attributes = [] attributes_classes = ['Male', 'Young', 'Smiling', 'Attractive'] label_map = {} with open(labels_path, 'r') as labels_file: for line_no, line in enumerate(labels_file): if line_no == 0: # Parse example count. image_count = int(line) continue elif line_no == 1: # Parse header. attributes = line.split() continue # Parse line and determine class label. line = line.split() if self.options.dataset_random_labels: label = (line_no - 2) % self.class_count else: label = 0 for index, attribute in enumerate(attributes_classes): value = int(line[attributes.index(attribute) + 1]) if value == 1: label += 2**index if label > 9: continue label_map[line[0]] = label # Load images. images = np.zeros( [image_count, self.width * self.height * self.channels], dtype=np.float32) labels = np.zeros([image_count], dtype=np.int8) with zipfile.ZipFile(images_path, 'r') as images_zip: image_infos = images_zip.infolist() index = 0 progress = tqdm.tqdm(total=image_count, leave=False) for image_info in image_infos: if not image_info.filename.endswith('.jpg'): continue label = label_map.get(os.path.basename(image_info.filename), None) if label is None: continue with images_zip.open(image_info) as image_file: image = imread(image_file).astype(np.float32) # Resize image to target dimensions. h, w = image.shape[:2] image = imresize( image, [int((float(h) / w) * self.width), self.width]) j = int(round((image.shape[0] - self.height) / 2.)) image = image[j:j + self.height, :, :] image = image / 255. images[index, :] = image.flatten() labels[index] = label index += 1 progress.update() image_count = index # index already equals the number of images kept; "+ 1" would retain one empty row images = images[:image_count] labels = labels[:image_count] progress.close() print('Image count:', index) print('Values: min={} max={} mean={}'.format(np.min(images), np.max(images), np.mean(images))) print('Class distribution:') for label, count in zip(*np.unique(labels, return_counts=True)): print(' {}: {}'.format(label, count)) train = DataWrapper(images, labels) test = DataWrapper(images[:1000], labels[:1000]) validation = DataWrapper(np.asarray([]), np.asarray([])) return Datasets(train=train, test=test, validation=validation)
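The label construction above packs four binary CelebA attributes into bits 0 through 3 of a single class id, then keeps only ids 0 to 9. The encoding in isolation (attribute values are +1/-1, as in list_attr_celeba.txt):

    ATTRS = ['Male', 'Young', 'Smiling', 'Attractive']

    def encode_label(values):
        # values: dict mapping attribute name to +1 or -1.
        label = 0
        for bit, attr in enumerate(ATTRS):
            if values[attr] == 1:
                label += 2 ** bit
        return label

    # Male and Smiling set bits 0 and 2: label = 1 + 4 = 5
    print(encode_label({'Male': 1, 'Young': -1, 'Smiling': 1, 'Attractive': -1}))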
def read_data_sets(data_dir, batch_size, numcep, numcontext, thread_count=8, limit_dev=0, limit_test=0, limit_train=0): # Check if we can convert FLAC with SoX before we start # (check_output returns bytes under Python 3, so decode before searching) sox_help_out = subprocess.check_output(["sox", "-h"]).decode("utf-8", "replace") if sox_help_out.find("flac") == -1: print("Error: SoX doesn't support FLAC. Please install SoX with FLAC support and try again.") exit(1) # Conditionally download data to data_dir TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz" TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz" TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz" DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz" DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz" TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz" TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz" train_clean_100 = base.maybe_download("train-clean-100.tar.gz", data_dir, TRAIN_CLEAN_100_URL) train_clean_360 = base.maybe_download("train-clean-360.tar.gz", data_dir, TRAIN_CLEAN_360_URL) train_other_500 = base.maybe_download("train-other-500.tar.gz", data_dir, TRAIN_OTHER_500_URL) dev_clean = base.maybe_download("dev-clean.tar.gz", data_dir, DEV_CLEAN_URL) dev_other = base.maybe_download("dev-other.tar.gz", data_dir, DEV_OTHER_URL) test_clean = base.maybe_download("test-clean.tar.gz", data_dir, TEST_CLEAN_URL) test_other = base.maybe_download("test-other.tar.gz", data_dir, TEST_OTHER_URL) # Conditionally extract LibriSpeech data # We extract each archive into data_dir, but test for existence in # data_dir/LibriSpeech because the archives share that root. LIBRIVOX_DIR = "LibriSpeech" work_dir = os.path.join(data_dir, LIBRIVOX_DIR) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"), dev_clean) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"), dev_other) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"), test_clean) _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"), test_other) # Conditionally convert FLAC data to wav, from: # data_dir/LibriSpeech/split/1/2/1-2-3.flac # to: # data_dir/LibriSpeech/split-wav/1-2-3.wav _maybe_convert_wav(work_dir, "train-clean-100", "train-clean-100-wav") _maybe_convert_wav(work_dir, "train-clean-360", "train-clean-360-wav") _maybe_convert_wav(work_dir, "train-other-500", "train-other-500-wav") _maybe_convert_wav(work_dir, "dev-clean", "dev-clean-wav") _maybe_convert_wav(work_dir, "dev-other", "dev-other-wav") _maybe_convert_wav(work_dir, "test-clean", "test-clean-wav") _maybe_convert_wav(work_dir, "test-other", "test-other-wav") # Conditionally split LibriSpeech transcriptions, from: # data_dir/LibriSpeech/split/1/2/1-2.trans.txt # to: # data_dir/LibriSpeech/split-wav/1-2-0.txt # data_dir/LibriSpeech/split-wav/1-2-1.txt # data_dir/LibriSpeech/split-wav/1-2-2.txt # ...
_maybe_split_transcriptions(work_dir, "train-clean-100", "train-clean-100-wav") _maybe_split_transcriptions(work_dir, "train-clean-360", "train-clean-360-wav") _maybe_split_transcriptions(work_dir, "train-other-500", "train-other-500-wav") _maybe_split_transcriptions(work_dir, "dev-clean", "dev-clean-wav") _maybe_split_transcriptions(work_dir, "dev-other", "dev-other-wav") _maybe_split_transcriptions(work_dir, "test-clean", "test-clean-wav") _maybe_split_transcriptions(work_dir, "test-other", "test-other-wav") # Create train DataSet from all the train archives train = _read_data_set(work_dir, "train-*-wav", thread_count, batch_size, numcep, numcontext, limit=limit_train) # Create dev DataSet from all the dev archives dev = _read_data_set(work_dir, "dev-*-wav", thread_count, batch_size, numcep, numcontext, limit=limit_dev) # Create test DataSet from all the test archives test = _read_data_set(work_dir, "test-*-wav", thread_count, batch_size, numcep, numcontext, limit=limit_test) # Return DataSets return DataSets(train, dev, test)
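A hypothetical call to this variant, which reuses one batch_size for all three splits (the path and feature parameters below are illustrative, not values from the source):

    datasets = read_data_sets('data/librispeech', batch_size=32,
                              numcep=26, numcontext=9)
    # datasets is the DataSets(train, dev, test) container returned above.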
def load_mnist(train_dir, validation_size=5000): SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' TEST_IMAGES = 't10k-images-idx3-ubyte.gz' TEST_LABELS = 't10k-labels-idx1-ubyte.gz' local_file = base.maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES) with open(local_file, 'rb') as f: train_images = extract_images(f) local_file = base.maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS) with open(local_file, 'rb') as f: train_labels = extract_labels(f) local_file = base.maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES) with open(local_file, 'rb') as f: test_images = extract_images(f) local_file = base.maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS) with open(local_file, 'rb') as f: test_labels = extract_labels(f) if not 0 <= validation_size <= len(train_images): raise ValueError( 'Validation size should be between 0 and {}. Received: {}.'.format( len(train_images), validation_size)) validation_images = train_images[:validation_size] validation_labels = train_labels[:validation_size] train_images = train_images[validation_size:] train_labels = train_labels[validation_size:] # print(np.shape(train_labels)) # plt.imshow(np.reshape(train_images[100], (28, 28)), cmap='gray', interpolation='none') # train_images = train_images[np.where((train_labels == 3) | (train_labels == 5))[0]] # train_labels = train_labels[np.where((train_labels == 3) | (train_labels == 5))[0]] # test_images = test_images[np.where((test_labels == 3) | (test_labels == 5))[0]] # test_labels = test_labels[np.where((test_labels == 3) | (test_labels == 5))[0]] # validation_images = validation_images[np.where((validation_labels == 3) | (validation_labels == 5))[0]] # validation_labels = validation_labels[np.where((validation_labels == 3) | (validation_labels == 5))[0]] train_images = train_images.astype(np.float32) / 255 validation_images = validation_images.astype(np.float32) / 255 test_images = test_images.astype(np.float32) / 255 # train_labels = label_binarize(train_labels, classes=[3,5])[:,0] # test_labels = label_binarize(test_labels, classes=[3,5])[:,0] # validation_labels = label_binarize(validation_labels, classes=[3,5])[:,0] train = DataSet(train_images, train_labels) validation = DataSet(validation_images, validation_labels) test = DataSet(test_images, test_labels) return base.Datasets(train=train, validation=validation, test=test)
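A hypothetical usage of load_mnist above; the images come back already scaled to [0, 1], so no further normalization is needed. The next_batch call assumes the usual DataSet batching API used elsewhere in these loaders:

    datasets = load_mnist('MNIST-data', validation_size=5000)
    images, labels = datasets.train.next_batch(100)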
def read_data_sets(train_dir, fake_data=False, one_hot=False, dtype=dtypes.float32, reshape=True, validation_size=1): if fake_data: def fake(): return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype) train = fake() validation = fake() test = fake() return base.Datasets(train=train, validation=validation, test=test) TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' #TRAIN_IMAGES = 'Texture_Sample.gz' TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' TEST_IMAGES = 'test-images-idx3-ubyte.gz' #TEST_IMAGES = 'Texture_Sample.gz' TEST_LABELS = 'test-labels-idx1-ubyte.gz' local_file = base.maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES) with open(local_file, 'rb') as f: train_images = extract_images(f) local_file = base.maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS) with open(local_file, 'rb') as f: train_labels = extract_labels(f, one_hot=one_hot) local_file = base.maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES) with open(local_file, 'rb') as f: test_images = extract_images(f) local_file = base.maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS) with open(local_file, 'rb') as f: test_labels = extract_labels(f, one_hot=one_hot) if not 0 <= validation_size <= len(train_images): raise ValueError( 'Validation size should be between 0 and {}. Received: {}.'.format( len(train_images), validation_size)) validation_images = train_images[:validation_size] validation_labels = train_labels[:validation_size] train_images = train_images[validation_size:] train_labels = train_labels[0:train_images.shape[0]] print('=======================================================') print('shape of images :' + str(train_images.shape)) print('shape of labels :' + str(train_labels.shape)) print('=======================================================') train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape) validation = DataSet(validation_images, validation_labels, dtype=dtype, reshape=reshape) test_labels = test_labels[0:test_images.shape[0]] test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape) return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(work_dir, fake_data=False, one_hot=False, dtype=dtypes.float32, reshape=True, validation_size=None, seed=None): if fake_data: def fake(): return DataSet([], [], fake_data=True, image_dims=32 * 32 * 3, num_class=10, one_hot=one_hot, dtype=dtype, seed=seed) train = fake() validation = fake() test = fake() return base.Datasets(train=train, validation=validation, test=test) root_data_dir = os.path.join(work_dir, "cifar-10-batches-py") if not os.path.exists(root_data_dir): # no data directory found # download gz file print( "Trying to download cifar data (if the tar.gz file is not available)" ) gz_fpath = base.maybe_download("cifar-10-python.tar.gz", work_dir, _SOURCE_URL) print("Extracting data in {}".format(root_data_dir)) with tarfile.open(gz_fpath) as tar: tar.extractall(work_dir) else: print("cifar data directory found {}".format(root_data_dir)) print("loading data...") X_train, Y_train, X_test, Y_test = load_CIFAR10(root_data_dir) if one_hot: num_class_train = len(np.unique(Y_train)) num_class_test = len(np.unique(Y_test)) assert num_class_test == num_class_train, \ "number of classes mismatch: {} and {}".format(num_class_train, num_class_test) Y_train = dense_to_one_hot(Y_train, num_class_train) Y_test = dense_to_one_hot(Y_test, num_class_test) if validation_size is None: validation_size = int(X_train.shape[0] / 10) # Sample without replacement so the validation set has exactly validation_size rows. valid_idx = np.random.choice(range(X_train.shape[0]), validation_size, replace=False) mask = np.array([ True if row_idx in valid_idx else False for row_idx in range(X_train.shape[0]) ]) X_train, X_valid = X_train[~mask], X_train[mask] Y_train, Y_valid = Y_train[~mask], Y_train[mask] train_dataset = DataSet(X_train, Y_train, one_hot=one_hot, dtype=dtype, reshape=reshape, seed=seed) valid_dataset = DataSet(X_valid, Y_valid, one_hot=one_hot, dtype=dtype, reshape=reshape, seed=seed) test_dataset = DataSet(X_test, Y_test, one_hot=one_hot, dtype=dtype, reshape=reshape, seed=seed) return base.Datasets(train=train_dataset, validation=valid_dataset, test=test_dataset)
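The validation split above hinges on sampling without replacement: with the NumPy default (replace=True) the boolean mask would cover fewer than validation_size rows whenever an index repeats. The split idiom in isolation, with a seeded generator for reproducibility:

    import numpy as np

    def random_split(n_rows, validation_size, seed=None):
        rng = np.random.RandomState(seed)
        valid_idx = rng.choice(n_rows, size=validation_size, replace=False)
        mask = np.zeros(n_rows, dtype=bool)
        mask[valid_idx] = True
        return ~mask, mask  # train mask, validation mask

    train_mask, valid_mask = random_split(50000, 5000, seed=0)

Building the mask by boolean assignment also avoids the quadratic row_idx-in-array scan used in the list comprehension above.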
def read_data_sets(data_path, fake_data=False, one_hot=True, subsets=False, init_probs=[], percentage_train=1., corrupt_labels=False, unbalance=False, unbalance_dict=None, validation_size=5000, source_url=DEFAULT_SOURCE_URL): """ Returns a data provider for a dataset :param data_path: local directory to store data :param fake_data (optional): flag to indicate whether to return fake, empty DataSets :param one_hot (optional): flag to indicate whether data is one-hot encoded :param subsets (optional): flag passed through to the training DataSet :param init_probs (optional): initial per-class probabilities :param percentage_train (optional): percentage of training data to keep :param corrupt_labels (optional): flag to randomly corrupt a share of the training labels :param unbalance (optional): flag to induce class imbalance on two labels :param unbalance_dict (optional): percentage to keep and the two labels to reduce :param validation_size (optional): validation size :param source_url (optional): url where data can be found """ if unbalance_dict is None: unbalance_dict = {"percentage": 20, "label1": 0, "label2": 8} train_dir = data_path class DataSets(object): pass data_sets = DataSets() if fake_data: data_sets.train = DataSet([], [], fake_data=True, one_hot=True) data_sets.val = DataSet([], [], fake_data=True, one_hot=True) data_sets.test = DataSet([], [], fake_data=True, one_hot=True) return data_sets if not source_url: # empty string check source_url = DEFAULT_SOURCE_URL TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' TEST_IMAGES = 't10k-images-idx3-ubyte.gz' TEST_LABELS = 't10k-labels-idx1-ubyte.gz' local_file = base.maybe_download(TRAIN_IMAGES, train_dir, source_url + TRAIN_IMAGES) with gfile.Open(local_file, 'rb') as f: train_images = extract_images(f) local_file = base.maybe_download(TRAIN_LABELS, train_dir, source_url + TRAIN_LABELS) with gfile.Open(local_file, 'rb') as f: train_labels = extract_labels(f, one_hot=one_hot) local_file = base.maybe_download(TEST_IMAGES, train_dir, source_url + TEST_IMAGES) with gfile.Open(local_file, 'rb') as f: test_images = extract_images(f) local_file = base.maybe_download(TEST_LABELS, train_dir, source_url + TEST_LABELS) with gfile.Open(local_file, 'rb') as f: test_labels = extract_labels(f, one_hot=one_hot) if not 0 <= validation_size <= len(train_images): raise ValueError( 'Validation size should be between 0 and {}. Received: {}.'.format( len(train_images), validation_size)) val_images = train_images[:validation_size] val_labels = train_labels[:validation_size] train_images = train_images[validation_size:] train_labels = train_labels[validation_size:] n_test = test_images.shape[0] n_val = val_images.shape[0] n_train = train_images.shape[0] if not init_probs: print('RANDOM INIT PROBABILITIES') probs = np.random.rand(n_train) else: init_probs = np.asarray(init_probs) probs_class = np.asarray(1.0 * init_probs / np.sum(init_probs), np.float32) dense_train_labels = np.argmax(train_labels, axis=1) probs = np.zeros_like(dense_train_labels, np.float32) for k in range(0, np.unique(dense_train_labels).max() + 1): i = np.where(dense_train_labels == k)[0] probs[i] = probs_class[k] train_probs = np.squeeze( normalize(np.expand_dims(probs, 1), axis=0, norm='l1')) val_probs = np.squeeze( normalize(np.expand_dims(np.ones(n_val, np.float32), 1), axis=0, norm='l1')) test_probs = np.squeeze( normalize(np.expand_dims(np.ones(n_test, np.float32), 1), axis=0, norm='l1')) # For experiments with limited amount of data if percentage_train != 1.: train_size = int(percentage_train * train_images.shape[0]) Xtrain_images, Xval_images, ytrain, yval, ptrain, probs_val = train_test_split( train_images, train_labels, train_probs, train_size=train_size, random_state=0) train_images = Xtrain_images train_labels = ytrain train_probs = ptrain # For experiments with class-imbalance distribution if unbalance: print('CLASS-IMBALANCE') n_classes = len(np.unique(np.argmax(train_labels, 1))) reduceto = 0.01 * unbalance_dict['percentage'] label1 = unbalance_dict['label1'] label2 = unbalance_dict['label2'] pick_ids = [] newsize = 0 all_classes = np.arange(0, n_classes) all_classes = np.delete(all_classes, np.where(all_classes == label1)[0]) all_classes = np.delete(all_classes, np.where(all_classes == label2)[0]) for lab in [label1, label2]: allids = np.where(np.argmax(train_labels, 1) == lab)[0] selectedids = np.random.choice(allids, int(reduceto * allids.shape[0]), replace=False) pick_ids.append(selectedids) newsize += len(selectedids) new_ids = convert_list_to_array(pick_ids, newsize) other_ids = [] othersize = 0 for lab in all_classes.tolist(): selectedids = np.where(np.argmax(train_labels, 1) == lab)[0] other_ids.append(selectedids) othersize += len(selectedids) keep_ids = convert_list_to_array(other_ids, othersize) # new_ids: contains the indices of the reduced (imbalance) classes # keep_ids: contains the indices of the rest (keep the same class distribution) resulting_ids = np.concatenate((new_ids, keep_ids)) np.random.shuffle(resulting_ids) train_images = train_images[resulting_ids, ...] train_labels = train_labels[resulting_ids, ...]
train_probs = train_probs[resulting_ids] train_indices = np.zeros(train_labels.shape[0]) val_indices = np.zeros(val_labels.shape[0]) test_indices = np.zeros(test_labels.shape[0]) if corrupt_labels: print('NOISE / CORRUPT LABELS') percentage_corrupted_labels = 30 number_corrupted_labels = int(1.0 * percentage_corrupted_labels / 100 * train_labels.shape[0]) dense_train_labels = np.argmax(train_labels, 1) old_train_labels = np.copy(dense_train_labels) idx_train_labels = np.arange(train_labels.shape[0]) idx_to_be_corrupted = np.random.choice(idx_train_labels, number_corrupted_labels, replace=False) train_indices[idx_to_be_corrupted] = 1 dense_train_labels[idx_to_be_corrupted] += 1 dense_train_labels[np.where(dense_train_labels == 10)[0]] = 0 train_labels = dense_to_one_hot(dense_train_labels, n_class=10) data_sets.train = DataSet(train_images, train_labels, train_probs, train_indices, fake_data=True, one_hot=True, subsets=subsets) data_sets.val = DataSet(val_images, val_labels, val_probs, val_indices, fake_data=True, one_hot=True, subsets=False) data_sets.test = DataSet(test_images, test_labels, test_probs, test_indices, fake_data=True, one_hot=True, subsets=False) return data_sets
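The corruption block above shifts each selected label to the next class and wraps 10 back to 0, i.e. an increment modulo 10. The same scheme as a standalone helper (the rate and seed parameters are illustrative):

    import numpy as np

    def corrupt_labels(dense_labels, fraction=0.3, num_classes=10, seed=None):
        # Shift a random fraction of labels to the next class (mod num_classes).
        rng = np.random.RandomState(seed)
        n_corrupt = int(fraction * dense_labels.shape[0])
        idx = rng.choice(dense_labels.shape[0], n_corrupt, replace=False)
        corrupted = dense_labels.copy()
        corrupted[idx] = (corrupted[idx] + 1) % num_classes
        return corrupted, idx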