def test_sparse_ule():
    skip_if_no_data()
    # Test loading of transfer data
    train, valid, test, transfer = utlc.load_sparse_dataset("ule",
                                                            normalize=True,
                                                            transfer=True)
    assert train.shape[0] == transfer.shape[0]
def test_all_sparse_utlc():
    skip_if_no_data()
    for name in ['harry', 'terry', 'ule']:
        print("Loading sparse", name)
        train, valid, test = utlc.load_sparse_dataset(name, normalize=True)
        nb_elem = numpy.prod(train.shape)
        mi = min(0, train.data.min())
        ma = max(0, train.data.max())
        mean = float(train.data.sum()) / nb_elem
        print(name, "dtype, max, min, mean, nb non-zero, nb element, %sparse")
        print(train.dtype, ma, mi, mean, train.nnz, nb_elem,
              (nb_elem - float(train.nnz)) / nb_elem)
        print(name, "max, min, mean, std (all stats on non-zero elements)")
        print(train.data.max(), train.data.min(),
              train.data.mean(), train.data.std())
        assert scipy.sparse.issparse(train), \
            "train is not sparse for %s dataset" % name
        assert scipy.sparse.issparse(valid), \
            "valid is not sparse for %s dataset" % name
        assert scipy.sparse.issparse(test), \
            "test is not sparse for %s dataset" % name
        assert train.shape[1] == test.shape[1] == valid.shape[1], \
            "shapes of sparse %s dataset do not match" % name
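# A minimal, self-contained sketch (not part of the test suite) of the
# sparsity statistics computed in test_all_sparse_utlc, using a toy
# scipy.sparse matrix in place of the real UTLC data. The density formula
# is the same one used above: fraction sparse = (nb_elem - nnz) / nb_elem.
def _sparsity_stats_example():
    import numpy
    import scipy.sparse

    # 3x4 CSR matrix with 3 non-zero entries
    toy = scipy.sparse.csr_matrix(
        numpy.array([[0., 2., 0., 0.],
                     [1., 0., 0., 0.],
                     [0., 0., 0., 3.]]))
    nb_elem = numpy.prod(toy.shape)  # 12 elements in total
    fraction_sparse = (nb_elem - float(toy.nnz)) / nb_elem
    print(toy.dtype, toy.nnz, nb_elem, fraction_sparse)  # float64 3 12 0.75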
def load_data(conf):
    """
    Loads a specified dataset according to the parameters in the
    configuration dictionary.

    Parameters
    ----------
    conf : dict
        Experiment configuration. Relevant keys include 'dataset' (the
        dataset name), 'sparse', 'normalize' and 'normalize_on_the_fly'.

    Returns
    -------
    list
        The train, valid and test sets (plus a transfer set when the
        loader provides one): scipy sparse matrices for sparse data,
        Theano shared variables otherwise.
    """
    logger.info('... loading dataset')

    # Special case for sparse format
    if conf.get('sparse', False):
        expected = inspect.getargspec(load_sparse_dataset)[0][1:]
        data = load_sparse_dataset(conf['dataset'], **subdict(conf, expected))
        valid, test = data[1:3]

        # Sparse TERRY data on LISA servers contains an extra null first
        # row in valid and test subsets.
        if conf['dataset'] == 'terry':
            valid = valid[1:]
            test = test[1:]
            assert valid.shape[0] == test.shape[0] == 4096, \
                'Sparse TERRY data loaded has wrong number of examples'

        if len(data) == 3:
            return [data[0], valid, test]
        else:
            return [data[0], valid, test, data[3]]

    # Load as the usual ndarray
    expected = inspect.getargspec(load_ndarray_dataset)[0][1:]
    data = load_ndarray_dataset(conf['dataset'], **subdict(conf, expected))

    # Special case for on-the-fly normalization
    if conf.get('normalize_on_the_fly', False):
        return data

    # Allocate shared variables
    def shared_dataset(data_x):
        """Function that loads the dataset into shared variables"""
        if conf.get('normalize', True):
            return sharedX(data_x, borrow=True)
        else:
            return theano.shared(theano._asarray(data_x), borrow=True)

    return list(map(shared_dataset, data))
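# A minimal usage sketch for load_data. The exact set of configuration keys
# accepted depends on the signature of the underlying loader (they are
# filtered through subdict); 'dataset', 'sparse' and 'normalize' are the
# keys read directly above, and 'ule' is one of the UTLC dataset names
# exercised by the tests.
def _load_data_example():
    conf = {
        'dataset': 'ule',   # one of the UTLC dataset names
        'sparse': True,     # route through load_sparse_dataset
        'normalize': True,  # forwarded to the loader via subdict
    }
    train, valid, test = load_data(conf)[:3]
    print(train.shape, valid.shape, test.shape)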