def load_data(conf):
    """
    Load a specified dataset according to the parameters in the
    configuration dictionary.

    Parameters
    ----------
    conf : dict
        Configuration dictionary naming the dataset to load ('dataset')
        and loading options such as 'sparse', 'normalize' and
        'normalize_on_the_fly'.

    Returns
    -------
    data : list
        The dataset splits (train, valid, test, and optionally a fourth
        subset), either as raw/sparse arrays or wrapped in Theano shared
        variables.
    """
    logger.info('... loading dataset')

    # Special case for sparse format
    if conf.get('sparse', False):
        expected = inspect.getargspec(load_sparse_dataset)[0][1:]
        data = load_sparse_dataset(conf['dataset'], **subdict(conf, expected))
        valid, test = data[1:3]

        # Sparse TERRY data on LISA servers contains an extra null first row
        # in valid and test subsets.
        if conf['dataset'] == 'terry':
            valid = valid[1:]
            test = test[1:]
            assert valid.shape[0] == test.shape[0] == 4096, \
                'Sparse TERRY data loaded has wrong number of examples'

        if len(data) == 3:
            return [data[0], valid, test]
        else:
            return [data[0], valid, test, data[3]]

    # Load as the usual ndarray
    expected = inspect.getargspec(load_ndarray_dataset)[0][1:]
    data = load_ndarray_dataset(conf['dataset'], **subdict(conf, expected))

    # Special case for on-the-fly normalization
    if conf.get('normalize_on_the_fly', False):
        return data

    # Allocate shared variables
    def shared_dataset(data_x):
        """Load one dataset split into a Theano shared variable."""
        if conf.get('normalize', True):
            return sharedX(data_x, borrow=True)
        else:
            return theano.shared(theano._asarray(data_x), borrow=True)

    return map(shared_dataset, data)
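# Example usage (a minimal sketch; the conf values below are illustrative --
# any dataset name and options understood by load_sparse_dataset /
# load_ndarray_dataset can be passed through the dictionary):
#
#     conf = {'dataset': 'terry', 'sparse': True, 'normalize': True}
#     train_set, valid_set, test_set = load_data(conf)[:3]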
def fromdict(cls, conf, **kwargs):
    """
    Alternative way to build a block, by using a dictionary.
    """
    arglist = []
    kwargs.update(conf)
    # Loop over all superclasses of cls
    # NB: assumes that "cls" is the first element returned by "getmro()"
    for elem in inspect.getmro(cls):
        # Extend arglist with the arguments of elem.__init__
        argspec = inspect.getargspec(elem.__init__)
        arglist.extend(argspec[0])
        # If elem.__init__ does not accept a **kwargs argument, stop
        # traversing the MRO: extra arguments cannot be passed further up.
        if argspec[2] is None:
            break
    # Build the class with the appropriate arguments
    return cls(**subdict(kwargs, arglist))
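# Example usage (sketch; `SomeBlock` and its constructor arguments are
# hypothetical -- fromdict simply filters `conf` down to the keyword
# arguments accepted somewhere along the class's __init__ MRO chain):
#
#     conf = {'nvis': 784, 'nhid': 100, 'irrelevant_key': 42}
#     block = SomeBlock.fromdict(conf)  # 'irrelevant_key' is dropped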
def create_submission(conf, transform_valid, transform_test=None, features=None):
    """
    Create a submission file given a configuration dictionary and a
    computation function.

    Note that it always reloads the datasets to ensure valid & test are
    not permuted.

    Parameters
    ----------
    conf : dict
        Configuration dictionary (see `load_data`).
    transform_valid : callable
        Function mapping the valid set to its representation.
    transform_test : callable, optional
        Function mapping the test set to its representation; defaults to
        `transform_valid`.
    features : array_like, optional
        Indices of the feature columns to keep before transforming.
    """
    if transform_test is None:
        transform_test = transform_valid

    # Load the dataset, without permuting valid and test
    kwargs = subdict(
        conf, ['dataset', 'normalize', 'normalize_on_the_fly', 'sparse'])
    kwargs.update(randomize_valid=False, randomize_test=False)
    valid_set, test_set = load_data(kwargs)[1:3]

    # Sparse datasets are not stored as Theano shared vars.
    if not conf.get('sparse', False):
        valid_set = valid_set.get_value(borrow=True)
        test_set = test_set.get_value(borrow=True)

    # Prefilter features, if needed.
    if features is not None:
        valid_set = valid_set[:, features]
        test_set = test_set[:, features]

    # Valid and test representations
    valid_repr = transform_valid(valid_set)
    test_repr = transform_test(test_set)

    # Convert into text info
    save_submission(conf, valid_repr, test_repr)
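# Example usage (sketch; `encoder.perform` is hypothetical -- any callable
# mapping an (n_examples, n_features) array to its learned representation
# can be passed as the transform):
#
#     conf = {'dataset': 'terry', 'sparse': True}
#     create_submission(conf, transform_valid=encoder.perform)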