def testConv1DOutputSize(self):
    try:
        x = ftensor3('x')  # batch, channels, dim
        s = (None, 15, 94)
        filters = 25
        filter_size = 2
        padding = 2
        stride = 2
        conv1 = Conv1D(inputs=(s, x), n_filters=filters, filter_size=filter_size,
                       padding=padding, stride=stride, outdir=None)
        f1 = function(inputs=[x], outputs=conv1.get_outputs().shape, allow_input_downcast=True)
        x1 = np.ones((100, 15, 94))
        outs = f1(x1)
        self.compareSizes(outs=outs, output_size=conv1.output_size, in_size=s, batches=100)
    finally:
        if 'x' in locals():
            del x
        if 'conv1' in locals():
            del conv1
        if 'f1' in locals():
            del f1
        if 'outs' in locals():
            del outs
        if 'x1' in locals():
            del x1
def testConv2DOutputSize(self):
    try:
        x = ftensor4('x')  # batch, channels, height, width
        s = (None, 3, 25, 32)
        filters = 25
        filter_size = 5
        padding = 3
        stride = 3
        conv1 = Conv2D(inputs=(s, x), n_filters=filters, filter_size=filter_size,
                       padding=padding, stride=stride, outdir=None)
        f1 = function(inputs=[x], outputs=conv1.get_outputs().shape, allow_input_downcast=True)
        x1 = np.ones((100, 3, 25, 32))
        outs = f1(x1)
        self.compareSizes(outs=outs, output_size=conv1.output_size, in_size=s, batches=100)
    finally:
        if 'x' in locals():
            del x
        if 'conv1' in locals():
            del conv1
        if 'f1' in locals():
            del f1
        if 'outs' in locals():
            del outs
        if 'x1' in locals():
            del x1
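
# The two tests above compare the shape reported by the compiled graph against
# ``conv1.output_size``. A minimal sketch of the arithmetic such a check presumably relies on
# (the internals of ``compareSizes``/``Conv1D``/``Conv2D`` are not shown here, so this formula
# is an assumption, not OpenDeep's implementation):
def _expected_conv_output_size(in_size, filter_size, padding, stride):
    """Standard convolution output length: floor((n + 2*pad - filter) / stride) + 1."""
    return (in_size + 2 * padding - filter_size) // stride + 1

# e.g. testConv1DOutputSize: dim 94, filter 2, pad 2, stride 2 -> 49
assert _expected_conv_output_size(94, 2, 2, 2) == 49
# e.g. testConv2DOutputSize: height 25, filter 5, pad 3, stride 3 -> 9
assert _expected_conv_output_size(25, 5, 3, 3) == 9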
def compile_run_fn(self):
    """
    Helper to compile the `f_run` function used by `run()` for computing the model's outputs
    given inputs. Sets the compiled function on the `self.f_run` attribute.

    .. note::
        The run function defaults like so::

            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=self.get_outputs(),
                                  updates=self.get_updates(),
                                  name='f_run')
    """
    if not hasattr(self, 'f_run'):
        log.debug("Compiling f_run...")
        t = time.time()
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=self.get_outputs(),
                              updates=self.get_updates(),
                              name='f_run')
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    else:
        log.warn('f_run already exists!')
def generate(self, initial=None, n_steps=None):
    """
    Generate visible inputs from the model for `n_steps` and starting at recurrent hidden state `initial`.

    Parameters
    ----------
    initial : tensor
        Recurrent hidden state to start generation from.
    n_steps : int
        Number of generation steps to do.

    Returns
    -------
    tuple(array_like, array_like)
        The generated inputs and the ending recurrent hidden states.
    """
    # compile the generate function!
    if not hasattr(self, 'f_generate'):
        log.debug("compiling f_generate...")
        self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                   outputs=[self.x_ts, self.u_t],
                                   updates=self.updates_generate)
        log.debug("compilation done!")

    initial = initial or self.u0.eval()
    n_steps = n_steps or self.generate_n_steps
    return self.f_generate(initial, n_steps)
def run(self, input):
    """
    This method will return the Prototype's output (run through the `f_run` function), given an input.
    The input comes from all unique inputs to the models in the Prototype as calculated from
    `get_inputs()`, and the outputs are computed similarly from `get_outputs()`.

    Try to avoid re-compiling the theano function created for run - check a `hasattr(self, 'f_run')`
    or something similar first.

    Parameters
    ----------
    input : array_like
        Theano/numpy tensor-like object that is the input into the model's computation graph.

    Returns
    -------
    array_like
        Theano/numpy tensor-like object that is the output of the model's computation graph.
    """
    # make sure the input is raised to a list - we are going to splat it!
    input = raise_to_list(input)
    # first check if we already made an f_run function
    if hasattr(self, 'f_run'):
        return self.f_run(*input)
    # otherwise, compile it!
    else:
        inputs = self.get_inputs()
        outputs = self.get_outputs()
        updates = self.get_updates()
        t = time.time()
        log.info("Compiling f_run...")
        self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
        log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
        return self.f_run(*input)
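
# ``raise_to_list`` is an OpenDeep helper that is not shown here; the way it is used above
# (so that ``*input`` can splat one or many inputs into the compiled function) suggests it simply
# wraps a single value in a list and leaves lists and None alone. A sketch of that assumed behaviour:
def raise_to_list_sketch(value):
    if value is None:
        return None
    return value if isinstance(value, (list, tuple)) else [value]

assert raise_to_list_sketch(3) == [3]
assert raise_to_list_sketch([3, 4]) == [3, 4]
assert raise_to_list_sketch(None) is None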
def _compile_csl_fn():
    """
    BUG HERE, not doing properly by chains (still has the bug, I don't see it)
    This is taking too much GPU mem

    mean: N(# of chains)*K(samples per chain)*D(data dim)
    minibatch: M(# of examples)*D (data dim)
    M * N matrix where each element is LL of one example against one chain.

    This function is for computing CSL over parallel chains of minibatches.

    Returns
    -------
    theano function
        Function computing M * N matrix where each element is LL of one example against one chain.
    """
    # when means is a 3D tensor (N, K, D)
    # When there are N chains, each chain having K samples of dimension D
    log.debug('building theano fn for Bernoulli CSL')
    means = T.tensor3('chains')
    minibatch = T.matrix('inputs')
    # how many chains CSL average over
    N = 5
    # minibatch size
    M = 10
    # data dim
    D = 784
    minibatch.tag.test_value = as_floatX(numpy.random.binomial(1, 0.5, size=(M, D)))
    # chain length
    K = 100
    means.tag.test_value = as_floatX(numpy.random.uniform(size=(N, K, D)))

    # computing LL
    # the length of each chain
    sample_size = means.shape[1]

    _minibatch = minibatch.dimshuffle(0, 'x', 'x', 1)
    _means = means.dimshuffle('x', 0, 1, 2)

    A = T.log(sample_size)
    B = _minibatch * T.log(_means) + (1. - _minibatch) * T.log(1. - _means)
    C = B.sum(axis=3)
    D = log_sum_exp_theano(C, axis=2)
    E = D - A
    # G = E.mean(axis=1)

    f = function(inputs=[minibatch, means],
                 outputs=E,
                 name='CSL_independent_bernoulli_fn')
    return f
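
# A small numpy restatement of the symbolic graph above, handy for sanity-checking the broadcasting.
# For each example m and chain n it computes
#   LL[m, n] = logsumexp_k( sum_d x[m,d]*log(mu[n,k,d]) + (1 - x[m,d])*log(1 - mu[n,k,d]) ) - log(K)
# This is only an illustrative sketch of the intended math, not OpenDeep's implementation.
import numpy as np

def csl_bernoulli_numpy(minibatch, means):
    """minibatch: (M, D) binary data; means: (N, K, D) Bernoulli means from N chains of length K."""
    K = means.shape[1]
    x = minibatch[:, None, None, :]                                          # (M, 1, 1, D)
    mu = means[None, :, :, :]                                                # (1, N, K, D)
    per_sample = (x * np.log(mu) + (1. - x) * np.log(1. - mu)).sum(axis=3)   # (M, N, K)
    # stable log-mean-exp over the K samples of each chain
    m = per_sample.max(axis=2, keepdims=True)
    return m.squeeze(axis=2) + np.log(np.exp(per_sample - m).sum(axis=2)) - np.log(K)   # (M, N)

ll = csl_bernoulli_numpy(np.random.binomial(1, 0.5, size=(10, 784)).astype('float64'),
                         np.random.uniform(0.05, 0.95, size=(5, 100, 784)))
assert ll.shape == (10, 5)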
def _compile_csl_fn_v2(mu):
    """
    p(x) = sum_h p(x|h)p(h) where p(x|h) is independent Bernoulli with a vector mu, mu_i for dim_i

    This function is for computing CSL over minibatches (in a single chain).

    Parameters
    ----------
    mu : array_like
        mu is (N,D) numpy array

    Returns
    -------
    theano function
        Function computing the Bernoulli CSL log likelihood.
    """
    # log.debug('building theano fn for Bernoulli CSL')
    x = T.fmatrix('inputs')
    x.tag.test_value = as_floatX(numpy.random.uniform(size=(10, 784)))
    mu = numpy.clip(mu, 1e-10, (1 - (1e-5)))
    mu = mu[None, :, :]

    inner_1 = numpy.log(mu)
    inner_2 = numpy.log(1. - mu)

    k = mu.shape[1]
    D = mu.shape[2]

    # there are two terms in the log(p(x|mu))
    term_1 = -T.log(k)
    c = T.sum(x.dimshuffle(0, 'x', 1) * inner_1 + (1. - x.dimshuffle(0, 'x', 1)) * inner_2, axis=2)
    debug = c.sum(axis=1)
    term_2 = log_sum_exp_theano(c, axis=1)

    log_likelihood = term_1 + term_2
    f = function([x], log_likelihood, name='CSL_independent_bernoulli_fn')
    return f
def run(self, input):
    """
    This method will return the model's output (run through the function), given an input. In the case
    that input_hooks or hidden_hooks are used, the function should use them appropriately and assume
    they are the input.

    Try to avoid re-compiling the theano function created for run - check a hasattr(self, 'f_run') or
    something similar first. I recommend creating your theano f_run in a create_computation_graph
    method to be called after the class initializes.

    :param input: Theano/numpy tensor-like object that is the input into the model's computation graph.
    :type input: tensor

    :return: Theano/numpy tensor-like object that is the output of the model's computation graph.
    :rtype: tensor
    """
    # set any noise switches to zero
    if len(self.get_noise_switch()) > 0:
        vals = [switch.get_value() for switch in self.get_noise_switch()]
        [switch.set_value(0.) for switch in self.get_noise_switch()]
    # check if the run function is already compiled, otherwise compile it!
    if not hasattr(self, 'f_run'):
        log.debug("Compiling f_run...")
        t = time.time()
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=self.get_outputs(),
                              updates=self.get_updates())
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    # because we use the splat to account for multiple inputs to the function, make sure input is a list.
    input = raise_to_list(input)
    # return the results of the run function!
    output = self.f_run(*input)
    # reset the noise switches
    if len(self.get_noise_switch()) > 0:
        [switch.set_value(val) for switch, val in zip(self.get_noise_switch(), vals)]
    return output
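
# The save -> zero -> restore dance around ``self.f_run`` above is a common pattern for
# temporarily disabling stochastic pieces (dropout, input noise) at inference time. A minimal,
# library-agnostic sketch of the same idea as a context manager (the ``Switch`` class here is
# purely illustrative, not OpenDeep's noise-switch object):
from contextlib import contextmanager

class Switch(object):
    def __init__(self, value):
        self._value = value
    def get_value(self):
        return self._value
    def set_value(self, value):
        self._value = value

@contextmanager
def noise_off(switches):
    saved = [s.get_value() for s in switches]
    try:
        for s in switches:
            s.set_value(0.)
        yield
    finally:
        for s, v in zip(switches, saved):
            s.set_value(v)

# usage sketch
switches = [Switch(0.5), Switch(0.3)]
with noise_off(switches):
    assert all(s.get_value() == 0. for s in switches)
assert [s.get_value() for s in switches] == [0.5, 0.3]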
def generate(self, initial=None, n_steps=None):
    """
    Generate visible inputs from the model for n_steps and starting at recurrent hidden state initial

    :param initial: recurrent hidden state to start generation from
    :type initial: tensor

    :param n_steps: number of generation steps to do
    :type n_steps: int

    :return: the generated inputs and the ending recurrent hidden state
    :rtype: matrix, matrix
    """
    # compile the generate function!
    if not hasattr(self, 'f_generate'):
        self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                   outputs=[self.v_ts, self.u_t],
                                   updates=self.updates_generate)

    initial = initial or self.u0.eval()
    n_steps = n_steps or self.generate_n_steps
    return self.f_generate(initial, n_steps)
def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False):
    """
    This method performs the training!!! It is an online training method that goes over minibatches
    from the dataset for a number of epochs, updating parameters after each minibatch.

    You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and
        evaluate on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just
        outputs to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    continue_training : bool
        Whether to continue training from a previous point.
    """
    ###############################################
    # theano index variable to use on the dataset #
    ###############################################
    # index to a [mini]batch - both start and end
    data_idx = T.iscalar('data_index')
    data_end_idx = T.iscalar('data_end_index')
    function_input = [data_idx, data_end_idx]
    batch_slice = slice(data_idx, data_end_idx)

    # compute number of minibatches for training, validation and testing
    # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
    # could be a list of shared variables (like multiple sequences from files)
    train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
    valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
    test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

    # train_batches is going to be lists of tuples that contain the start and end indices for train data.
    # this is more useful in the case of datasets that are lists of sequences, so that the start and end
    # indices can make sure a batch does not cross the sequence boundary on the concatenated data
    train_data_lens = [shape[0] for shape in train_data_shapes]
    self.train_batches = self._get_batch_indices(train_data_lens)

    if valid_data_shapes is not None:
        valid_data_lens = [shape[0] for shape in valid_data_shapes]
        self.valid_batches = self._get_batch_indices(valid_data_lens)
    else:
        self.valid_batches = None

    if test_data_shapes is not None:
        test_data_lens = [shape[0] for shape in test_data_shapes]
        self.test_batches = self._get_batch_indices(test_data_lens)
    else:
        self.test_batches = None

    # create the givens for the input function as pairs of (input_variable: sliced_data)
    train_givens = self._get_givens_subset(TRAIN, batch_slice)
    valid_givens = self._get_givens_subset(VALID, batch_slice)
    test_givens = self._get_givens_subset(TEST, batch_slice)

    # Now time to create the gradient updates for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    train_updates = []
    self.gradients = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients, _ = self.model.get_gradient(cost=train_cost)
        self.gradients.append(gradients)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates
        train_updates.append(updates)

    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    train_functions = []
    for i in range(len(train_costs)):
        updates = train_updates[i]
        train_cost = train_costs[i]
        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, len(train_updates), str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[train_cost] + self.train_monitors_dict.values(),
                           givens=train_givens,
                           name='f_learn_%d' % i)
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
        train_functions.append(f_learn)

    # figure out if we want valid and test
    self.valid_flag = (self.dataset.getSubset(VALID)[0] is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.getSubset(TEST)[0] is not None) and (len(self.test_monitors_dict) > 0)

    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(inputs=function_input,
                                               updates=self.model.get_updates(),
                                               outputs=self.valid_monitors_dict.values(),
                                               givens=valid_givens,
                                               name='valid_monitor_function')
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(inputs=function_input,
                                              updates=self.model.get_updates(),
                                              outputs=self.test_monitors_dict.values(),
                                              givens=test_givens,
                                              name='test_monitor_function')
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                 str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch, str(continue_training))

        log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
        if self.dataset.getSubset(VALID)[0] is not None:
            log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
        if self.dataset.getSubset(TEST)[0] is not None:
            log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

        self.STOP = False
        self.epoch_counter = 0
        if not continue_training:
            # reset any decay params
            for decay_param in self.get_decay_params():
                decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def __init__(self, model, dataset, iterator_class=SequentialIterator, config=None, defaults=_defaults,
             rng=None, n_epoch=None, batch_size=None, minimum_batch_size=None, save_frequency=None,
             early_stop_threshold=None, early_stop_length=None, learning_rate=None, lr_decay=None,
             lr_factor=None, momentum=None, momentum_decay=None, momentum_factor=None,
             nesterov_momentum=None, flag_para_load=None):
    # superclass init
    super(SGD, self).__init__(config=config, defaults=defaults)
    # config and defaults are now combined in self.args! yay!

    self.model = model
    self.dataset = dataset
    self.iterator = iterator_class

    # Training epochs - how many times to iterate over the whole dataset
    self.n_epoch = n_epoch or self.args.get('n_epoch')

    # Dataset iteration batch sizes - number of examples in each calculation
    self.batch_size = batch_size or self.args.get('batch_size')
    self.minimum_batch_size = minimum_batch_size or self.args.get('minimum_batch_size')

    # Number of epochs between saving model parameters
    self.save_frequency = save_frequency or self.args.get('save_frequency')

    # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
    self.early_stop_threshold = early_stop_threshold or self.args.get('early_stop_threshold')
    self.early_stop_length = early_stop_length or self.args.get('early_stop_length')

    # Learning rate - how drastic of a step do the parameters change
    lr = learning_rate or self.args.get('learning_rate')
    self.learning_rate = sharedX(lr, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    if lr_decay or self.args.get('lr_decay'):
        self.learning_rate_decay = get_decay_function(lr_decay or self.args.get('lr_decay'),
                                                      self.learning_rate,
                                                      self.learning_rate.get_value(),
                                                      lr_factor or self.args.get('lr_factor'))

    # Momentum - smoothing over the parameter changes (see Hinton)
    self.momentum = sharedX(momentum or self.args.get('momentum'), 'momentum')
    if self.args.get('momentum_decay'):
        self.momentum_decay = get_decay_function(momentum_decay or self.args.get('momentum_decay'),
                                                 self.momentum,
                                                 self.momentum.get_value(),
                                                 momentum_factor or self.args.get('momentum_factor'))
    self.nesterov_momentum = nesterov_momentum or self.args.get('nesterov_momentum')

    # RNG for working on random iterator
    if rng is None:
        random.seed(123)
        self.rng = random
    else:
        self.rng = rng

    self.params = self.model.get_params()

    # Now create the training cost function for the model to use while training - update parameters
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    # gradient!
    gradient = grad(self.model.get_train_cost(), self.params)
    grads = OrderedDict(zip(self.params, gradient))

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(grads)

    # Combine the updates from the model also if applicable
    train_updates = model.get_updates()
    if train_updates:
        train_updates.update(gradient_updates)
    else:
        train_updates = gradient_updates

    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', str(type(self.model)))
    t = time.time()
    self.f_learn = function(inputs=model.get_inputs(),
                            updates=train_updates,
                            outputs=self.model.get_train_cost(),
                            name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # Determine if this function is unsupervised or not by looking at the number of inputs to the
    # f_learn function. If there is only one input, it is unsupervised, otherwise, it is supervised.
    # This workaround was provided by Pascal Lamblin on the theano-users google group
    num_inputs = len([i for i in self.f_learn.maker.inputs if not i.shared])
    if num_inputs == 1:
        log.debug("Model is unsupervised: 1 input to f_learn.")
        self.unsupervised = True
    elif num_inputs == 2:
        log.debug("Model is supervised: 2 inputs to f_learn.")
        self.unsupervised = False
    else:
        log.error("Number of inputs to f_learn on model %s was %s. "
                  "Needs to be 1 for unsupervised or 2 for supervised.",
                  str(type(self.model)), str(num_inputs))
        raise AssertionError("Number of inputs to f_learn on model %s was %s. "
                             "Needs to be 1 for unsupervised or 2 for supervised."
                             % (str(type(self.model)), str(num_inputs)))

    # grab the function(s) to use to monitor different model values during training
    self.monitors = self.model.get_monitors()
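
# ``self.get_updates(grads)`` is where SGD (and subclasses like AdaDelta) define the actual
# parameter update rule. For plain SGD with classical momentum the rule is the familiar
#   v <- momentum * v - learning_rate * grad
#   param <- param + v
# A small numpy illustration of that rule (not the symbolic Theano updates OpenDeep builds):
import numpy as np

def sgd_momentum_step(param, grad, velocity, learning_rate=0.1, momentum=0.9):
    velocity = momentum * velocity - learning_rate * grad
    return param + velocity, velocity

# one step on f(w) = 0.5 * w^2 (whose gradient is w): the parameter moves toward 0
w, v = np.array([1.0]), np.zeros(1)
w, v = sgd_momentum_step(w, grad=w, velocity=v)
assert w[0] == 0.9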
def __init__(self, config=None, defaults=_defaults, inputs_hook=None, hiddens_hook=None, params_hook=None,
             input_size=None, hidden_size=None, corruption_level=None, hidden_activation=None,
             visible_activation=None, cost_function=None):
    # Now, initialize with Model class to combine config and defaults!
    # Here, defaults is defined via a dictionary. However, you could also
    # pass a filename to a JSON or YAML file with the same format.
    super(DenoisingAutoencoder, self).__init__(config=config, defaults=defaults)
    # Any parameter from the 'config' will overwrite the 'defaults' dictionary.
    # These parameters are now accessible from the 'self.args' variable!

    # When accessing model parameters, it is best practice to try to find the parameters
    # explicitly passed first, and then go to the 'self.args' configuration.

    # Define model hyperparameters
    # deal with the inputs_hook and hiddens_hook for the size parameters!
    # if the hook exists, grab the size from the first element of the tuple.
    if inputs_hook:
        input_size = inputs_hook[0]
    # otherwise, grab the size from the configurations.
    else:
        input_size = input_size or self.args.get('input_size')

    if hiddens_hook:
        hidden_size = hiddens_hook[0]
    else:
        hidden_size = hidden_size or self.args.get('hidden_size')

    corruption_level = corruption_level or self.args.get('corruption_level')

    # use the helper methods to grab appropriate activation functions from names!
    hidden_act_name = hidden_activation or self.args.get('hidden_activation')
    hidden_activation = get_activation_function(hidden_act_name)
    visible_act_name = visible_activation or self.args.get('visible_activation')
    visible_activation = get_activation_function(visible_act_name)

    # do the same for the cost function
    cost_func_name = cost_function or self.args.get('cost_function')
    cost_function = get_cost_function(cost_func_name)

    # Now, define the symbolic input to the model (Theano)
    # We use a matrix rather than a vector so that minibatch processing can be done in parallel.
    # Make sure to deal with 'inputs_hook' if it exists!
    if inputs_hook:
        # grab the new input variable from the inputs_hook tuple
        x = inputs_hook[1]
    else:
        x = T.fmatrix("X")
    self.inputs = [x]

    # Build the model's parameters - a weight matrix and two bias vectors
    # Make sure to deal with 'params_hook' if it exists!
    if params_hook:
        # check to see if it contains the three necessary variables
        assert len(params_hook) == 3, "Not correct number of params to DAE, needs 3!"
        W, b0, b1 = params_hook
    else:
        W = get_weights_uniform(shape=(input_size, hidden_size), name="W")
        b0 = get_bias(shape=input_size, name="b0")
        b1 = get_bias(shape=hidden_size, name="b1")
    self.params = [W, b0, b1]

    # Perform the computation for a denoising autoencoder!
    # first, add noise to (corrupt) the input
    corrupted_input = salt_and_pepper(input=x, corruption_level=corruption_level)
    # next, compute the hidden layer given the inputs (the encoding function)
    # We don't need to worry about hiddens_hook during training, because we can't
    # compute a cost without having the input!
    # hiddens_hook is more for the predict function and linking methods below.
    hiddens = hidden_activation(T.dot(corrupted_input, W) + b1)
    # finally, create the reconstruction from the hidden layer (we tie the weights with W.T)
    reconstruction = visible_activation(T.dot(hiddens, W.T) + b0)
    # the training cost is reconstruction error
    self.train_cost = cost_function(output=reconstruction, target=x)

    # Compile everything into a Theano function for prediction!
    # When using real-world data in predictions, we wouldn't corrupt the input first.
    # Therefore, create another version of the hiddens and reconstruction without adding the noise.
    # Here is where we would handle hiddens_hook because this is a generative model!
    # For the predict function, it would take in the hiddens instead of the input variable x.
    if hiddens_hook:
        self.hiddens = hiddens_hook[1]
    else:
        self.hiddens = hidden_activation(T.dot(x, W) + b1)
    # make the reconstruction (generated) from the hiddens
    self.recon_predict = visible_activation(T.dot(self.hiddens, W.T) + b0)
    # now compile the predict function accordingly - if it used x or hiddens as the input.
    if hiddens_hook:
        self.f_predict = function(inputs=[self.hiddens], outputs=self.recon_predict)
    else:
        self.f_predict = function(inputs=[x], outputs=self.recon_predict)
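
# ``salt_and_pepper`` is imported from elsewhere in OpenDeep and not shown here. The usual
# definition corrupts a fraction of the inputs by setting them to 0 or 1 with equal probability;
# a plain numpy sketch of that idea (an assumption about the helper, not its actual code):
import numpy as np

def salt_and_pepper_numpy(x, corruption_level=0.4, rng=np.random):
    """Replace roughly ``corruption_level`` of the entries of ``x`` with random 0/1 values."""
    corrupt_mask = rng.binomial(1, corruption_level, size=x.shape)   # which entries to corrupt
    salt = rng.binomial(1, 0.5, size=x.shape)                        # 0 (pepper) or 1 (salt)
    return x * (1 - corrupt_mask) + salt * corrupt_mask

corrupted = salt_and_pepper_numpy(np.random.uniform(size=(10, 784)), corruption_level=0.4)
assert corrupted.shape == (10, 784)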
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, input_size=28*28,
             hidden_size=1000, noise_level=0.4, hidden_activation='tanh', visible_activation='sigmoid',
             cost_function='binary_crossentropy'):
    # initialize the Model superclass
    super(DenoisingAutoencoder, self).__init__(
        **{arg: val for (arg, val) in locals().iteritems() if arg != 'self'}
    )

    # Define model hyperparameters
    # deal with the inputs_hook and hiddens_hook for the size parameters!
    # if the hook exists, grab the size from the first element of the tuple.
    if self.inputs_hook is not None:
        assert len(self.inputs_hook) == 2, "Was expecting inputs_hook to be a tuple."
        self.input_size = inputs_hook[0]

    if self.hiddens_hook is not None:
        assert len(self.hiddens_hook) == 2, "Was expecting hiddens_hook to be a tuple."
        hidden_size = hiddens_hook[0]

    # use the helper methods to grab appropriate activation functions from names!
    hidden_activation = get_activation_function(hidden_activation)
    visible_activation = get_activation_function(visible_activation)

    # do the same for the cost function
    cost_function = get_cost_function(cost_function)

    # Now, define the symbolic input to the model (Theano)
    # We use a matrix rather than a vector so that minibatch processing can be done in parallel.
    # Make sure to deal with 'inputs_hook' if it exists!
    if self.inputs_hook is not None:
        # grab the new input variable from the inputs_hook tuple
        x = self.inputs_hook[1]
    else:
        x = T.matrix("X")
    self.inputs = [x]

    # Build the model's parameters - a weight matrix and two bias vectors
    # Make sure to deal with 'params_hook' if it exists!
    if self.params_hook:
        # check to see if it contains the three necessary variables
        assert len(self.params_hook) == 3, "Not correct number of params to DAE, needs 3!"
        W, b0, b1 = self.params_hook
    else:
        W = get_weights_uniform(shape=(self.input_size, hidden_size), name="W")
        b0 = get_bias(shape=self.input_size, name="b0")
        b1 = get_bias(shape=hidden_size, name="b1")
    self.params = [W, b0, b1]

    # Perform the computation for a denoising autoencoder!
    # first, add noise to (corrupt) the input
    corrupted_input = salt_and_pepper(input=x, noise_level=noise_level)
    # next, run the hidden layer given the inputs (the encoding function)
    # We don't need to worry about hiddens_hook during training, because we can't
    # run a cost without having the input!
    # hiddens_hook is more for the run function and linking methods below.
    hiddens = hidden_activation(T.dot(corrupted_input, W) + b1)
    # finally, create the reconstruction from the hidden layer (we tie the weights with W.T)
    reconstruction = visible_activation(T.dot(hiddens, W.T) + b0)
    # the training cost is reconstruction error
    self.train_cost = cost_function(output=reconstruction, target=x)

    # Compile everything into a Theano function for prediction!
    # When using real-world data in predictions, we wouldn't corrupt the input first.
    # Therefore, create another version of the hiddens and reconstruction without adding the noise.
    # Here is where we would handle hiddens_hook because this is a generative model!
    # For the run function, it would take in the hiddens instead of the input variable x.
    if self.hiddens_hook is not None:
        self.hiddens = self.hiddens_hook[1]
    else:
        self.hiddens = hidden_activation(T.dot(x, W) + b1)
    # make the reconstruction (generated) from the hiddens
    self.recon_predict = visible_activation(T.dot(self.hiddens, W.T) + b0)

    # now compile the run function accordingly - if it used x or hiddens as the input.
    if self.hiddens_hook is not None:
        self.f_run = function(inputs=[self.hiddens], outputs=self.recon_predict)
    else:
        self.f_run = function(inputs=[x], outputs=self.recon_predict)
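
# A compact numpy restatement of the graph built above - encode with W, decode with the tied
# transpose W.T, and score the reconstruction with binary cross-entropy. Purely illustrative;
# the real model keeps these as symbolic Theano expressions.
import numpy as np

def dae_forward(x, W, b0, b1):
    hiddens = np.tanh(np.dot(x, W) + b1)                                  # hidden_activation='tanh'
    reconstruction = 1. / (1. + np.exp(-(np.dot(hiddens, W.T) + b0)))     # visible sigmoid
    return reconstruction

def binary_crossentropy(output, target, eps=1e-8):
    output = np.clip(output, eps, 1 - eps)
    return -np.mean(target * np.log(output) + (1 - target) * np.log(1 - output))

x = np.random.binomial(1, 0.5, size=(4, 784)).astype('float64')
W = np.random.uniform(-0.05, 0.05, size=(784, 1000))
recon = dae_forward(x, W, b0=np.zeros(784), b1=np.zeros(1000))
cost = binary_crossentropy(recon, target=x)
assert recon.shape == x.shape and np.isfinite(cost)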
def _build_computation_graph(self):
    ###################### BUILD NETWORK ##########################
    # whether or not to mirror the input images before feeding them into the network
    if self.flag_datalayer:
        layer_1_input = mirror_images(input=self.x,
                                      image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                                      cropsize=227,
                                      rand=self.rand,
                                      flag_rand=self.rand_crop)
    else:
        layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

    # Start with 5 convolutional pooling layers
    log.debug("convpool layer 1...")
    convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                    filter_shape=(96, 3, 11, 11),
                                    convstride=4, padsize=0, group=1,
                                    poolsize=3, poolstride=2,
                                    bias_init=0.0, local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer1.get_params()

    log.debug("convpool layer 2...")
    convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27), convpool_layer1.get_outputs()),
                                    filter_shape=(256, 96, 5, 5),
                                    convstride=1, padsize=2, group=2,
                                    poolsize=3, poolstride=2,
                                    bias_init=0.1, local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer2.get_params()

    log.debug("convpool layer 3...")
    convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
                                    filter_shape=(384, 256, 3, 3),
                                    convstride=1, padsize=1, group=1,
                                    poolsize=1, poolstride=0,
                                    bias_init=0.0, local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer3.get_params()

    log.debug("convpool layer 4...")
    convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
                                    filter_shape=(384, 384, 3, 3),
                                    convstride=1, padsize=1, group=2,
                                    poolsize=1, poolstride=0,
                                    bias_init=0.1, local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer4.get_params()

    log.debug("convpool layer 5...")
    convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
                                    filter_shape=(256, 384, 3, 3),
                                    convstride=1, padsize=1, group=2,
                                    poolsize=3, poolstride=2,
                                    bias_init=0.0, local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer5.get_params()

    # Now onto the fully-connected layers!
    fc_config = {
        'activation': 'rectifier',  # type of activation function to use for output
        'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
        'weights_mean': 0.0,  # mean for gaussian weights init
        'weights_std': 0.005,  # standard deviation for gaussian weights init
        'bias_init': 0.0  # how to initialize the bias parameter
    }

    log.debug("fully connected layer 1 (model layer 6)...")
    # we want to have dropout applied to the training version, but not the test version.
    fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
    fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input),
                           output_size=4096,
                           noise='dropout', noise_level=0.5,
                           **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer6.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer6.get_noise_switch()

    log.debug("fully connected layer 2 (model layer 7)...")
    fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()),
                           output_size=4096,
                           noise='dropout', noise_level=0.5,
                           **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer7.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer7.get_noise_switch()

    # last layer is a softmax prediction output layer
    softmax_config = {
        'weights_init': 'gaussian',
        'weights_mean': 0.0,
        'weights_std': 0.005,
        'bias_init': 0.0
    }
    log.debug("softmax classification layer (model layer 8)...")
    softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                  output_size=1000,
                                  **softmax_config)
    # Add this layer's parameters!
    self.params += softmax_layer8.get_params()

    # finally the softmax output from the whole thing!
    self.output = softmax_layer8.get_outputs()
    self.targets = softmax_layer8.get_targets()

    #####################
    # Cost and monitors #
    #####################
    self.train_cost = softmax_layer8.negative_log_likelihood()
    cost = softmax_layer8.negative_log_likelihood()
    errors = softmax_layer8.errors()
    train_errors = softmax_layer8.errors()

    self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

    #########################
    # Compile the functions #
    #########################
    log.debug("Compiling functions!")
    t = time.time()
    log.debug("f_run...")
    # use the actual argmax from the classification
    self.f_run = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
    log.debug("compilation took %s", make_time_units_string(time.time() - t))
def build_computation_graph(self):
    #################
    # Build the GSN #
    #################
    log.debug("Building GSN graphs...")

    # GSN for training - with noise specified in initialization
    # if there is no hiddens_hook, build the GSN normally using the input X
    if not self.hiddens_flag:
        p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)
    # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
    # generative from the hiddens
    else:
        p_X_chain, _ = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

    # GSN for prediction - same as above but no noise
    # deal with hiddens_hook exactly as above.
    if not self.hiddens_flag:
        p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
    else:
        p_X_chain_recon, recon_hiddens = self.build_gsn(hiddens=self.hiddens, add_noise=False, reverse=True)

    ####################
    # Costs and output #
    ####################
    log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
    # use the noisy ones for training cost
    costs = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain]
    self.show_cost = costs[-1]  # for a monitor to show progress
    # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH
    cost = numpy.sum(costs)

    # use the non-noisy graph for prediction
    gsn_costs_recon = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain_recon]
    # another monitor, same as self.show_cost but on the non-noisy graph.
    self.monitor = gsn_costs_recon[-1]
    # this should be considered the main output of the computation, the sample after the
    # last walkback from the non-noisy graph.
    output = p_X_chain_recon[-1]
    # these should be considered the model's hidden representation - the hidden representation after
    # the last walkback from the non-noisy graph.
    hiddens = recon_hiddens

    train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
    train_mse = T.mean(train_mse)

    mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
    mse = T.mean(mse)

    monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                            ('recon_cost', self.monitor),
                            ('mse', mse),
                            ('train_mse', train_mse)])

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.matrix("X_sampling")
    self.network_state_input = [X_sample] + [T.matrix("H_sampling_" + str(i + 1)) for i in range(self.layers)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    log.debug("Performing one walkback in network state sampling.")
    self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

    #####################################################
    #         Create the run and monitor functions      #
    #####################################################
    log.debug("Compiling functions...")
    t = time.time()

    # doesn't make sense to have this if there is a hiddens_hook
    if not self.hiddens_flag:
        # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the
        # non-noisy computation graph
        log.debug("f_run...")
        self.f_run = function(inputs=[self.X],
                              outputs=output,
                              name='gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before
        # feeding the input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs=[self.X],
                                outputs=self.input_noise(self.X),
                                name='gsn_f_noise')

    # the sampling function, for creating lots of samples from the computational graph.
    # (mostly for log-likelihood or visualization)
    log.debug("f_sample...")
    if self.layers == 1:
        self.f_sample = function(inputs=[X_sample],
                                 outputs=visible_pX_chain[-1],
                                 name='gsn_f_sample_single_layer')
    else:
        # WHY IS THERE A WARNING????
        # because the first odd layers are not used -> directly computed FROM THE EVEN layers
        # unused input = warn
        self.f_sample = function(inputs=self.network_state_input,
                                 outputs=self.network_state_output + visible_pX_chain,
                                 name='gsn_f_sample')

    log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

    return cost, monitors, output, hiddens
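
# The GSN training cost above is the sum of the reconstruction cost at every walkback step, while
# the monitors only look at the final step (plus a mean-squared-error check). A tiny numpy
# illustration of that aggregation over a fake chain of reconstructions (not the symbolic graph):
import numpy as np

def walkback_costs(p_X_chain, X, eps=1e-8):
    """Per-step binary cross-entropy of each reconstruction against the clean input X."""
    return [-np.mean(X * np.log(np.clip(rX, eps, 1)) + (1 - X) * np.log(np.clip(1 - rX, eps, 1)))
            for rX in p_X_chain]

X = np.random.binomial(1, 0.5, size=(8, 784)).astype('float64')
chain = [np.random.uniform(0.01, 0.99, size=X.shape) for _ in range(4)]   # 4 walkback steps
costs = walkback_costs(chain, X)
train_cost = np.sum(costs)           # summed over all walkback steps (the training cost)
show_cost = costs[-1]                # last step only (the 'noisy_recon_cost' monitor)
mse = np.mean((chain[-1] - X) ** 2)  # the 'mse' monitor
assert len(costs) == 4 and np.isfinite(train_cost) and np.isfinite(mse) and show_cost <= train_cost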
def train(self, continue_training=False):
    """
    This method performs the training!!!

    :param continue_training: whether to continue training from a previous point.
    :type continue_training: bool
    """
    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    ###############################################
    # theano index variable to use on the dataset #
    ###############################################
    # index to a [mini]batch - both start and end
    data_idx = T.iscalar('data_index')
    data_end_idx = T.iscalar('data_end_index')
    batch_slice = slice(data_idx, data_end_idx)

    # compute number of minibatches for training, validation and testing
    # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
    # could be a list of shared variables (like multiple sequences from files)
    train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
    valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
    test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

    # train_batches is going to be lists of tuples that contain the start and end indices for train data
    train_data_lens = [shape[0] for shape in train_data_shapes]
    self.train_batches = self.get_batch_indices(train_data_lens)

    if valid_data_shapes is not None:
        valid_data_lens = [shape[0] for shape in valid_data_shapes]
        self.valid_batches = self.get_batch_indices(valid_data_lens)
    else:
        self.valid_batches = None

    if test_data_shapes is not None:
        test_data_lens = [shape[0] for shape in test_data_shapes]
        self.test_batches = self.get_batch_indices(test_data_lens)
    else:
        self.test_batches = None

    # translate the data_idx into the givens for the model
    model_inputs = raise_to_list(self.model.get_inputs())
    model_targets = raise_to_list(self.model.get_targets())

    train_data, train_labels = self.dataset.getSubset(TRAIN)
    train_givens = OrderedDict(zip(model_inputs, [train_data[batch_slice]]))
    if model_targets is not None and len(model_targets) > 0:
        train_givens.update(OrderedDict(zip(model_targets, [train_labels[batch_slice]])))

    valid_data, valid_labels = self.dataset.getSubset(VALID)
    valid_givens = OrderedDict(zip(model_inputs, [valid_data[batch_slice]]))
    if model_targets is not None and len(model_targets) > 0:
        valid_givens.update(OrderedDict(zip(model_targets, [valid_labels[batch_slice]])))

    test_data, test_labels = self.dataset.getSubset(TEST)
    test_givens = OrderedDict(zip(model_inputs, [test_data[batch_slice]]))
    if model_targets is not None and len(model_targets) > 0:
        test_givens.update(OrderedDict(zip(model_targets, [test_labels[batch_slice]])))

    # Now time to create the training cost functions for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    self.train_functions = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients, _ = self.model.get_gradient(cost=train_cost)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        train_updates = self.model.get_updates()
        if train_updates:
            train_updates.update(gradient_updates)
        else:
            train_updates = gradient_updates

        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, len(train_costs), str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=[data_idx, data_end_idx],
                           updates=train_updates,
                           outputs=train_cost,
                           givens=train_givens,
                           name='f_learn_%d' % i)
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
        self.train_functions.append(f_learn)

    # grab the expression(s) to use to monitor different model values during training
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    self.monitors = OrderedDict(self.model.get_monitors())
    self.monitor_names = self.monitors.keys()
    if len(self.monitors.keys()) > 0:
        self.train_monitor_function = function(inputs=[data_idx, data_end_idx],
                                               updates=self.model.get_updates(),
                                               outputs=self.monitors.values(),
                                               givens=train_givens,
                                               name="train_monitor_function")
    if len(self.monitors.keys()) > 0:
        self.valid_monitor_function = function(inputs=[data_idx, data_end_idx],
                                               updates=self.model.get_updates(),
                                               outputs=self.monitors.values(),
                                               givens=valid_givens,
                                               name="valid_monitor_function")
    if len(self.monitors.keys()) > 0:
        self.test_monitor_function = function(inputs=[data_idx, data_end_idx],
                                              updates=self.model.get_updates(),
                                              outputs=self.monitors.values(),
                                              givens=test_givens,
                                              name="test_monitor_function")
    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    self.noise_switches = raise_to_list(self.model.get_noise_switch())

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(self.train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                 str(type(self.model)), func_i + 1, len(self.train_functions), self.n_epoch,
                 str(continue_training))

        log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
        if self.dataset.hasSubset(VALID):
            log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
        if self.dataset.hasSubset(TEST):
            log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

        self.STOP = False
        self.epoch_counter = 0
        if not continue_training:
            # reset the learning rate
            if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                self.learning_rate_decay.reset()
            # reset the other model decaying functions
            for decay_param in self.model.get_decay_params():
                decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
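
# ``_perform_one_epoch`` is not shown here, but the attributes set up above (best_cost, best_params,
# patience, early_stop_threshold, early_stop_length) suggest the usual early-stopping rule: keep the
# best parameters seen so far and stop once the cost has failed to improve by a relative threshold
# for a given number of epochs. A hedged sketch of that check (names and exact rule are assumptions):
def early_stop_update(cost, best_cost, patience, early_stop_threshold, early_stop_length):
    """Return (new_best_cost, new_patience, stop_flag) after seeing this epoch's cost."""
    if cost < best_cost * early_stop_threshold:
        return cost, 0, False          # improved enough: remember it, reset patience
    patience += 1
    return best_cost, patience, patience >= early_stop_length

# cost stalls at 1.0 with threshold 0.99 and patience length 3 -> stop on the third stale epoch
best, patience, stop = float('inf'), 0, False
for epoch_cost in [1.0, 1.0, 1.0, 1.0]:
    best, patience, stop = early_stop_update(epoch_cost, best, patience, 0.99, 3)
assert stop and best == 1.0 and patience == 3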