def test_replace_variable_not_in_graph():
    # Test if warning appears when variable is not in graph
    with warnings.catch_warnings(record=True) as w:
        x = tensor.scalar()
        y = x + 1
        z = tensor.scalar()
        cg = ComputationGraph([y])
        cg.replace([(y, 2 * y), (z, 2 * z)])
        assert len(w) == 1
        assert "not a part of" in str(w[-1].message)

def test_replace_variable_is_auxiliary():
    # Test if warning appears when variable is an AUXILIARY variable
    with warnings.catch_warnings(record=True) as w:
        x = tensor.scalar()
        y = x + 1
        add_role(y, AUXILIARY)
        cg = ComputationGraph([y])
        cg.replace([(y, 2 * y)])
        assert len(w) == 1
        assert "auxiliary" in str(w[-1].message)

def get_cost_graph(self, batch=True,
                   prediction=None, prediction_mask=None):
    if batch:
        inputs = self.inputs
        inputs_mask = self.inputs_mask
        groundtruth = self.labels
        groundtruth_mask = self.labels_mask
    else:
        inputs, inputs_mask = self.bottom.single_to_batch_inputs(
            self.single_inputs)
        groundtruth = self.single_labels[:, None]
        groundtruth_mask = None

    if not prediction:
        prediction = groundtruth
    if not prediction_mask:
        prediction_mask = groundtruth_mask

    cost = self.cost(inputs_mask=inputs_mask,
                     labels=prediction, labels_mask=prediction_mask,
                     **inputs)
    cost_cg = ComputationGraph(cost)
    if self.criterion['name'].startswith("mse"):
        placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
        cost_cg = cost_cg.replace({placeholder: groundtruth})
    return cost_cg

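# --- Added example (not from the original sources) --------------------------
# A minimal, self-contained sketch of the placeholder swap that
# get_cost_graph performs for MSE criteria: a variable named 'groundtruth'
# inside the cost graph is located with VariableFilter and replaced by the
# real target input. All variable names below are illustrative.
from theano import tensor
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter

prediction = tensor.matrix('prediction')
placeholder = tensor.matrix('groundtruth')   # stand-in used to build the cost
groundtruth = tensor.matrix('labels')        # the real targets

cost = ((prediction - placeholder) ** 2).mean()
cost_cg = ComputationGraph(cost)

# Locate the placeholder by its Theano name and rewire the graph onto the
# real input; replace() returns a new ComputationGraph.
placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg.variables)
cost_cg = cost_cg.replace({placeholder: groundtruth})
assert groundtruth in cost_cg.inputs
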
def test_computation_graph():
    x = tensor.matrix('x')
    y = tensor.matrix('y')
    z = x + y
    a = z.copy()
    a.name = 'a'
    b = z.copy()
    b.name = 'b'
    r = tensor.matrix('r')

    cg = ComputationGraph([a, b])
    assert set(cg.inputs) == {x, y}
    assert set(cg.outputs) == {a, b}
    assert set(cg.variables) == {x, y, z, a, b}
    assert cg.variables[2] is z
    assert ComputationGraph(a).inputs == cg.inputs

    cg2 = cg.replace({z: r})
    assert set(cg2.inputs) == {r}
    assert set([v.name for v in cg2.outputs]) == {'a', 'b'}

    W = theano.shared(numpy.zeros((3, 3), dtype=floatX))
    cg3 = ComputationGraph([z + W])
    assert set(cg3.shared_variables) == {W}

    cg4 = ComputationGraph([W])
    assert cg4.variables == [W]

    w1 = W ** 2
    cg5 = ComputationGraph([w1])
    assert W in cg5.variables
    assert w1 in cg5.variables

def test_replace():
    # Test if replace works with outputs
    x = tensor.scalar()
    y = x + 1
    cg = ComputationGraph([y])
    doubled_cg = cg.replace([(y, 2 * y)])
    out_val = doubled_cg.outputs[0].eval({x: 2})
    assert out_val == 6.0

def test_computation_graph():
    x = tensor.matrix('x')
    y = tensor.matrix('y')
    z = x + y
    z.name = 'z'
    a = z.copy()
    a.name = 'a'
    b = z.copy()
    b.name = 'b'
    r = tensor.matrix('r')

    cg = ComputationGraph([a, b])
    assert set(cg.inputs) == {x, y}
    assert set(cg.outputs) == {a, b}
    assert set(cg.variables) == {x, y, z, a, b}
    assert cg.variables[2] is z
    assert ComputationGraph(a).inputs == cg.inputs

    cg2 = cg.replace({z: r})
    assert set(cg2.inputs) == {r}
    assert set([v.name for v in cg2.outputs]) == {'a', 'b'}

    W = theano.shared(numpy.zeros((3, 3), dtype=theano.config.floatX))
    cg3 = ComputationGraph([z + W])
    assert set(cg3.shared_variables) == {W}

    cg4 = ComputationGraph([W])
    assert cg4.variables == [W]

    w1 = W ** 2
    cg5 = ComputationGraph([w1])
    assert W in cg5.variables
    assert w1 in cg5.variables

    # Test scan
    s, _ = theano.scan(lambda inp, accum: accum + inp,
                       sequences=x,
                       outputs_info=tensor.zeros_like(x[0]))
    scan = s.owner.inputs[0].owner.op
    cg6 = ComputationGraph(s)
    assert cg6.scans == [scan]
    assert all(v in cg6.scan_variables for v in scan.inputs + scan.outputs)

def _get_bn_params(self, output_vars):
    # Pick out the nodes with batch normalization vars
    cg = ComputationGraph(output_vars)
    var_filter = VariableFilter(roles=[BNPARAM])
    bn_ps = var_filter(cg.variables)

    if len(bn_ps) == 0:
        logger.warn('No batch normalization parameters found - is'
                    ' batch normalization turned off?')
        self._bn = False
        self._counter = None
        self._counter_max = None
        bn_share = []
        output_vars_replaced = output_vars
    else:
        self._bn = True
        assert len(set([p.name for p in bn_ps])) == len(bn_ps), \
            'Some batch norm params have the same name'
        logger.info('Batch norm parameters: %s' %
                    ', '.join([p.name for p in bn_ps]))

        # Filter out the shared variables from the model updates
        def filter_share(par):
            lst = [up for up in cg.updates
                   if up.name == 'shared_%s' % par.name]
            assert len(lst) == 1
            return lst[0]
        bn_share = list(map(filter_share, bn_ps))

        # Replace the BN coefficients in the test data model - Replace the
        # theano variables in the test graph with the shareds
        output_vars_replaced = cg.replace(list(zip(bn_ps, bn_share))).outputs

        # Pick out the counter
        self._counter = self._param_from_updates(cg.updates, 'counter')
        self._counter_max = self._param_from_updates(cg.updates,
                                                     'counter_max')

    return bn_ps, bn_share, output_vars_replaced

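# --- Added example (not from the original sources) --------------------------
# A hedged sketch of the replacement step in _get_bn_params: each batch-norm
# statistic in the training graph is swapped for the shared variable that
# accumulates its population value, yielding the test-time graph.
# 'bn_mean' and 'shared_bn_mean' are hypothetical stand-ins.
import numpy
import theano
from theano import tensor
from blocks.graph import ComputationGraph

x = tensor.vector('x')
bn_mean = tensor.scalar('bn_mean')                     # batch-dependent value
shared_bn_mean = theano.shared(numpy.float32(0.5),
                               name='shared_bn_mean')  # accumulated value

cg = ComputationGraph([(x - bn_mean) / 2.0])
test_cg = cg.replace([(bn_mean, shared_bn_mean)])
# The test graph now reads the accumulated statistic; the batch-dependent
# placeholder is no longer an input of the replaced graph.
assert bn_mean not in test_cg.inputs
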
def test_computation_graph():
    x = tensor.matrix('x')
    y = tensor.matrix('y')
    z = x + y
    a = z.copy()
    a.name = 'a'
    b = z.copy()
    b.name = 'b'
    r = tensor.matrix('r')

    cg = ComputationGraph([a, b])
    assert set(cg.inputs) == {x, y}
    assert set(cg.outputs) == {a, b}
    assert set(cg.variables) == {x, y, z, a, b}
    assert ComputationGraph(a).inputs == cg.inputs

    cg2 = cg.replace({z: r})
    assert set(cg2.inputs) == {r}
    assert set([v.name for v in cg2.outputs]) == {'a', 'b'}

def get_cost_graph(self, batch=True,
                   prediction=None, prediction_mask=None):
    if batch:
        recordings = self.recordings
        recordings_mask = self.recordings_mask
        groundtruth = self.labels
        groundtruth_mask = self.labels_mask
    else:
        recordings = self.single_recording[:, None, :]
        recordings_mask = tensor.ones_like(recordings[:, :, 0])
        groundtruth = self.single_transcription[:, None]
        groundtruth_mask = None

    if not prediction:
        prediction = groundtruth
    if not prediction_mask:
        prediction_mask = groundtruth_mask

    cost = self.cost(recordings, recordings_mask,
                     prediction, prediction_mask)
    cost_cg = ComputationGraph(cost)
    if self.criterion['name'].startswith("mse"):
        placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
        cost_cg = cost_cg.replace({placeholder: groundtruth})
    return cost_cg

def _get_bn_params(self, output_vars):
    # Pick out the nodes with batch normalization vars
    cg = ComputationGraph(output_vars)
    var_filter = VariableFilter(roles=[BNPARAM])
    bn_ps = var_filter(cg.variables)

    if len(bn_ps) == 0:
        logger.warn('No batch normalization parameters found - is'
                    ' batch normalization turned off?')
        self._bn = False
        self._counter = None
        self._counter_max = None
        bn_share = []
        output_vars_replaced = output_vars
    else:
        self._bn = True
        assert len(set([p.name for p in bn_ps])) == len(bn_ps), \
            'Some batch norm params have the same name'
        logger.info('Batch norm parameters: %s' %
                    ', '.join([p.name for p in bn_ps]))

        # Filter out the shared variables from the model updates
        def filter_share(par):
            lst = [up for up in cg.updates
                   if up.name == 'shared_%s' % par.name]
            assert len(lst) == 1
            return lst[0]
        # Note: this variant relies on Python 2 semantics, where map() and
        # zip() return lists rather than iterators.
        bn_share = map(filter_share, bn_ps)

        # Replace the BN coefficients in the test data model - Replace the
        # theano variables in the test graph with the shareds
        output_vars_replaced = cg.replace(zip(bn_ps, bn_share)).outputs

        # Pick out the counter
        self._counter = self._param_from_updates(cg.updates, 'counter')
        self._counter_max = self._param_from_updates(cg.updates,
                                                     'counter_max')

    return bn_ps, bn_share, output_vars_replaced

def apply_adaptive_noise(computation_graph,
                         cost,
                         variables,
                         num_examples,
                         parameters=None,
                         init_sigma=1e-6,
                         model_cost_coefficient=1.0,
                         seed=None,
                         gradients=None,
                         ):
    """Add adaptive noise to parameters of a model.

    Each of the given variables will be replaced by a normal
    distribution with learned mean and standard deviation.

    A model cost is computed based on the precision of the distributions
    associated with each variable. It is added to the given cost used to
    train the model.

    See: A. Graves "Practical Variational Inference for Neural Networks",
         NIPS 2011

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    cost : :class:`~tensor.TensorVariable`
        The cost without weight noise. It should be a member of the
        computation_graph.
    variables : :class:`~tensor.TensorVariable`
        Variables to add noise to.
    num_examples : int
        Number of training examples. The cost of the model is divided by
        the number of training examples; please see A. Graves "Practical
        Variational Inference for Neural Networks" for justification.
    parameters : list of :class:`~tensor.TensorVariable`
        Parameters of the model. If gradients are given, the list will
        not be used. Otherwise, it will be used to compute the gradients.
    init_sigma : float
        Initial standard deviation of noise variables.
    model_cost_coefficient : float
        The weight of the model cost.
    seed : int, optional
        The seed with which
        :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized,
        is set to 1 by default.
    gradients : dict, optional
        Adaptive weight noise introduces new parameters for which new cost
        and gradients must be computed. Unless the gradients parameter is
        given, theano.grad will be used to get the gradients.

    Returns
    -------
    cost : :class:`~tensor.TensorVariable`
        The new cost.
    computation_graph : instance of :class:`ComputationGraph`
        New graph with added noise.
    gradients : dict
        A dictionary of gradients for all parameters: the original ones
        and the adaptive noise ones.
    noise_brick : :class:`~lvsr.graph.NoiseBrick`
        The brick that holds all noise parameters and whose .apply method
        can be used to find variables added by adaptive noise.

    """
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)

    try:
        cost_index = computation_graph.outputs.index(cost)
    except ValueError:
        raise ValueError("cost is not part of the computation_graph")

    if gradients is None:
        if parameters is None:
            raise ValueError("Either gradients or parameters must be given")
        logger.info("Taking the cost gradient")
        gradients = dict(equizip(parameters,
                                 tensor.grad(cost, parameters)))
    else:
        if parameters is not None:
            logger.warn("Both gradients and parameters given, will ignore "
                        "parameters")
        parameters = gradients.keys()

    gradients = OrderedDict(gradients)

    log_sigma_scale = 2048.0

    P_noisy = variables  # We will add noise to these
    Beta = []  # will hold means, log_stdev and stdevs
    P_with_noise = []  # will hold params with added noise

    # These don't change
    P_clean = list(set(parameters).difference(P_noisy))

    noise_brick = NoiseBrick()

    for p in P_noisy:
        p_u = p
        p_val = p.get_value(borrow=True)
        p_ls2 = theano.shared((numpy.zeros_like(p_val) +
                               numpy.log(init_sigma) * 2. / log_sigma_scale
                               ).astype(dtype=numpy.float32))
        p_ls2.name = __get_name(p_u)
        noise_brick.parameters.append(p_ls2)
        p_s2 = tensor.exp(p_ls2 * log_sigma_scale)
        Beta.append((p_u, p_ls2, p_s2))

        p_noisy = p_u + rng.normal(size=p_val.shape) * tensor.sqrt(p_s2)
        p_noisy = tensor.patternbroadcast(p_noisy, p.type.broadcastable)
        P_with_noise.append(p_noisy)

    # compute the prior mean and variation
    temp_sum = 0.0
    temp_param_count = 0.0
    for p_u, unused_p_ls2, unused_p_s2 in Beta:
        temp_sum = temp_sum + p_u.sum()
        temp_param_count = temp_param_count + p_u.shape.prod()

    prior_u = tensor.cast(temp_sum / temp_param_count, 'float32')

    temp_sum = 0.0
    for p_u, unused_ls2, p_s2 in Beta:
        temp_sum = temp_sum + (p_s2).sum() + (((p_u - prior_u)**2).sum())

    prior_s2 = tensor.cast(temp_sum / temp_param_count, 'float32')

    # convert everything to use the noisy parameters
    full_computation_graph = ComputationGraph(computation_graph.outputs +
                                              gradients.values())
    full_computation_graph = full_computation_graph.replace(
        dict(zip(P_noisy, P_with_noise)))

    LC = 0.0  # model cost
    for p_u, p_ls2, p_s2 in Beta:
        LC = (LC +
              0.5 * ((tensor.log(prior_s2) -
                      p_ls2 * log_sigma_scale).sum()) +
              1.0 / (2.0 * prior_s2) *
              (((p_u - prior_u)**2) + p_s2 - prior_s2).sum()
              )

    LC = LC / num_examples * model_cost_coefficient

    train_cost = noise_brick.apply(
        full_computation_graph.outputs[cost_index].copy(), LC,
        prior_u, prior_s2)

    gradients = OrderedDict(
        zip(gradients.keys(),
            full_computation_graph.outputs[-len(gradients):]))

    #
    # Delete the gradients from the computational graph
    #
    del full_computation_graph.outputs[-len(gradients):]

    new_grads = {p: gradients.pop(p) for p in P_clean}

    #
    # Warning!!!
    # This only works for batch size 1 (we want the sum of squares
    # to equal the square of the sum!)
    #
    diag_hessian_estimate = {p: g**2 for p, g in gradients.iteritems()}

    for p_u, p_ls2, p_s2 in Beta:
        p_grad = gradients[p_u]
        p_u_grad = (model_cost_coefficient * (p_u - prior_u) /
                    (num_examples * prior_s2) + p_grad)

        p_ls2_grad = (numpy.float32(model_cost_coefficient *
                                    0.5 / num_examples * log_sigma_scale) *
                      (p_s2 / prior_s2 - 1.0) +
                      (0.5 * log_sigma_scale) * p_s2 *
                      diag_hessian_estimate[p_u]
                      )
        new_grads[p_u] = p_u_grad
        new_grads[p_ls2] = p_ls2_grad

    return train_cost, full_computation_graph, new_grads, noise_brick

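# --- Added example (not from the original sources) --------------------------
# A hedged usage sketch for apply_adaptive_noise on a toy linear model.
# It assumes the function and its NoiseBrick dependency are importable from
# lvsr.graph (the module referenced in the docstring above); the model and
# numbers below are illustrative only.
import numpy
import theano
from theano import tensor
from blocks.graph import ComputationGraph
from lvsr.graph import apply_adaptive_noise  # assumed location

x = tensor.matrix('x')
W = theano.shared(numpy.zeros((3, 1), dtype=numpy.float32), name='W')
cost = tensor.dot(x, W).sum()

cg = ComputationGraph(cost)
train_cost, noisy_cg, grads, noise_brick = apply_adaptive_noise(
    cg, cost, variables=[W], num_examples=1000, parameters=[W])
# 'grads' now contains entries both for W and for the learned log-variance
# parameter that controls the noise added to W.
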
def apply_adaptive_noise(
        computation_graph, cost, variables, num_examples,
        parameters=None, init_sigma=1e-6, model_cost_coefficient=1.0,
        seed=None, gradients=None,
        ):
    """Add adaptive noise to parameters of a model.

    Each of the given variables will be replaced by a normal
    distribution with learned mean and standard deviation.

    A model cost is computed based on the precision of the distributions
    associated with each variable. It is added to the given cost used to
    train the model.

    See: A. Graves "Practical Variational Inference for Neural Networks",
         NIPS 2011

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    cost : :class:`~tensor.TensorVariable`
        The cost without weight noise. It should be a member of the
        computation_graph.
    variables : :class:`~tensor.TensorVariable`
        Variables to add noise to.
    num_examples : int
        Number of training examples. The cost of the model is divided by
        the number of training examples; please see A. Graves "Practical
        Variational Inference for Neural Networks" for justification.
    parameters : list of :class:`~tensor.TensorVariable`
        Parameters of the model. If gradients are given, the list will
        not be used. Otherwise, it will be used to compute the gradients.
    init_sigma : float
        Initial standard deviation of noise variables.
    model_cost_coefficient : float
        The weight of the model cost.
    seed : int, optional
        The seed with which
        :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized,
        is set to 1 by default.
    gradients : dict, optional
        Adaptive weight noise introduces new parameters for which new cost
        and gradients must be computed. Unless the gradients parameter is
        given, theano.grad will be used to get the gradients.

    Returns
    -------
    cost : :class:`~tensor.TensorVariable`
        The new cost.
    computation_graph : instance of :class:`ComputationGraph`
        New graph with added noise.
    gradients : dict
        A dictionary of gradients for all parameters: the original ones
        and the adaptive noise ones.
    noise_brick : :class:`~lvsr.graph.NoiseBrick`
        The brick that holds all noise parameters and whose .apply method
        can be used to find variables added by adaptive noise.

    """
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)

    num_examples = numpy.asarray(num_examples, dtype='int32')

    try:
        cost_index = computation_graph.outputs.index(cost)
    except ValueError:
        raise ValueError("cost is not part of the computation_graph")

    if gradients is None:
        if parameters is None:
            raise ValueError("Either gradients or parameters must be given")
        logger.info("Taking the cost gradient")
        gradients = dict(equizip(parameters,
                                 tensor.grad(cost, parameters)))
    else:
        if parameters is not None:
            logger.warn("Both gradients and parameters given, will ignore "
                        "parameters")
        parameters = gradients.keys()

    gradients = OrderedDict(gradients)

    log_sigma_scale = 2048.0

    P_noisy = variables  # We will add noise to these
    Beta = []  # will hold means, log_stdev and stdevs
    P_with_noise = []  # will hold params with added noise

    # These don't change
    P_clean = list(set(parameters).difference(P_noisy))

    noise_brick = NoiseBrick()

    for p in P_noisy:
        p_u = p
        p_val = p.get_value(borrow=True)
        p_ls2 = theano.shared(
            (numpy.zeros_like(p_val) +
             numpy.log(init_sigma) * 2. / log_sigma_scale).astype(
                 dtype=numpy.float32))
        p_ls2.name = __get_name(p_u)
        noise_brick.parameters.append(p_ls2)
        p_s2 = tensor.exp(p_ls2 * log_sigma_scale)
        Beta.append((p_u, p_ls2, p_s2))

        p_noisy = p_u + rng.normal(size=p_val.shape) * tensor.sqrt(p_s2)
        p_noisy = tensor.patternbroadcast(p_noisy, p.type.broadcastable)
        P_with_noise.append(p_noisy)

    # compute the prior mean and variation
    temp_sum = 0.0
    temp_param_count = 0.0
    for p_u, unused_p_ls2, unused_p_s2 in Beta:
        temp_sum = temp_sum + p_u.sum()
        temp_param_count = temp_param_count + p_u.shape.prod()

    prior_u = tensor.cast(temp_sum / temp_param_count, 'float32')

    temp_sum = 0.0
    for p_u, unused_ls2, p_s2 in Beta:
        temp_sum = temp_sum + (p_s2).sum() + (((p_u - prior_u)**2).sum())

    prior_s2 = tensor.cast(temp_sum / temp_param_count, 'float32')

    # convert everything to use the noisy parameters
    full_computation_graph = ComputationGraph(computation_graph.outputs +
                                              gradients.values())
    full_computation_graph = full_computation_graph.replace(
        dict(zip(P_noisy, P_with_noise)))

    LC = 0.0  # model cost
    for p_u, p_ls2, p_s2 in Beta:
        LC = (LC +
              0.5 * ((tensor.log(prior_s2) -
                      p_ls2 * log_sigma_scale).sum()) +
              1.0 / (2.0 * prior_s2) *
              (((p_u - prior_u)**2) + p_s2 - prior_s2).sum())

    LC = LC / num_examples * model_cost_coefficient

    train_cost = noise_brick.apply(
        full_computation_graph.outputs[cost_index].copy(), LC,
        prior_u, prior_s2)

    gradients = OrderedDict(
        zip(gradients.keys(),
            full_computation_graph.outputs[-len(gradients):]))

    #
    # Delete the gradients from the computational graph
    #
    del full_computation_graph.outputs[-len(gradients):]

    new_grads = {p: gradients.pop(p) for p in P_clean}

    #
    # Warning!!!
    # This only works for batch size 1 (we want the sum of squares
    # to equal the square of the sum!)
    #
    diag_hessian_estimate = {p: g**2 for p, g in gradients.iteritems()}

    for p_u, p_ls2, p_s2 in Beta:
        p_grad = gradients[p_u]
        p_u_grad = (model_cost_coefficient * (p_u - prior_u) /
                    tensor.cast(num_examples * prior_s2, 'float32') +
                    p_grad)

        p_ls2_grad = (numpy.float32(model_cost_coefficient *
                                    0.5 / num_examples * log_sigma_scale) *
                      (p_s2 / prior_s2 - 1.0) +
                      (0.5 * log_sigma_scale) * p_s2 *
                      diag_hessian_estimate[p_u])
        new_grads[p_u] = p_u_grad
        new_grads[p_ls2] = p_ls2_grad

    return train_cost, full_computation_graph, new_grads, noise_brick