Example #1
def test_replace_variable_not_in_graph():
    # Test if warning appears when variable is not in graph
    with warnings.catch_warnings(record=True) as w:
        x = tensor.scalar()
        y = x + 1
        z = tensor.scalar()
        cg = ComputationGraph([y])
        cg.replace([(y, 2 * y), (z, 2 * z)])
        assert len(w) == 1
        assert "not a part of" in str(w[-1].message)
Example #2
def test_replace_variable_is_auxiliary():
    # Test if warning appears when variable is an AUXILIARY variable
    with warnings.catch_warnings(record=True) as w:
        x = tensor.scalar()
        y = x + 1
        add_role(y, AUXILIARY)
        cg = ComputationGraph([y])
        cg.replace([(y, 2 * y)])
        assert len(w) == 1
        assert "auxiliary" in str(w[-1].message)
Example #3
    def get_cost_graph(self, batch=True,
                       prediction=None, prediction_mask=None):

        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        if not prediction:
            prediction = groundtruth
        if not prediction_mask:
            prediction_mask = groundtruth_mask

        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg
Example #4
    def get_cost_graph(self,
                       batch=True,
                       prediction=None,
                       prediction_mask=None):

        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        if not prediction:
            prediction = groundtruth
        if not prediction_mask:
            prediction_mask = groundtruth_mask

        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg
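The mse branch in both get_cost_graph variants above relies on replacing a named placeholder inside an already-built cost graph. A minimal self-contained sketch of that pattern; the variable names here are illustrative, not taken from the original code:

from theano import tensor

from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph

# Build a cost graph that contains a named placeholder.
placeholder = tensor.matrix('groundtruth')
prediction = tensor.matrix('prediction')
cost = ((prediction - placeholder) ** 2).mean()
cost_cg = ComputationGraph(cost)

# Locate the placeholder by its Theano name and swap in the real data variable.
labels = tensor.matrix('labels')
found, = VariableFilter(theano_name='groundtruth')(cost_cg.variables)
cost_cg = cost_cg.replace({found: labels})

# The replaced graph now takes 'labels' instead of 'groundtruth' as an input.
assert 'groundtruth' not in [v.name for v in cost_cg.inputs]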
Example #5
def test_computation_graph():
    x = tensor.matrix('x')
    y = tensor.matrix('y')
    z = x + y
    a = z.copy()
    a.name = 'a'
    b = z.copy()
    b.name = 'b'
    r = tensor.matrix('r')

    cg = ComputationGraph([a, b])
    assert set(cg.inputs) == {x, y}
    assert set(cg.outputs) == {a, b}
    assert set(cg.variables) == {x, y, z, a, b}
    assert cg.variables[2] is z
    assert ComputationGraph(a).inputs == cg.inputs

    cg2 = cg.replace({z: r})
    assert set(cg2.inputs) == {r}
    assert set([v.name for v in cg2.outputs]) == {'a', 'b'}

    W = theano.shared(numpy.zeros((3, 3), dtype=floatX))
    cg3 = ComputationGraph([z + W])
    assert set(cg3.shared_variables) == {W}

    cg4 = ComputationGraph([W])
    assert cg4.variables == [W]

    w1 = W**2
    cg5 = ComputationGraph([w1])
    assert W in cg5.variables
    assert w1 in cg5.variables
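As a usage note for the replacement above, the graph returned by replace can be compiled and evaluated directly. A minimal self-contained sketch (assuming the same Blocks and Theano imports; names are illustrative):

import numpy
import theano
from theano import tensor

from blocks.graph import ComputationGraph

x = tensor.matrix('x')
y = tensor.matrix('y')
z = x + y
a = z.copy()
a.name = 'a'
r = tensor.matrix('r')

# Replace the intermediate z with the fresh input r; x and y drop out of the graph.
cg2 = ComputationGraph([a]).replace({z: r})
f = theano.function(cg2.inputs, cg2.outputs)

value = numpy.ones((2, 2), dtype=theano.config.floatX)
# The only remaining input is r, and the output is simply a copy of it.
assert numpy.allclose(f(value)[0], value)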
Example #6
def test_replace():
    # Test if replace works with outputs
    x = tensor.scalar()
    y = x + 1
    cg = ComputationGraph([y])
    doubled_cg = cg.replace([(y, 2 * y)])
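    # With x = 2, y = x + 1 = 3, so the replaced output computes 2 * y = 6.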
    out_val = doubled_cg.outputs[0].eval({x: 2})
    assert out_val == 6.0
Example #7
def test_computation_graph():
    x = tensor.matrix('x')
    y = tensor.matrix('y')
    z = x + y
    z.name = 'z'
    a = z.copy()
    a.name = 'a'
    b = z.copy()
    b.name = 'b'
    r = tensor.matrix('r')

    cg = ComputationGraph([a, b])
    assert set(cg.inputs) == {x, y}
    assert set(cg.outputs) == {a, b}
    assert set(cg.variables) == {x, y, z, a, b}
    assert cg.variables[2] is z
    assert ComputationGraph(a).inputs == cg.inputs

    cg2 = cg.replace({z: r})
    assert set(cg2.inputs) == {r}
    assert set([v.name for v in cg2.outputs]) == {'a', 'b'}

    W = theano.shared(numpy.zeros((3, 3),
                                  dtype=theano.config.floatX))
    cg3 = ComputationGraph([z + W])
    assert set(cg3.shared_variables) == {W}

    cg4 = ComputationGraph([W])
    assert cg4.variables == [W]

    w1 = W ** 2
    cg5 = ComputationGraph([w1])
    assert W in cg5.variables
    assert w1 in cg5.variables

    # Test scan
    s, _ = theano.scan(lambda inp, accum: accum + inp,
                       sequences=x,
                       outputs_info=tensor.zeros_like(x[0]))
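    # s is a slice of scan's full output sequence; the variable it is sliced
    # from is produced by the Scan op, which is recovered below.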
    scan = s.owner.inputs[0].owner.op
    cg6 = ComputationGraph(s)
    assert cg6.scans == [scan]
    assert all(v in cg6.scan_variables for v in scan.inputs + scan.outputs)
Example #8
    def _get_bn_params(self, output_vars):
        # Pick out the nodes with batch normalization vars
        cg = ComputationGraph(output_vars)
        var_filter = VariableFilter(roles=[BNPARAM])
        bn_ps = var_filter(cg.variables)

        if len(bn_ps) == 0:
            logger.warn('No batch normalization parameters found - is' +
                        ' batch normalization turned off?')
            self._bn = False
            self._counter = None
            self._counter_max = None
            bn_share = []
            output_vars_replaced = output_vars
        else:
            self._bn = True
            assert len(set([p.name for p in bn_ps])) == len(bn_ps), \
                'Some batch norm params have the same name'
            logger.info('Batch norm parameters: %s' %
                        ', '.join([p.name for p in bn_ps]))

            # Filter out the shared variables from the model updates
            def filter_share(par):
                lst = [
                    up for up in cg.updates
                    if up.name == 'shared_%s' % par.name
                ]
                assert len(lst) == 1
                return lst[0]

            bn_share = list(map(filter_share, bn_ps))

            # Replace the BN coefficients in the test data model: swap the
            # Theano variables in the test graph for the shared variables
            output_vars_replaced = cg.replace(list(zip(bn_ps,
                                                       bn_share))).outputs

            # Pick out the counter
            self._counter = self._param_from_updates(cg.updates, 'counter')
            self._counter_max = self._param_from_updates(
                cg.updates, 'counter_max')

        return bn_ps, bn_share, output_vars_replaced
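The technique in _get_bn_params (select graph variables by a role, then substitute shared variables through replace) can be exercised in isolation. A minimal sketch that uses the stock PARAMETER role as a stand-in for the custom BNPARAM role and a shared running value in place of a batch statistic; all names below are illustrative:

import numpy
import theano
from theano import tensor

from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.roles import add_role, PARAMETER

x = tensor.vector('x')
batch_stat = x.mean()            # stands in for a batch-computed statistic
batch_stat.name = 'batch_stat'
add_role(batch_stat, PARAMETER)  # stand-in for the custom BNPARAM role
cg = ComputationGraph([x * batch_stat])

# Select the tagged variable and replace it with a stored running value.
stat, = VariableFilter(roles=[PARAMETER])(cg.variables)
running = theano.shared(numpy.asarray(1.0, dtype=theano.config.floatX),
                        name='shared_batch_stat')
inference_cg = cg.replace({stat: running})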
Example #9
def test_computation_graph():
    x = tensor.matrix('x')
    y = tensor.matrix('y')
    z = x + y
    a = z.copy()
    a.name = 'a'
    b = z.copy()
    b.name = 'b'
    r = tensor.matrix('r')

    cg = ComputationGraph([a, b])
    assert set(cg.inputs) == {x, y}
    assert set(cg.outputs) == {a, b}
    assert set(cg.variables) == {x, y, z, a, b}
    assert ComputationGraph(a).inputs == cg.inputs

    cg2 = cg.replace({z: r})
    assert set(cg2.inputs) == {r}
    assert set([v.name for v in cg2.outputs]) == {'a', 'b'}
Example #10
    def get_cost_graph(self, batch=True,
                       prediction=None, prediction_mask=None):
        if batch:
            recordings = self.recordings
            recordings_mask = self.recordings_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            recordings = self.single_recording[:, None, :]
            recordings_mask = tensor.ones_like(recordings[:, :, 0])
            groundtruth = self.single_transcription[:, None]
            groundtruth_mask = None
        if not prediction:
            prediction = groundtruth
        if not prediction_mask:
            prediction_mask = groundtruth_mask
        cost = self.cost(recordings, recordings_mask,
                         prediction, prediction_mask)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg
Example #11
    def _get_bn_params(self, output_vars):
        # Pick out the nodes with batch normalization vars
        cg = ComputationGraph(output_vars)
        var_filter = VariableFilter(roles=[BNPARAM])
        bn_ps = var_filter(cg.variables)

        if len(bn_ps) == 0:
            logger.warn('No batch normalization parameters found - is' +
                        ' batch normalization turned off?')
            self._bn = False
            self._counter = None
            self._counter_max = None
            bn_share = []
            output_vars_replaced = output_vars
        else:
            self._bn = True
            assert len(set([p.name for p in bn_ps])) == len(bn_ps), \
                'Some batch norm params have the same name'
            logger.info('Batch norm parameters: %s' % ', '.join([p.name for p in bn_ps]))

            # Filter out the shared variables from the model updates
            def filter_share(par):
                lst = [up for up in cg.updates if up.name == 'shared_%s' % par.name]
                assert len(lst) == 1
                return lst[0]
            bn_share = map(filter_share, bn_ps)

            # Replace the BN coefficients in the test data model: swap the
            # Theano variables in the test graph for the shared variables
            output_vars_replaced = cg.replace(zip(bn_ps, bn_share)).outputs

            # Pick out the counter
            self._counter = self._param_from_updates(cg.updates, 'counter')
            self._counter_max = self._param_from_updates(cg.updates, 'counter_max')

        return bn_ps, bn_share, output_vars_replaced
Example #12
def apply_adaptive_noise(computation_graph,
                         cost,
                         variables,
                         num_examples,
                         parameters=None,
                         init_sigma=1e-6,
                         model_cost_coefficient=1.0,
                         seed=None,
                         gradients=None,
                         ):
    """Add adaptive noise to parameters of a model.

    Each of the given variables will be replaced by a normal
    distribution with learned mean and standard deviation.

    A model cost is computed based on the precision of the distributions
    associated with each variable. It is added to the given cost used to
    train the model.

    See: A. Graves "Practical Variational Inference for Neural Networks",
         NIPS 2011

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    cost : :class:`~tensor.TensorVariable`
        The cost without weight noise. It should be a member of the
        computation_graph.
    variables : :class:`~tensor.TensorVariable`
        Variables to add noise to.
    num_examples : int
        Number of training examples. The cost of the model is divided by
        the number of training examples, please see
        A. Graves "Practical Variational Inference for Neural Networks"
        for justification
    parameters : list of :class:`~tensor.TensorVariable`
        Parameters of the model. If gradients are given, the list will not
        be used; otherwise, it will be used to compute the gradients.
    init_sigma : float
        initial standard deviation of noise variables
    model_cost_coefficient : float
        the weight of the model cost
    seed : int, optional
        The seed with which
        :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized,
        is set to 1 by default.
    gradients : dict, optional
        Adaptive weight noise introduces new parameters for which new cost
        and gradients must be computed. Unless the gradients parameter is
        given, theano.grad will be used to get the gradients.
    Returns
    -------

    cost : :class:`~tensor.TensorVariable`
        The new cost
    computation_graph : instance of :class:`ComputationGraph`
        new graph with added noise.
    gradients : dict
        a dictionary of gradients for all parameters: the original ones
        and the adaptive noise ones
    noise_brick : :class:`~lvsr.graph.NoiseBrick`
        the brick that holds all noise parameters and whose .apply method
        can be used to find variables added by adaptive noise
    """
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)

    try:
        cost_index = computation_graph.outputs.index(cost)
    except ValueError:
        raise ValueError("cost is not part of the computation_graph")

    if gradients is None:
        if parameters is None:
            raise ValueError("Either gradients or parameters must be given")
        logger.info("Taking the cost gradient")
        gradients = dict(equizip(parameters,
                                 tensor.grad(cost, parameters)))
    else:
        if parameters is not None:
            logger.warn("Both gradients and parameters given, will ignore"
                        "parameters")
        parameters = gradients.keys()

    gradients = OrderedDict(gradients)

    log_sigma_scale = 2048.0

    P_noisy = variables  # We will add noise to these
    Beta = []  # will hold means, log_stdev and stdevs
    P_with_noise = []  # will hold params with added noise

    # These don't change
    P_clean = list(set(parameters).difference(P_noisy))

    noise_brick = NoiseBrick()

    for p in P_noisy:
        p_u = p
        p_val = p.get_value(borrow=True)
        p_ls2 = theano.shared((numpy.zeros_like(p_val) +
                               numpy.log(init_sigma) * 2. / log_sigma_scale
                               ).astype(dtype=numpy.float32))
        p_ls2.name = __get_name(p_u)
        noise_brick.parameters.append(p_ls2)
        p_s2 = tensor.exp(p_ls2 * log_sigma_scale)
        Beta.append((p_u, p_ls2, p_s2))

        p_noisy = p_u + rng.normal(size=p_val.shape) * tensor.sqrt(p_s2)
        p_noisy = tensor.patternbroadcast(p_noisy, p.type.broadcastable)
        P_with_noise.append(p_noisy)

    #  compute the prior mean and variation
    temp_sum = 0.0
    temp_param_count = 0.0
    for p_u, unused_p_ls2, unused_p_s2 in Beta:
        temp_sum = temp_sum + p_u.sum()
        temp_param_count = temp_param_count + p_u.shape.prod()

    prior_u = tensor.cast(temp_sum / temp_param_count, 'float32')

    temp_sum = 0.0
    for p_u, unused_ls2, p_s2 in Beta:
        temp_sum = temp_sum + (p_s2).sum() + (((p_u-prior_u)**2).sum())

    prior_s2 = tensor.cast(temp_sum/temp_param_count, 'float32')

    #  convert everything to use the noisy parameters
    full_computation_graph = ComputationGraph(computation_graph.outputs +
                                              gradients.values())
    full_computation_graph = full_computation_graph.replace(
        dict(zip(P_noisy, P_with_noise)))

    LC = 0.0  # model cost
    for p_u, p_ls2, p_s2 in Beta:
        LC = (LC +
              0.5 * ((tensor.log(prior_s2) - p_ls2 * log_sigma_scale).sum()) +
              1.0 / (2.0 * prior_s2) * (((p_u - prior_u)**2) + p_s2 - prior_s2
                                        ).sum()
              )

    LC = LC / num_examples * model_cost_coefficient

    train_cost = noise_brick.apply(
        full_computation_graph.outputs[cost_index].copy(), LC,
        prior_u, prior_s2)

    gradients = OrderedDict(
        zip(gradients.keys(),
            full_computation_graph.outputs[-len(gradients):]))

    #
    # Delete the gradients from the computational graph
    #
    del full_computation_graph.outputs[-len(gradients):]

    new_grads = {p: gradients.pop(p) for p in P_clean}

    #
    # Warning!!!
    # This only works for batch size 1 (we want the sum of squares
    # to be the square of the sum!)
    #
    diag_hessian_estimate = {p: g**2 for p, g in gradients.iteritems()}

    for p_u, p_ls2, p_s2 in Beta:
        p_grad = gradients[p_u]
        p_u_grad = (model_cost_coefficient * (p_u - prior_u) /
                    (num_examples*prior_s2) + p_grad)

        p_ls2_grad = (numpy.float32(model_cost_coefficient *
                                    0.5 / num_examples * log_sigma_scale) *
                      (p_s2/prior_s2 - 1.0) +
                      (0.5*log_sigma_scale) * p_s2 * diag_hessian_estimate[p_u]
                      )
        new_grads[p_u] = p_u_grad
        new_grads[p_ls2] = p_ls2_grad

    return train_cost, full_computation_graph, new_grads, noise_brick
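A note on the model cost accumulated in the loop over Beta above: since p_s2 = exp(p_ls2 * log_sigma_scale), each per-parameter term is the closed-form KL divergence between the learned Gaussian posterior and the adapted prior. In LaTeX notation, with \mu_i, \sigma_i^2 standing for the entries of p_u and p_s2, and \mu_0, \sigma_0^2 for prior_u and prior_s2:

\mathrm{KL}\bigl(\mathcal{N}(\mu_i, \sigma_i^2) \,\|\, \mathcal{N}(\mu_0, \sigma_0^2)\bigr)
    = \tfrac{1}{2} \log \frac{\sigma_0^2}{\sigma_i^2}
    + \frac{(\mu_i - \mu_0)^2 + \sigma_i^2 - \sigma_0^2}{2 \sigma_0^2}

Summing over all noisy parameter entries, dividing by num_examples, and scaling by model_cost_coefficient gives LC, matching the variational objective of Graves (2011).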
Example #13
def apply_adaptive_noise(
    computation_graph,
    cost,
    variables,
    num_examples,
    parameters=None,
    init_sigma=1e-6,
    model_cost_coefficient=1.0,
    seed=None,
    gradients=None,
):
    """Add adaptive noise to parameters of a model.

    Each of the given variables will be replaced by a normal
    distribution with learned mean and standard deviation.

    A model cost is computed based on the precision of the distributions
    associated with each variable. It is added to the given cost used to
    train the model.

    See: A. Graves "Practical Variational Inference for Neural Networks",
         NIPS 2011

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    cost : :class:`~tensor.TensorVariable`
        The cost without weight noise. It should be a member of the
        computation_graph.
    variables : :class:`~tensor.TensorVariable`
        Variables to add noise to.
    num_examples : int
        Number of training examples. The cost of the model is divided by
        the number of training examples, please see
        A. Graves "Practical Variational Inference for Neural Networks"
        for justification
    parameters : list of :class:`~tensor.TensorVariable`
        Parameters of the model. If gradients are given, the list will not
        be used; otherwise, it will be used to compute the gradients.
    init_sigma : float
        initial standard deviation of noise variables
    model_cost_coefficient : float
        the weight of the model cost
    seed : int, optional
        The seed with which
        :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized,
        is set to 1 by default.
    gradients : dict, optional
        Adaptive weight noise introduces new parameters for which new cost
        and gradients must be computed. Unless the gradients parameter is
        given, theano.grad will be used to get the gradients.
    Returns
    -------

    cost : :class:`~tensor.TensorVariable`
        The new cost
    computation_graph : instance of :class:`ComputationGraph`
        new graph with added noise.
    gradients : dict
        a dictionary of gradients for all parameters: the original ones
        and the adaptive noise ones
    noise_brick : :class:`~lvsr.graph.NoiseBrick`
        the brick that holds all noise parameters and whose .apply method
        can be used to find variables added by adaptive noise
    """
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)

    num_examples = numpy.asarray(num_examples, dtype='int32')

    try:
        cost_index = computation_graph.outputs.index(cost)
    except ValueError:
        raise ValueError("cost is not part of the computation_graph")

    if gradients is None:
        if parameters is None:
            raise ValueError("Either gradients or parameters must be given")
        logger.info("Taking the cost gradient")
        gradients = dict(equizip(parameters, tensor.grad(cost, parameters)))
    else:
        if parameters is not None:
            logger.warn("Both gradients and parameters given, will ignore"
                        "parameters")
        parameters = gradients.keys()

    gradients = OrderedDict(gradients)

    log_sigma_scale = 2048.0

    P_noisy = variables  # We will add noise to these
    Beta = []  # will hold means, log_stdev and stdevs
    P_with_noise = []  # will hold params with added noise

    # These don't change
    P_clean = list(set(parameters).difference(P_noisy))

    noise_brick = NoiseBrick()

    for p in P_noisy:
        p_u = p
        p_val = p.get_value(borrow=True)
        p_ls2 = theano.shared(
            (numpy.zeros_like(p_val) +
             numpy.log(init_sigma) * 2. / log_sigma_scale).astype(
                 dtype=numpy.float32))
        p_ls2.name = __get_name(p_u)
        noise_brick.parameters.append(p_ls2)
        p_s2 = tensor.exp(p_ls2 * log_sigma_scale)
        Beta.append((p_u, p_ls2, p_s2))

        p_noisy = p_u + rng.normal(size=p_val.shape) * tensor.sqrt(p_s2)
        p_noisy = tensor.patternbroadcast(p_noisy, p.type.broadcastable)
        P_with_noise.append(p_noisy)

    #  compute the prior mean and variation
    temp_sum = 0.0
    temp_param_count = 0.0
    for p_u, unused_p_ls2, unused_p_s2 in Beta:
        temp_sum = temp_sum + p_u.sum()
        temp_param_count = temp_param_count + p_u.shape.prod()

    prior_u = tensor.cast(temp_sum / temp_param_count, 'float32')

    temp_sum = 0.0
    for p_u, unused_ls2, p_s2 in Beta:
        temp_sum = temp_sum + (p_s2).sum() + (((p_u - prior_u)**2).sum())

    prior_s2 = tensor.cast(temp_sum / temp_param_count, 'float32')

    #  convert everything to use the noisy parameters
    full_computation_graph = ComputationGraph(computation_graph.outputs +
                                              gradients.values())
    full_computation_graph = full_computation_graph.replace(
        dict(zip(P_noisy, P_with_noise)))

    LC = 0.0  # model cost
    for p_u, p_ls2, p_s2 in Beta:
        LC = (LC + 0.5 *
              ((tensor.log(prior_s2) - p_ls2 * log_sigma_scale).sum()) + 1.0 /
              (2.0 * prior_s2) *
              (((p_u - prior_u)**2) + p_s2 - prior_s2).sum())

    LC = LC / num_examples * model_cost_coefficient

    train_cost = noise_brick.apply(
        full_computation_graph.outputs[cost_index].copy(), LC, prior_u,
        prior_s2)

    gradients = OrderedDict(
        zip(gradients.keys(),
            full_computation_graph.outputs[-len(gradients):]))

    #
    # Delete the gradients from the computational graph
    #
    del full_computation_graph.outputs[-len(gradients):]

    new_grads = {p: gradients.pop(p) for p in P_clean}

    #
    # Warning!!!
    # This only works for batch size 1 (we want the sum of squares
    # to be the square of the sum!)
    #
    diag_hessian_estimate = {p: g**2 for p, g in gradients.iteritems()}

    for p_u, p_ls2, p_s2 in Beta:
        p_grad = gradients[p_u]
        p_u_grad = (
            model_cost_coefficient *
            (p_u - prior_u) / tensor.cast(num_examples * prior_s2, 'float32') +
            p_grad)

        p_ls2_grad = (
            numpy.float32(model_cost_coefficient * 0.5 / num_examples *
                          log_sigma_scale) * (p_s2 / prior_s2 - 1.0) +
            (0.5 * log_sigma_scale) * p_s2 * diag_hessian_estimate[p_u])
        new_grads[p_u] = p_u_grad
        new_grads[p_ls2] = p_ls2_grad

    return train_cost, full_computation_graph, new_grads, noise_brick