Example #1
    def get_monitoring_channels(self, X, Y, **kwargs):

        theano_rng = MRG_RandomStreams(2012 + 12 + 19)
        # Explanation of reality
        zh1, rh1 = self.infer_h1(X)
        rh1 = block_gradient(rh1)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = theano_rng.binomial(p = T.nnet.sigmoid(zh2), size = zh2.shape, dtype='float32')
        rh2 = block_gradient(rh2)
        y = T.dot(rh2, self.ryw) + self.ryb

        err = T.neq(T.argmax(y, axis=1), T.argmax(Y, axis=1))
        assert err.ndim == 1

        return { 'misclass' : err.astype('float32').mean() }
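All of these snippets rely on block_gradient to make a subexpression behave as a constant during backpropagation. A minimal sketch of that assumed behaviour, using theano.gradient.disconnected_grad as a stand-in (the actual implementation in the original code base may differ):

    import theano.tensor as T
    from theano import gradient

    def block_gradient(x):
        # Forward value is unchanged; T.grad treats x as a constant.
        return gradient.disconnected_grad(x)

    x = T.vector('x')
    y = (block_gradient(x) * x).sum()
    g = T.grad(y, x)   # equals x, since only the unblocked factor contributes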
Example #2
    def cost(self, Y, Y_hat):
        # Pull out the argument to the sigmoid
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op

        if not hasattr(op, 'scalar_op'):
            raise ValueError("Expected Y_hat to be generated by an Elemwise "
                             "op, got " + str(op) + " of type " +
                             str(type(op)))
        assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
        z, = owner.inputs
        # Broadcasted multiplication with the gradient mask.
        if self._monitor_individual:
            z = (z * self._gradient_mask +
                 block_gradient(z) * (1. - self._gradient_mask))
        # Geometric mean.
        z = z.mean(axis=1)

        # Expecting binary targets.
        term_1 = Y[:, 0] * T.nnet.softplus(-z)
        term_2 = (1 - Y[:, 0]) * T.nnet.softplus(z)

        total = term_1 + term_2
        assert total.ndim == 1
        return total.mean()
Example #3
    def arithmetic_mean(self, state):
        reshaped = state.reshape((state.shape[0], self._n_replicas,
                                  state.shape[1] / self._n_replicas))
        broadcasted_mask = self._grad_mask.dimshuffle('x', 0, 'x')
        unblocked = reshaped * broadcasted_mask
        blocked = block_gradient(reshaped) * (np.float32(1) - broadcasted_mask)
        return (unblocked + blocked).mean(axis=1)
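Example #3 above, like the masked costs in Examples #2 and #5 and geometric_mean further down, uses the idiom x * mask + block_gradient(x) * (1 - mask): the forward value of x is unchanged, but gradients flow only where the mask is 1. A toy check under the same disconnected_grad assumption:

    import theano
    import theano.tensor as T
    from theano import gradient

    x = T.vector('x')
    mask = T.vector('mask')
    mixed = x * mask + gradient.disconnected_grad(x) * (1. - mask)
    f = theano.function([x, mask], T.grad(mixed.sum(), x))
    # f([1., 2., 3.], [1., 0., 1.]) -> [1., 0., 1.]: masked-out entries get no gradient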
Example #4
    def __call__(self, model, X, Y, **kwargs):

        Y_hat_e = model.fprop(X)
        Y_hat = model.fprop(X, apply_dropout=True)

        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z_weight = Y_hat - Y_hat_e
        z_weight = block_gradient(z_weight)
        neg = z_weight * z
        neg = neg.sum(axis=1).mean()

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1
        log_prob_of = log_prob_of.mean()

        return -(log_prob_of + self.alpha * neg)
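Example #4 (and Example #7 below) rebuilds a numerically stable log-softmax from the pre-softmax activations z rather than calling T.log on Y_hat. The same trick as a standalone helper, shown only as a sketch:

    import theano.tensor as T

    def log_softmax(z):
        # Subtract the row-wise max before exponentiating to avoid overflow,
        # then subtract the log of the row-wise partition function.
        z = z - z.max(axis=1, keepdims=True)
        return z - T.log(T.exp(z).sum(axis=1, keepdims=True))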
Example #5
    def cost(self, Y, Y_hat):
        # Pull out the argument to the sigmoid
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op

        if not hasattr(op, 'scalar_op'):
            raise ValueError("Expected Y_hat to be generated by an Elemwise "
                             "op, got " + str(op) + " of type " +
                             str(type(op)))
        assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
        z, = owner.inputs
        # Broadcasted multiplication with the gradient mask.
        if self._monitor_individual:
            z = (z * self._gradient_mask + block_gradient(z) *
                 (1. - self._gradient_mask))
        # Geometric mean.
        z = z.mean(axis=1)

        # Expecting binary targets.
        term_1 = Y[:, 0] * T.nnet.softplus(-z)
        term_2 = (1 - Y[:, 0]) * T.nnet.softplus(z)

        total = term_1 + term_2
        assert total.ndim == 1
        return total.mean()
Example #6
    def __call__(self, model, X, Y, **kwargs):

        Y_hat = model.fprop(X, apply_dropout=False)
        prob = Y_hat * Y + (1-Y_hat) * (1-Y)

        weight = 1./(.1 + prob)

        weight = block_gradient(weight)

        Y_hat = model.fprop(X, apply_dropout=True)
        # Pull out the argument to the sigmoid
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op

        if not hasattr(op, 'scalar_op'):
            raise ValueError("Expected Y_hat to be generated by an Elemwise "
                             "op, got " + str(op) + " of type " +
                             str(type(op)))
        assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
        Z, = owner.inputs

        term_1 = Y * T.nnet.softplus(-Z)
        term_2 = (1 - Y) * T.nnet.softplus(Z)

        total = term_1 + term_2

        total = weight * total

        ave = total.mean()

        return ave
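Example #6 reweights the per-example cross-entropy by 1 / (.1 + prob), where prob is the ensemble's probability of the correct label, and blocks the gradient so the weight acts as a constant importance factor. A small numeric illustration of that weighting curve:

    import numpy as np

    prob = np.linspace(0., 1., 5)      # ensemble probability of the true label
    weight = 1. / (.1 + prob)
    # prob:   0.00  0.25  0.50  0.75  1.00
    # weight: 10.0  2.86  1.67  1.18  0.91  -> hard examples get roughly 10x the weight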
Example #7
    def __call__(self, model, X, Y, **kwargs):
        Y_hat, Y_hat_e = model.lone_ranger_dropout_fprop(
            X,
            default_input_include_prob=self.default_input_include_prob,
            input_include_probs=self.input_include_probs,
            default_input_scale=self.default_input_scale,
            input_scales=self.input_scales,
            scale_ensemble=self.scale_ensemble,
            dont_drop_input=self.dont_drop_input)

        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z_weight = Y_hat - Y_hat_e
        z_weight = block_gradient(z_weight)
        neg = z_weight * z
        neg = neg.sum(axis=1).mean()

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1
        log_prob_of = log_prob_of.mean()

        return -(log_prob_of + self.alpha * neg)
Example #8
    def dropout_fprop(self,
                      inputs,
                      default_input_include_prob=0.5,
                      input_include_probs=None,
                      default_input_scale=2.,
                      input_scales=None,
                      per_example=True):

        inputs = self.input_space.format_as(inputs, self.mlp.input_space)
        if self.scale:
            inputs = inputs / 255.
        rval = self.mlp.dropout_fprop(inputs, default_input_include_prob,
                                      input_include_probs, default_input_scale,
                                      input_scales, per_example)

        if self.pooling_mode == 0:
            rval = tensor.max(rval, axis=0)
        elif self.pooling_mode == 1:
            # Indices (not values) of the three largest activations per column;
            # gradients are blocked through the argsort.
            top_ids = block_gradient(tensor.argsort(rval, axis=0))[-3:][::-1]
            top_vals = rval[top_ids, tensor.arange(rval.shape[1])]
            rval = T.mean(top_vals)
        elif self.pooling_mode == 2:
            #import ipdb; ipdb.set_trace()
            #collapsed_rval = tensor.sum(rval, axis=1)
            top_ids = block_gradient(tensor.argsort(rval, axis=0))[::-1]
            top_vals_sum = rval[top_ids[0],
                                tensor.arange(rval.shape[1])] * self.probs[0]
            #+\
            #rval[top_ids[1], tensor.arange(rval.shape[1])] * self.probs[1]
            #+\
            #rval[top_ids[2], tensor.arange(rval.shape[1])] * self.probs[2]
            rval = top_vals_sum / 2
        else:
            raise Exception("Others are not implemented yet!")
        rval = rval.dimshuffle('x', 0)

        # TODO if you set input prob, the final layer doesn't recognize h0
        if input_include_probs is None and input_scales is None:
            rval = self.final_layer.dropout_fprop(rval,
                                                  default_input_include_prob,
                                                  input_include_probs,
                                                  default_input_scale,
                                                  input_scales, per_example)
        else:
            rval = self.final_layer.fprop(rval)

        return rval
Example #9
    def get_samples_and_objectives(self, model, data):
        space, sources = self.get_data_specs(model)
        space.validate(data)
        assert isinstance(model, AdversaryPair)
        g = model.generator
        d = model.discriminator

        # Note: this assumes data is design matrix
        X = data
        m = data.shape[space.get_batch_axis()]
        y1 = T.alloc(1, m, 1)
        y0 = T.alloc(0, m, 1)
        # NOTE: if this changes to optionally use dropout, change the inference
        # code below to use a non-dropped-out version.
        S, z, other_layers = g.sample_and_noise(
            m,
            default_input_include_prob=self.generator_default_input_include_prob,
            default_input_scale=self.generator_default_input_scale,
            all_g_layers=(self.infer_layer is not None))

        if self.noise_both != 0.:
            rng = MRG_RandomStreams(2014 / 6 + 2)
            S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both
            X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both

        y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob,
                                     self.discriminator_input_include_probs,
                                     self.discriminator_default_input_scale,
                                     self.discriminator_input_scales)
        y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob,
                                     self.discriminator_input_include_probs,
                                     self.discriminator_default_input_scale,
                                     self.discriminator_input_scales)

        d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) +
                       d.layers[-1].cost(y0, y_hat0))

        if self.no_drop_in_d_for_g:
            y_hat0_no_drop = d.dropout_fprop(S)
            g_obj = d.layers[-1].cost(y1, y_hat0_no_drop)
        else:
            g_obj = d.layers[-1].cost(y1, y_hat0)

        if self.blend_obj:
            g_obj = (self.zurich_coeff * g_obj - self.minimax_coeff * d_obj) / (self.zurich_coeff + self.minimax_coeff)

        if model.inferer is not None:
            # Change this if we ever switch to using dropout in the
            # construction of S.
            S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
            pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
                                                self.inference_input_include_probs,
                                                self.inference_default_input_scale,
                                                self.inference_input_scales)
            if self.infer_layer is None:
                target = z
            else:
                target = other_layers[self.infer_layer]
            i_obj = model.inferer.layers[-1].cost(target, pred)
        else:
            i_obj = 0

        return S, d_obj, g_obj, i_obj
Example #10
    def geometric_mean(self, state):
        pre = extract_op_argument(state, recurse=2)
        rsh = pre.reshape((pre.shape[0], self._n_replicas,
                           pre.shape[1] / self._n_replicas))
        broadcasted_mask = self._grad_mask.dimshuffle('x', 0, 'x')
        unblocked = rsh * broadcasted_mask
        blocked = block_gradient(rsh) * (np.float32(1) - broadcasted_mask)
        geo = T.nnet.softmax((unblocked + blocked).mean(axis=1))
        return geo
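In geometric_mean above, the softmax of the averaged pre-activations equals, after renormalisation, the geometric mean of the per-replica softmax distributions. A small NumPy check of that identity, independent of the Theano graph:

    import numpy as np

    def softmax(z):
        e = np.exp(z - z.max())
        return e / e.sum()

    zs = np.random.randn(4, 10)                 # 4 replicas, 10 classes
    lhs = softmax(zs.mean(axis=0))              # softmax of the averaged logits
    geo = np.prod([softmax(z) for z in zs], axis=0) ** (1. / len(zs))
    assert np.allclose(lhs, geo / geo.sum())    # renormalised geometric mean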
Example #11
    def get_weight(self, model, X, Y):

        ensemble_Y = model.fprop(X, apply_dropout=False)
        prob_of = (ensemble_Y * Y).sum(axis=1)

        weight = 1./ (self.k + self.alpha * (prob_of - self.beta * 1./T.cast(Y.shape[1], 'float32')))
        weight = weight / weight.sum()
        weight = block_gradient(weight)
        return weight
Example #12
    def dropout_fprop(self, inputs, default_input_include_prob=0.5,
                    input_include_probs=None, default_input_scale=2.,
                    input_scales=None, per_example=True):

        inputs = self.input_space.format_as(inputs, self.mlp.input_space)
        if self.scale:
            inputs = inputs / 255.
        rval = self.mlp.dropout_fprop(inputs, default_input_include_prob,
                    input_include_probs, default_input_scale,
                    input_scales, per_example)

        if self.pooling_mode == 0:
            rval = tensor.max(rval, axis=0)
        elif self.pooling_mode == 1:
            # Indices (not values) of the three largest activations per column;
            # gradients are blocked through the argsort.
            top_ids = block_gradient(tensor.argsort(rval, axis=0))[-3:][::-1]
            top_vals = rval[top_ids, tensor.arange(rval.shape[1])]
            rval = T.mean(top_vals)
        elif self.pooling_mode == 2:
            #import ipdb; ipdb.set_trace()
            #collapsed_rval = tensor.sum(rval, axis=1)
            top_ids = block_gradient(tensor.argsort(rval, axis=0))[::-1]
            top_vals_sum = rval[top_ids[0], tensor.arange(rval.shape[1])] * self.probs[0]
                    #+\
                    #rval[top_ids[1], tensor.arange(rval.shape[1])] * self.probs[1]
                    #+\
                    #rval[top_ids[2], tensor.arange(rval.shape[1])] * self.probs[2]
            rval = top_vals_sum / 2
        else:
            raise Exception("Others are not implemented yet!")
        rval = rval.dimshuffle('x', 0)

        # TODO if you set input prob, the final layer doesn't recognize h0
        if input_include_probs is None and input_scales is None:
            rval = self.final_layer.dropout_fprop(rval, default_input_include_prob,
                    input_include_probs, default_input_scale,
                    input_scales, per_example)
        else:
            rval = self.final_layer.fprop(rval)

        return rval
Example #13
def block(l):
    """
    .. todo::

        WRITEME
    """
    new = []
    for elem in l:
        if isinstance(elem, (list, tuple)):
            new.append(block(elem))
        else:
            new.append(block_gradient(elem))
    if isinstance(l, tuple):
        return tuple(new)
    return new
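A small usage sketch for block; the variable names below are hypothetical Theano variables, not taken from the original code:

    # block preserves the nesting and wraps every leaf in block_gradient,
    # so none of these variables receive gradients through the result.
    frozen = block([W, [b1, b2], (scale, shift)])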
Example #14
    def fprop(self, inputs):

        # format inputs
        inputs = self.input_space.format_as(inputs, self.mlp.input_space)
        rval = self.mlp.fprop(inputs)
        if self.pooling_mode == 0:
            rval = tensor.max(rval, axis=0)
        elif self.pooling_mode == 1:
            rval = block_gradient(tensor.sort(rval, axis=0))[-3:][::-1]
            rval = tensor.sum(rval * self.probs, axis=0) / 3
            #rval = tensor.mean(rval, axis=0)
        else:
            raise Exception("Others are not implemented yet!")
        rval = rval.dimshuffle('x', 0)
        rval = self.final_layer.fprop(rval)

        return rval
Example #15
    def get_samples_and_objectives(self, model, data):
        space, sources = self.get_data_specs(model)
        space.validate(data)
        assert isinstance(model, AdversaryPair)
        g = model.generator
        d = model.discriminator

        # Note: this assumes data is b01c
        X = data
        assert X.ndim == 4
        m = data.shape[space.get_batch_axis()]
        y1 = T.alloc(1, m, 1)
        y0 = T.alloc(0, m, 1)
        # NOTE: if this changes to optionally use dropout, change the inference
        # code below to use a non-dropped-out version.
        S, z = g.inpainting_sample_and_noise(
            X,
            default_input_include_prob=self.generator_default_input_include_prob,
            default_input_scale=self.generator_default_input_scale)
        y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob,
                                     self.discriminator_input_include_probs,
                                     self.discriminator_default_input_scale,
                                     self.discriminator_input_scales)
        y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob,
                                     self.discriminator_input_include_probs,
                                     self.discriminator_default_input_scale,
                                     self.discriminator_input_scales)

        d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) +
                       d.layers[-1].cost(y0, y_hat0))

        if self.no_drop_in_d_for_g:
            y_hat0_no_drop = d.dropout_fprop(S)
            g_obj = d.layers[-1].cost(y1, y_hat0_no_drop)
        else:
            g_obj = d.layers[-1].cost(y1, y_hat0)

        if model.inferer is not None:
            # Change this if we ever switch to using dropout in the
            # construction of S.
            S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
            z_hat = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
                                                self.inference_input_include_probs,
                                                self.inference_default_input_scale,
                                                self.inference_input_scales)
            i_obj = model.inferer.layers[-1].cost(z, z_hat)
        else:
            i_obj = 0

        return S, d_obj, g_obj, i_obj
Example #16
    def get_gradients(self, model, X, Y=None, **kwargs):

        assert 'dual' not in kwargs
        updates = {}

        if self.use_admm:
            rho = self.constraint_coeff * 2.
            dual = model.dual
            WBW = T.dot(model.W.T * model.beta, model.W)
            target = T.identity_like(WBW)
            err = WBW - target
            new_dual = dual + rho * err
            new_dual = block_gradient(new_dual)
            kwargs['dual'] = new_dual
            updates[dual] = new_dual

        cost = self(model, X, Y, **kwargs)

        params = model.get_params()
        assert not isinstance(params, set)
        return dict(zip(params, T.grad(cost, params))), updates
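The blocked dual above implements a scaled (ADMM-style) dual ascent step for the constraint that T.dot(W.T * beta, W) stays close to the identity; blocking its gradient lets it enter the cost as a constant multiplier. How the dual is consumed inside self.__call__ is not shown here, so the penalty below is only an assumed sketch of the usual augmented-Lagrangian form:

    import theano.tensor as T

    def constraint_penalty(W, beta, dual, constraint_coeff):
        # Linear dual term plus quadratic penalty, with rho = 2 * constraint_coeff
        # matching the dual update in get_gradients above.
        WBW = T.dot(W.T * beta, W)
        err = WBW - T.identity_like(WBW)
        return (dual * err).sum() + constraint_coeff * T.sqr(err).sum()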
Example #17
    def __call__(self, model, X, Y, **kwargs):

        Y = Y * 2 - 1

        # Get the approximate ensemble predictions
        Y_hat = model.fprop(X, apply_dropout=False)
        # Pull out the argument to the sigmoid
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op

        if not hasattr(op, 'scalar_op'):
            raise ValueError("Expected Y_hat to be generated by an Elemwise "
                             "op, got " + str(op) + " of type " +
                             str(type(op)))
        assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
        F, = owner.inputs

        weights = - Y * T.nnet.softmax(-(Y * F).T).T

        weights = block_gradient(weights)


        # Get the individual model predictions
        Y_hat = model.fprop(X, apply_dropout=True)
        # Pull out the argument to the sigmoid
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op

        if not hasattr(op, 'scalar_op'):
            raise ValueError("Expected Y_hat to be generated by an Elemwise "
                             "op, got " + str(op) + " of type " +
                             str(type(op)))
        assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
        f, = owner.inputs

        cost = (weights * T.exp(-Y * f)).mean()

        assert cost.ndim == 0

        return cost
Example #18
    def get_cost(self, X, Y, **kwargs):

        # Dream
        theano_rng = MRG_RandomStreams(2012 + 12 + 18)
        exp_y = T.nnet.softmax(T.alloc(0., self.batch_size, self.n_classes) + self.gyb)
        dy = theano_rng.multinomial(pvals = exp_y, dtype='float32')
        dy = block_gradient(dy)
        exp_h2 = T.nnet.sigmoid(T.dot(dy, self.gh2w) + self.gh2b)
        dh2 = theano_rng.binomial(p = exp_h2, size = exp_h2.shape, dtype='float32')
        dh2 = block_gradient(dh2)
        exp_h1 = T.nnet.sigmoid(T.dot(dh2, self.gh1w) + self.gh1b)
        dh1 = theano_rng.binomial(p = exp_h1, size = exp_h1.shape, dtype='float32')
        dh1 = block_gradient(dh1)
        exp_v = T.nnet.sigmoid(T.dot(dh1, self.gvw) + self.gvb)
        dv = theano_rng.binomial(p = exp_v, size = exp_v.shape, dtype='float32')
        dv = block_gradient(dv)

        # Explanation of dream
        zh1, rh1 = self.infer_h1(dv)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = T.nnet.sigmoid(zh2)
        zy = T.dot(rh2, self.ryw) + self.ryb

        # Probability of dream
        dream_prob = sigmoid_prob(zh1, dh1) + sigmoid_prob(zh2, dh2) + softmax_prob(zy, dy)

        # Explanation of reality
        zh1, rh1 = self.infer_h1(X)
        rh1 = block_gradient(rh1)
        zh2 = T.dot(rh1, self.rh2w) + self.rh2b
        rh2 = theano_rng.binomial(p = T.nnet.sigmoid(zh2), size = zh2.shape, dtype='float32')
        rh2 = block_gradient(rh2)

        # Probability of reality
        real_prob = softmax_prob(T.alloc(0., self.batch_size, self.n_classes) + self.gyb, Y) + \
                sigmoid_prob(T.dot(Y, self.gh2w) + self.gh2b, rh2) + \
                sigmoid_prob(T.dot(rh2, self.gh1w) + self.gh1b, rh1) + \
                sigmoid_prob(T.dot(rh1, self.gvw) + self.gvb, X)

        return - dream_prob - real_prob + .0001 * (
            T.sqr(self.gvw).sum() + T.sqr(self.gh1w).sum() + \
                    T.sqr(self.gh2w).sum()
                )
Example #19
    def get_samples_and_objectives(self, model, data):
        space, sources = self.get_data_specs(model)
        space.validate(data)
        assert isinstance(model, AdversaryPair)
        g = model.generator
        d = model.discriminator

        # Note: this assumes data is design matrix
        X = data
        m = data.shape[space.get_batch_axis()]
        y1 = T.alloc(1, m, 1)
        y0 = T.alloc(0, m, 1)
        # NOTE: if this changes to optionally use dropout, change the inference
        # code below to use a non-dropped-out version.
        S, z, other_layers = g.sample_and_noise(
            m,
            default_input_include_prob=self.generator_default_input_include_prob,
            default_input_scale=self.generator_default_input_scale,
            all_g_layers=(self.infer_layer is not None))

        if self.noise_both != 0.:
            rng = MRG_RandomStreams(2014 / 6 + 2)
            S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both
            X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both

        y_hat1 = d.dropout_fprop(X,
                                 self.discriminator_default_input_include_prob,
                                 self.discriminator_input_include_probs,
                                 self.discriminator_default_input_scale,
                                 self.discriminator_input_scales)
        y_hat0 = d.dropout_fprop(S,
                                 self.discriminator_default_input_include_prob,
                                 self.discriminator_input_include_probs,
                                 self.discriminator_default_input_scale,
                                 self.discriminator_input_scales)

        # d_obj =  0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0))

        pos_mask = y_hat1 < .5 + self.d_eps
        neg_mask = y_hat0 > .5 - self.d_eps

        pos_cost_matrix = d.layers[-1].cost_matrix(y1, y_hat1)
        neg_cost_matrix = d.layers[-1].cost_matrix(y0, y_hat0)

        pos_cost = (pos_mask * pos_cost_matrix).mean()
        neg_cost = (neg_mask * neg_cost_matrix).mean()

        d_obj = 0.5 * (pos_cost + neg_cost)

        if self.no_drop_in_d_for_g:
            y_hat0_no_drop = d.dropout_fprop(S)
            g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop)
        else:
            g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0)
        assert g_cost_mat.ndim == 2
        assert y_hat0.ndim == 2

        mask = y_hat0 < 0.5 + self.g_eps
        masked_cost = g_cost_mat * mask
        g_obj = masked_cost.mean()

        if model.inferer is not None:
            # Change this if we ever switch to using dropout in the
            # construction of S.
            # Redundant as long as we have custom get_gradients
            S_nograd = block_gradient(S)
            pred = model.inferer.dropout_fprop(
                S_nograd, self.inference_default_input_include_prob,
                self.inference_input_include_probs,
                self.inference_default_input_scale,
                self.inference_input_scales)
            if self.infer_layer is None:
                target = z
            else:
                target = other_layers[self.infer_layer]
            i_obj = model.inferer.layers[-1].cost(target, pred)
        else:
            i_obj = 0

        return S, d_obj, g_obj, i_obj