Example #1
import numpy as np
import theano
from theano import gradient
from theano.tensor.shared_randomstreams import RandomStreams


def test_undefined_grad_opt():
    # Make sure that undefined grads get removed from the optimized graph.
    random = RandomStreams(np.random.randint(1, 2147462579))
    pvals = theano.shared(np.random.rand(10, 20).astype(theano.config.floatX))
    pvals = pvals / pvals.sum(axis=1, keepdims=True)
    pvals = gradient.zero_grad(pvals)
    samples = random.multinomial(pvals=pvals, n=1)
    samples = theano.tensor.cast(samples, pvals.dtype)
    samples = gradient.zero_grad(samples)
    cost = theano.tensor.sum(samples + pvals)
    grad = theano.tensor.grad(cost, samples)
    f = theano.function([], grad)
    theano.printing.debugprint(f)
    assert not any(isinstance(node.op, gradient.UndefinedGrad)
                   for node in f.maker.fgraph.apply_nodes)
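The assert above scans the optimized graph for a specific Op. As a complement, here is a minimal standalone check (imports and shapes are my own, not from the test suite) that the value passes through zero_grad unchanged while the gradient it propagates is exactly zero:

import numpy as np
import theano
import theano.tensor as T
from theano import gradient

x = T.vector("x")
y = gradient.zero_grad(x)
g = gradient.grad(y.sum(), x)          # zero_grad backpropagates zeros
f = theano.function([x], [y, g])
val, grad_val = f(np.ones(3, dtype=theano.config.floatX))
assert np.allclose(val, 1.0) and np.allclose(grad_val, 0.0)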
Example #3
    def test_op_removed(self):
        x = theano.tensor.matrix("x")
        y = x * gradient.zero_grad(x)
        f = theano.function([x], y)
        # check for gradient.zero_grad_ (the Op instance) here:
        # gradient.zero_grad itself is only a wrapper function!
        assert gradient.zero_grad_ not in [node.op for node in f.maker.fgraph.toposort()]
Example #4
    def test_op_removed(self):
        x = theano.tensor.matrix('x')
        y = x * gradient.zero_grad(x)
        f = theano.function([x], y)
        # check for gradient.zero_grad_ (the Op instance) here:
        # gradient.zero_grad itself is only a wrapper function!
        assert gradient.zero_grad_ not in \
            [node.op for node in f.maker.fgraph.toposort()]
Example #5
    def test_grad(self):
        T = theano.tensor
        a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

        x = T.matrix("x")

        expressions_gradients = [
            (x * gradient.zero_grad(x), x),
            (x * gradient.zero_grad(T.exp(x)), T.exp(x)),
            (gradient.zero_grad(x), T.constant(0.0)),
            (x**2 * gradient.zero_grad(x), 2 * x**2),
        ]

        for expr, expr_grad in expressions_gradients:
            g = gradient.grad(expr.sum(), x)
            # gradient according to theano
            f = theano.function([x], g, on_unused_input="ignore")
            # desired gradient
            f2 = theano.function([x], expr_grad, on_unused_input="ignore")

            assert np.allclose(f(a), f2(a))
Example #6
    def test_grad(self):
        T = theano.tensor
        a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

        x = T.matrix("x")

        expressions_gradients = [
            (x * gradient.zero_grad(x), x),
            (x * gradient.zero_grad(T.exp(x)), T.exp(x)),
            (gradient.zero_grad(x), T.constant(0.0)),
            (x ** 2 * gradient.zero_grad(x), 2 * x ** 2),
        ]

        for expr, expr_grad in expressions_gradients:
            g = gradient.grad(expr.sum(), x)
            # gradient according to theano
            f = theano.function([x], g, on_unused_input="ignore")
            # desired gradient
            f2 = theano.function([x], expr_grad, on_unused_input="ignore")

            assert np.allclose(f(a), f2(a))
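Why these identities hold: with c = zero_grad(u), c is treated as a constant during differentiation, so d/dx sum(x * c) equals c rather than following the product rule through u. A short standalone confirmation of the exp case (imports assumed, mirroring the tests above):

import numpy as np
import theano
import theano.tensor as T
from theano import gradient

x = T.matrix("x")
# d/dx sum(x * c) with c = exp(x) held constant is exp(x), not (1 + x)exp(x).
g = gradient.grad((x * gradient.zero_grad(T.exp(x))).sum(), x)
f = theano.function([x], g)
a = np.random.randn(3, 3).astype(theano.config.floatX)
assert np.allclose(f(a), np.exp(a))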
Example #7
    def target_decode_step(self, target_embedding, h1, h2, o, a_c1, a_c2,
                           mask):
        # a_c1: attention candidates fed into the RNN
        # a_c2: attention candidates used to compute the attention score

        # Calculate attention score
        s = T.dot(o, self.attention_s)
        n, d = s.shape
        s = s.reshape((1, n, d))
        attention_score = T.tanh(s + a_c2)
        l, n, d = attention_score.shape
        attention_score = attention_score.reshape((l * n, d))
        attention_score = T.dot(attention_score, self.attetion_v)
        attention_score = attention_score.reshape((l, n))
        max_clip = zero_grad(T.max(attention_score, axis=0))
        attention_score = T.exp(attention_score - max_clip.reshape((1, n)))
        attention_score = attention_score.reshape((l, n, 1)) * mask
        denorm = T.sum(attention_score, axis=0)
        attention_score = attention_score / denorm.reshape((1, n, 1))

        # Calculate attention content
        attention_content = T.sum(attention_score * a_c1, axis=0)

        # Decoding GRU layer 1
        h_in = T.concatenate([h1, attention_content, target_embedding], axis=1)
        gate = get_output(self.gru_de_gate_1, h_in)
        u1 = gate[:, :self.hid_size]
        r1 = gate[:, self.hid_size:]
        reset_h1 = h1 * r1

        c_in = T.concatenate([reset_h1, attention_content, target_embedding],
                             axis=1)
        c1 = get_output(self.gru_de_candidate_1, c_in)
        h1 = (1.0 - u1) * h1 + u1 * c1

        # Decoding GRU layer 2
        h_in = T.concatenate([h1, h2, attention_content, target_embedding],
                             axis=1)
        gate = get_output(self.gru_de_gate_2, h_in)
        u2 = gate[:, :self.hid_size]
        r2 = gate[:, self.hid_size:]
        reset_h2 = h2 * r2

        c_in = T.concatenate(
            [h1, reset_h2, attention_content, target_embedding], axis=1)
        c2 = get_output(self.gru_de_candidate_2, c_in)
        h2 = (1.0 - u2) * h2 + u2 * c2

        o = T.concatenate([h1, h2], axis=-1)
        o = get_output(self.decode_out_mlp, o)

        return h1, h2, o, attention_content
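The max_clip line above is the standard max-shift trick: subtracting a zero_grad-wrapped maximum keeps exp() from overflowing without adding a term to the backward pass. A self-contained sketch of the pattern (names are mine, not the source's):

import numpy as np
import theano
import theano.tensor as T
from theano.gradient import zero_grad

scores = T.matrix("scores")                       # (n, l) attention logits
m = zero_grad(T.max(scores, axis=-1, keepdims=True))
p = T.exp(scores - m)
p = p / T.sum(p, axis=-1, keepdims=True)          # numerically stable softmax
softmax = theano.function([scores], p)
s = np.asarray([[1e3, 1e3 + 1.0]], dtype=theano.config.floatX)
assert not np.isnan(softmax(s)).any()             # no overflow even at 1e3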
Example #8
    def test_rop(self):
        T = theano.tensor

        x = T.vector()
        v = T.vector()
        y = gradient.zero_grad(x)

        rop = T.Rop(y, x, v)
        f = theano.function([x, v], rop, on_unused_input="ignore")

        a = np.asarray(self.rng.randn(5), dtype=config.floatX)
        u = np.asarray(self.rng.randn(5), dtype=config.floatX)

        assert np.count_nonzero(f(a, u)) == 0
Example #9
    def test_rop(self):
        T = theano.tensor

        x = T.vector()
        v = T.vector()
        y = gradient.zero_grad(x)

        rop = T.Rop(y, x, v)
        f = theano.function([x, v], rop, on_unused_input='ignore')

        a = np.asarray(self.rng.randn(5),
                       dtype=config.floatX)
        u = np.asarray(self.rng.randn(5),
                       dtype=config.floatX)

        assert np.count_nonzero(f(a, u)) == 0
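Both R-op tests check that zero_grad also blocks forward-mode (directional) derivatives. For contrast, a sketch of zero_grad next to disconnected_grad (imports assumed): zero_grad keeps the variable formally connected with an identically zero gradient, while disconnected_grad severs the graph, so theano.grad raises on it unless disconnected_inputs is relaxed.

import numpy as np
import theano
import theano.tensor as T
from theano import gradient

x = T.vector("x")
g_zero = gradient.grad(gradient.zero_grad(x).sum(), x)          # zeros
g_disc = gradient.grad(gradient.disconnected_grad(x).sum(), x,
                       disconnected_inputs="ignore")            # also zeros
f = theano.function([x], [g_zero, g_disc])
print(f(np.ones(3, dtype=theano.config.floatX)))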
Example #10
    def decoding_step(self, embedding, h1, h2, o, s_embedding, a_c1, a_c2,
                      mask):
        s = T.dot(o, self.attention_s)
        n, d = s.shape
        s = s.reshape((n, 1, d))
        attention_score = T.tanh(s + a_c2)
        n, l, d = attention_score.shape
        attention_score = attention_score.reshape((l * n, d))
        attention_score = T.dot(attention_score, self.attetion_v)
        attention_score = attention_score.reshape((n, l))
        max_clip = zero_grad(T.max(attention_score, axis=-1))
        attention_score = T.exp(attention_score - max_clip.reshape((n, 1)))
        attention_score = attention_score * mask
        denorm = T.sum(attention_score, axis=-1)
        attention_score = attention_score / denorm.reshape((n, 1))
        attention_content = T.sum(attention_score.reshape((n, l, 1)) * a_c1,
                                  axis=1)

        # Decoding GRU layer 1
        input_info = T.concatenate([embedding, h1, attention_content], axis=-1)
        gate1 = get_output(self.gru_de_gate_1, input_info)
        u1 = gate1[:, :self.hid_size]
        r1 = gate1[:, self.hid_size:]
        reset_h1 = h1 * r1
        c_in = T.concatenate([embedding, reset_h1, attention_content], axis=-1)
        c1 = get_output(self.gru_de_candidate_1, c_in)
        h1 = (1.0 - u1) * h1 + u1 * c1

        # Decoding GRU layer 2
        input_info = T.concatenate([embedding, h1, h2, attention_content],
                                   axis=-1)
        gate2 = get_output(self.gru_de_gate_2, input_info)
        u2 = gate2[:, :self.hid_size]
        r2 = gate2[:, self.hid_size:]
        reset_h2 = h2 * r2
        c_in = T.concatenate([embedding, h1, reset_h2, attention_content],
                             axis=-1)
        c2 = get_output(self.gru_de_candidate_2, c_in)
        h2 = (1.0 - u2) * h2 + u2 * c2

        o = get_output(self.decode_out_mlp, T.concatenate([h1, h2], axis=-1))

        score_in = T.concatenate([embedding, o, attention_content], axis=-1)
        s = get_output(self.score, score_in)
        sample_score = T.dot(s, s_embedding)

        return h1, h2, o, s, sample_score, attention_score
Example #11
    def symbolic_elbo(self, source, target):
        """
        Return a symbolic variable representing the ELBO for the given
        (source, target) minibatch, along with the read addresses.
        """
        n = source.shape[0]
        l = source.shape[1]
        # Get input embedding
        source_embedding = get_output(self.input_embedding, source)
        source_embedding = get_output(
            self.encoder, source_embedding.reshape(
                (n * l, self.embedding_dim)))
        source_embedding = source_embedding.reshape((n, l, self.hid_size))
        # Create input mask
        encode_mask = T.cast(T.gt(source, -1), "float32")

        # Create decoding mask
        d_m = T.cast(T.gt(target, -1), "float32")
        decode_mask = d_m[:, 1:]
        # Init decoding states
        h_init = T.zeros((n, self.hid_size))
        source_embedding = source_embedding * encode_mask.reshape((n, l, 1))

        read_attention_weight = self.attention_weight
        read_attention_bias = self.attention_bias
        read_attention_bias = read_attention_bias.reshape((1, 2))
        sample_embed = self.target_output_embedding.W
        decode_in_embedding = get_output(self.target_input_embedding, target)
        decode_in_embedding = decode_in_embedding[:, :-1]
        decode_in_embedding = decode_in_embedding.dimshuffle((1, 0, 2))

        # create the time step
        f = 0.3
        max_t = T.ceil(l * f)
        time_steps = T.cast(T.arange(max_t), "float32")

        key_init = T.tile(self.key_init.reshape((1, self.key_dim)), (n, 1))
        read_pos = T.arange(l, dtype="float32") + 1.0
        read_pos = read_pos.reshape(
            (1, l)) / (T.sum(encode_mask, axis=-1).reshape((n, 1)) + 1.0)

        ([h1, h2, keys, c, addresses], update) = theano.scan(
            self.encoding_step,
            outputs_info=[h_init, h_init, key_init, None, None],
            non_sequences=[
                source_embedding, read_pos, read_attention_weight,
                read_attention_bias, encode_mask
            ],
            sequences=[time_steps])

        # Create Attention mask
        t = addresses.shape[0]
        true_times = T.ceil(T.sum(encode_mask, axis=-1) * f)
        true_times = T.cast(true_times.reshape((n, 1)), "float32")
        attention_mask = T.cast(
            T.le(time_steps.reshape((1, time_steps.shape[0])), true_times),
            "float32")

        # Decoding RNN
        l, n, d = c.shape
        attention_c1 = c.reshape((n * l, d))
        attention_c2 = T.dot(attention_c1, self.attention_h_2)
        attention_c1 = attention_c1.reshape((l, n, self.hid_size))
        attention_c1 = attention_c1.dimshuffle((1, 0, 2))
        attention_c2 = attention_c2.reshape((l, n, self.output_score_dim))
        attention_c2 = attention_c2.dimshuffle((1, 0, 2))

        decode_init = get_output(self.decode_init_mlp,
                                 T.concatenate([h1[-1], h2[-1]], axis=-1))
        o_init = get_output(self.decode_out_mlp, decode_init)

        ([h1, h2, o, s, sample_score,
          att_score], update) = theano.scan(self.decoding_step,
                                            outputs_info=[
                                                decode_init[:, :self.hid_size],
                                                decode_init[:, self.hid_size:],
                                                o_init, None, None, None
                                            ],
                                            sequences=[decode_in_embedding],
                                            non_sequences=[
                                                sample_embed.T, attention_c1,
                                                attention_c2, attention_mask
                                            ])

        # Get sample embedding
        l = sample_score.shape[0]
        n = sample_score.shape[1]
        max_clip = T.max(sample_score, axis=-1)
        score_clip = zero_grad(max_clip)
        sample_score = T.exp(sample_score - score_clip.reshape((l, n, 1)))
        sample_score = T.sum(sample_score, axis=-1)

        # Get true embedding
        true_embed = get_output(self.target_output_embedding, target[:, 1:])
        true_embed = true_embed.dimshuffle((1, 0, 2))
        true_embed = true_embed.reshape((n * l, self.output_score_dim))
        d = s.shape[-1]
        s = s.reshape((n * l, d))
        score = T.exp(
            T.sum(s * true_embed, axis=-1).reshape((l, n)) - score_clip)
        score = score.reshape((l, n))
        prob = score / sample_score
        prob = prob.dimshuffle((1, 0))
        # Loss per sentence
        loss = decode_mask * T.log(prob + 1e-5)
        loss = -T.mean(T.sum(loss, axis=1))
        return loss, addresses
Example #12
import numpy as np
from theano.gradient import zero_grad  # imports assumed by this snippet


def softsign(h, epsilon=1e-3):
    # Scale-adaptive softsign: the data-dependent scale h_epsilon is
    # excluded from the backward pass via zero_grad.
    _mu = abs(h).mean()
    h_epsilon = np.float32(epsilon) * _mu
    act = h / (abs(h) + zero_grad(h_epsilon))
    return act
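A minimal usage sketch (imports assumed): because h_epsilon is wrapped in zero_grad, the gradient is that of h / (|h| + const) with the constant fixed at its current value.

import numpy as np
import theano
import theano.tensor as T

h = T.matrix("h")
f = theano.function([h], softsign(h))
print(f(np.ones((2, 2), dtype=theano.config.floatX)))  # values just below 1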
Example #13
import theano.tensor as T
from theano.gradient import zero_grad  # imports assumed by this snippet


def logsumexp(X):
    """Numerically stable log-sum-exp; expects an N x D tensor."""
    m = zero_grad(T.max(X, axis=1, keepdims=True))
    return m + T.log(T.sum(T.exp(X - m), axis=1, keepdims=True))
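Usage sketch, assuming the definition above is in scope: the zero_grad-wrapped max prevents overflow in exp() while leaving the gradient of the log-sum-exp (the softmax) unchanged.

import numpy as np
import theano
import theano.tensor as T

X = T.matrix("X")
f = theano.function([X], logsumexp(X))
big = np.full((2, 3), 1000.0, dtype=theano.config.floatX)
print(f(big))  # finite (~1001.1), where a naive log(sum(exp(X))) overflows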
Example #14
    def symbolic_elbo(self, source, target):
        """
        Return a symbolic variable, representing the ELBO, for the given minibatch.
        :param num_samples: The number of samples to use to evaluate the ELBO.
        :return elbo: The symbolic variable representing the ELBO.
        """
        n = target.shape[0]
        # Encoding mask
        encode_mask = T.cast(T.gt(source, -1), "float32")
        source_input_embedding = get_output(self.input_embedding, source)
        n, l = encode_mask.shape
        encode_mask = encode_mask.reshape((n, l, 1))
        encode_mask = encode_mask.dimshuffle((1, 0, 2))
        source_input_embedding = source_input_embedding.dimshuffle((1, 0, 2))

        # Encoding RNN
        h_init = T.zeros((n, self.hid_size))
        ([h_e_1, h_e_2, e_o],
         update) = theano.scan(self.source_encode_step,
                               outputs_info=[h_init, h_init, None],
                               sequences=[source_input_embedding, encode_mask])

        decode_mask = T.cast(T.gt(target, -1), "float32")[:, 1:]

        # Decoding RNN
        attention_candidate = e_o

        l, n, d = attention_candidate.shape
        attention_c1 = attention_candidate.reshape((n * l, d))
        attention_c2 = T.dot(attention_c1, self.attention_h_2)
        attention_c1 = attention_c1.reshape((l, n, self.output_score_dim))
        attention_c2 = attention_c2.reshape((l, n, self.output_score_dim))
        target_input = target[:, :-1]
        n, l = target_input.shape
        target_input = target_input.reshape((n * l, ))
        target_input_embedding = get_output(self.target_input_embedding,
                                            target_input)
        target_input_embedding = target_input_embedding.reshape(
            (n, l, self.embedding_dim))
        target_input_embedding = target_input_embedding.dimshuffle((1, 0, 2))
        decode_init = get_output(
            self.decode_init_mlp, T.concatenate([h_e_1[-1], h_e_2[-1]],
                                                axis=-1))
        o_init = get_output(self.decode_out_mlp, decode_init)
        ([h_d_1, h_d_2, d_o, attention_content], update) = theano.scan(
            self.target_decode_step,
            outputs_info=[
                decode_init[:, :self.hid_size], decode_init[:, self.hid_size:],
                o_init, None
            ],
            sequences=[target_input_embedding],
            non_sequences=[attention_c1, attention_c2, encode_mask])

        score_eva_in = T.concatenate(
            [d_o, attention_content, target_input_embedding], axis=-1)
        ([h, score],
         update) = theano.scan(self.score_eval_step,
                               sequences=[score_eva_in],
                               non_sequences=[self.target_output_embedding.W],
                               outputs_info=[None, None])
        h = h.dimshuffle((1, 0, 2))
        score = score.dimshuffle((1, 0, 2))
        max_clip = T.max(score, axis=-1)
        max_clip = zero_grad(max_clip)
        score = T.exp(score - max_clip.reshape((n, l, 1)))
        denominator = T.sum(score, axis=-1)

        # Get true embedding
        target_out = target[:, 1:]
        n, l = target_out.shape
        target_out = target_out.reshape((n * l, ))
        true_embed = get_output(self.target_output_embedding, target_out)
        true_embed = true_embed.reshape((n * l, self.output_score_dim))
        h = h.reshape((n * l, self.output_score_dim))
        true_score = T.exp(
            T.sum(h * true_embed, axis=-1) - max_clip.reshape((l * n, )))
        true_score = true_score.reshape((n, l))
        prob = true_score / denominator
        # Loss per sentence
        loss = decode_mask * T.log(prob + 1e-5)
        loss = -T.mean(T.sum(loss, axis=1))

        return loss
Example #15
import numpy as np
import theano
import theano.tensor as T
from theano import gradient
from theano import scan_module
from theano.ifelse import ifelse
# `jacobian` (the per-class Jacobian helper used below) is assumed to be
# defined elsewhere in the source module.


def deepfool(model,
             inputs,
             labels=None,
             num_classes=None,
             norm='l2',
             max_iter=25,
             clip_dist=None,
             over_shoot=0.02):
    """Theano implementation of DeepFool (https://arxiv.org/abs/1511.04599)."""
    assert norm in ['l1', 'l2']

    if num_classes is None:
        raise RuntimeError("Number of classes needs to be provided for Theano")
    if labels is None:
        labels = gradient.zero_grad(T.argmax(model(inputs), axis=1))

    batch_size = inputs.shape[0]
    batch_indices = T.arange(batch_size)

    def find_perturb(perturbation):
        logits_os = model(inputs + (1 + over_shoot) * perturbation)
        y_pred = T.argmax(logits_os, axis=1)
        is_mistake = T.neq(y_pred, labels)
        current_ind = batch_indices[(1 - is_mistake).nonzero()]
        should_stop = T.all(is_mistake)

        # continue generating perturbation only for correctly classified
        inputs_subset = inputs[current_ind]
        perturbation_subset = perturbation[current_ind]
        labels_subset = labels[current_ind]
        batch_subset = T.arange(inputs_subset.shape[0])

        x_adv = inputs_subset + perturbation_subset
        logits = model(x_adv)
        corrects = logits[batch_subset, labels_subset]
        jac = jacobian(logits, x_adv, num_classes)

        # deepfool
        f = logits - T.shape_padright(corrects)
        w = jac - T.shape_padaxis(jac[batch_subset, labels_subset], axis=1)
        reduce_ind = range(2, inputs.ndim + 1)
        if norm == 'l2':
            dist = T.abs_(f) / w.norm(2, axis=reduce_ind)
        else:
            dist = T.abs_(f) / T.sum(T.abs_(w), axis=reduce_ind)
        # remove correct targets
        dist = T.set_subtensor(dist[batch_subset, labels_subset],
                               T.constant(np.inf))
        l = T.argmin(dist, axis=1)
        dist_l = dist[batch_subset, l].dimshuffle(0, 'x', 'x', 'x')
        # avoid numerical instability and clip max value
        if clip_dist is not None:
            dist_l = T.clip(dist_l, 0, clip_dist)
        w_l = w[batch_subset, l]
        if norm == 'l2':
            reduce_ind = range(1, inputs.ndim)
            perturbation_upd = dist_l * w_l / w_l.norm(
                2, reduce_ind, keepdims=True)
        else:
            perturbation_upd = dist_l * T.sgn(w_l)
        perturbation = ifelse(
            should_stop, perturbation,
            T.inc_subtensor(perturbation[current_ind], perturbation_upd))
        return perturbation, scan_module.until(should_stop)

    initial_perturbation = T.zeros_like(inputs)
    results, _ = theano.scan(find_perturb,
                             outputs_info=[initial_perturbation],
                             n_steps=max_iter)
    perturbation = results[-1]
    return gradient.disconnected_grad(inputs + (1 + over_shoot) * perturbation)
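deepfool only wires up a symbolic graph, so it compiles like any other Theano expression. A hypothetical usage sketch (`model`, the tensor name, and the class count below are assumptions):

import theano
import theano.tensor as T

images = T.tensor4("images")                   # (batch, channel, row, col)
adv = deepfool(model, images, num_classes=10)  # model: inputs -> logits
make_adversarial = theano.function([images], adv)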
Example #16
def __step(img, prev_bbox, prev_att, state, prev_conf, prev_sugg, prev_W, prev_b, prev_pos, prev_neg, timestep):
	cx = (prev_bbox[:, 2] + prev_bbox[:, 0]) / 2.
	cy = (prev_bbox[:, 3] + prev_bbox[:, 1]) / 2.
	sigma = TT.exp(prev_att[:, 0]) * (max(img_col, img_row) / 2)
	fract = TT.exp(prev_att[:, 1])
	amplifier = TT.exp(prev_att[:, 2])

	eps = 1e-8

	abs_cx = (cx + 1) / 2. * (img_col - 1)
	abs_cy = (cy + 1) / 2. * (img_row - 1)
	abs_stride = (fract * (max(img_col, img_row) - 1)) * ((1. / (NUM_N - 1.)) if NUM_N > 1 else 0)

	FX, FY = __filterbank(abs_cx, abs_cy, abs_stride, sigma)
	unnormalized_mask = (FX.dimshuffle(0, 'x', 1, 'x', 2) * FY.dimshuffle(0, 1, 'x', 2, 'x')).sum(axis=2).sum(axis=1)
	mask = unnormalized_mask  # / (unnormalized_mask.sum(axis=2).sum(axis=1) + eps).dimshuffle(0, 'x', 'x')
	masked_img = img

	conv1 = conv2d(masked_img, conv1_filters, subsample=(conv1_stride, conv1_stride))
	act1 = TT.tanh(conv1)
	flat1 = TT.reshape(act1, (-1, conv1_output_dim))
	gru_in = TT.concatenate([flat1, prev_bbox, prev_conf.reshape((batch_size, 1)), prev_sugg], axis=1)
	gru_z = NN.sigmoid(TT.dot(gru_in, Wz) + TT.dot(state, Uz) + bz)
	gru_r = NN.sigmoid(TT.dot(gru_in, Wr) + TT.dot(state, Ur) + br)
	gru_h_ = TT.tanh(TT.dot(gru_in, Wg) + TT.dot(gru_r * state, Ug) + bg)
	gru_h = (1 - gru_z) * state + gru_z * gru_h_
	bbox = TT.tanh(TT.dot(gru_h, W_fc2) + b_fc2)
	att = TT.dot(gru_h, W_fc3) + b_fc3

	def batch_dot(a, b):
		return (a.dimshuffle(0, 1, 2, 'x') * b.dimshuffle(0, 'x', 1, 2)).sum(axis=2)

	def bounding(bbox):
		return TT.stack([TT.maximum(bbox[:, 0], -1), TT.minimum(bbox[:, 1], 1), TT.maximum(bbox[:, 2], -1), TT.minimum(bbox[:, 3], 1)], axis=1)

	def sample_positives(bbox):
		x0 = bbox[:, 0]
		y0 = bbox[:, 1]
		x1 = bbox[:, 2]
		y1 = bbox[:, 3]
		return TT.stack([bounding(TT.as_tensor([x0, y0, x1, y1]).T),
				 bounding(TT.as_tensor([x0 * 0.75 + x1 * 0.25, y0, x1, y1]).T),
				 bounding(TT.as_tensor([x0, y0 * 0.75 + y1 * 0.25, x1, y1]).T),
				 bounding(TT.as_tensor([x0, y0, x1 * 0.75 + x0 * 0.25, y1]).T),
				 bounding(TT.as_tensor([x0, y0, x1, y1 * 0.75 + y0 * 0.25]).T),
				 bounding(TT.as_tensor([x0 * 1.25 - x1 * 0.25, y0, x1, y1]).T),
				 bounding(TT.as_tensor([x0, y0 * 1.25 - y1 * 0.25, x1, y1]).T),
				 bounding(TT.as_tensor([x0, y0, x1 * 1.25 - x0 * 0.25, y1]).T),
				 bounding(TT.as_tensor([x0, y0, x1, y1 * 1.25 - y0 * 0.25]).T),
				], axis=1)

	def sample_negatives(bbox):
		x0 = bbox[:, 0]
		y0 = bbox[:, 1]
		x1 = bbox[:, 2]
		y1 = bbox[:, 3]
		return TT.stack([bounding(TT.as_tensor([x0 * 0.5 + x1 * 0.5, y0, x1, y1]).T),
				 bounding(TT.as_tensor([x0, y0 * 0.5 + y1 * 0.5, x1, y1]).T),
				 bounding(TT.as_tensor([x0, y0, x1 * 0.5 + x0 * 0.5, y1]).T),
				 bounding(TT.as_tensor([x0, y0, x1, y1 * 0.5 + y0 * 0.5]).T),
				 bounding(TT.as_tensor([x0 * 1.5 - x1 * 0.5, y0, x1 * 0.5 + x0 * 0.5, y1]).T),
				 bounding(TT.as_tensor([x0, y0 * 1.5 - y1 * 0.5, x1, y1 * 0.5 + y0 * 0.5]).T),
				 bounding(TT.as_tensor([x0 * 0.5 + x1 * 0.5, y0, x1 * 1.5 - x0 * 0.5, y1]).T),
				 bounding(TT.as_tensor([x0, y0 * 0.5 + y1 * 0.5, x1, y1 * 1.5 - y0 * 0.5]).T),
				], axis=1)

	def sample_around(bbox):
		return TT.concatenate([sample_positives(bbox), sample_negatives(bbox)], axis=1)

	crop = batch_multicrop(bbox.dimshuffle(0, 'x', 1), img)
	feat = conv2d(crop.reshape((batch_size, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, 1, -1))
	conf = NN.sigmoid(batch_dot(feat, prev_W) + TT.addbroadcast(prev_b, 1))

	nr_samples = 17
	sugg_bbox = sample_around(bbox)		# (batch_size, nr_samples, 4)
	sugg_crop = batch_multicrop(sugg_bbox, img)
	sugg_feat = conv2d(sugg_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, nr_samples, -1))
	sugg_conf = batch_dot(sugg_feat, prev_W) + TT.addbroadcast(prev_b, 1)
	print(sugg_conf.dtype)
	sugg_pos = TT.cast(sugg_conf > 0, T.config.floatX)
	print(sugg_pos.dtype)
	sugg = TG.disconnected_grad((sugg_bbox * TT.patternbroadcast(sugg_pos, [False, False, True])).sum(axis=1) / TT.patternbroadcast(sugg_pos.sum(axis=1), [False, True]))

	def classify(x, W, b):
		# x: (batch_size, samples_per_batch, feature_per_sample)
		return NN.sigmoid(batch_dot(x, W) + TT.addbroadcast(b, 1))

	def update_step(W, b, x, y, alpha=1):
		y_hat = classify(x, W, b)
		loss = ((y_hat - y) ** 2).mean()
		g = T.grad(loss, [W, b])
		return (W - alpha * g[0], b - alpha * g[1], loss), T.scan_module.until(loss < 0.01)

	nr_samples = 9
	pos_bbox = sample_positives(bbox)
	pos_crop = batch_multicrop(pos_bbox, img)
	pos_feat = conv2d(pos_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, nr_samples, -1))
	pos = TG.disconnected_grad(TT.set_subtensor(prev_pos[:, (nr_samples*timestep):(nr_samples*(timestep+1))], pos_feat))
	nr_samples = 8
	neg_bbox = sample_negatives(bbox)
	neg_crop = batch_multicrop(neg_bbox, img)
	neg_feat = conv2d(neg_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, nr_samples, -1))
	neg = TG.disconnected_grad(TT.set_subtensor(prev_neg[:, (nr_samples*timestep):(nr_samples*(timestep+1))], neg_feat))
	update_scan, _ = T.scan(fn=update_step,
				outputs_info=[prev_W, prev_b, None],
                                non_sequences=[TT.concatenate([pos[:, :9*timestep], neg[:, :8*timestep]], axis=1), TT.concatenate([TT.ones((batch_size, 9*timestep, 1)), -TT.ones((batch_size, 8*timestep, 1))], axis=1)], n_steps=1000)
	new_W, new_b = TG.disconnected_grad(update_scan[0][-1]), TG.zero_grad(update_scan[1][-1])

	return bbox, att, gru_h, TT.unbroadcast(conf, 1), sugg, new_W, TT.unbroadcast(new_b, 1), pos, neg, timestep + 1
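The update_scan above runs an inner SGD loop inside the graph and stops early via scan's until condition, with disconnected_grad/zero_grad keeping the learned W and b out of the outer backward pass. A self-contained toy sketch of that inner-loop pattern (all names are mine):

import numpy as np
import theano
import theano.tensor as T

w0 = T.vector("w0")
target = T.vector("target")

def sgd_step(w, target, alpha=0.1):
    # One gradient step on a toy quadratic, with symbolic early stopping.
    loss = ((w - target) ** 2).mean()
    g = T.grad(loss, w)
    return [w - alpha * g, loss], theano.scan_module.until(loss < 1e-4)

([ws, losses], _) = theano.scan(sgd_step, outputs_info=[w0, None],
                                non_sequences=[target], n_steps=100)
fit = theano.function([w0, target], ws[-1])
print(fit(np.zeros(2, dtype=theano.config.floatX),
          np.ones(2, dtype=theano.config.floatX)))  # converges toward [1, 1]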