def __init__(self, softmax=softmax):

        self.inpv = T.matrix('inpv')
        self.outv = T.imatrix('outv')  # indices
        self.ep = T.matrix('ep')
        self.w = T.scalar('w')

        self.n = self.inpv.shape[0]

        self.enc_m = get_encoder()
        self.enc_s = get_encoder()
        self.dec = get_decoder()

        self.mu = get_output(self.enc_m, self.inpv)
        self.log_s = get_output(self.enc_s, self.inpv)
        self.log_v = 2 * self.log_s
        self.sigma = T.exp(self.log_s)
        self.var = T.exp(self.log_s * 2)
        self.z = self.mu + self.sigma * self.ep
        self.rec_linear = get_output(self.dec, self.z)
        self.rec_reshaped_ln = self.rec_linear.reshape((self.n * d2, 256))
        self.rec_reshaped = softmax(self.rec_reshaped_ln)

        self.out_onehot = T.extra_ops.to_one_hot(
            self.outv.reshape((self.n * d2, )), 256)

        # reconstruction loss: categorical cross-entropy against the 256-way one-hot targets
        self.rec_losses_reshaped = cc(self.rec_reshaped, self.out_onehot)
        self.rec_losses = self.rec_losses_reshaped.reshape((self.n, d2)).sum(1)
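        # analytic KL( q(z|x) || N(0, I) ) per latent dimension:
        #   0.5 * (mu**2 + var - log(var) - 1), matching the two terms below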
        self.klss = - 0.5 * (1+self.log_v) + \
                      0.5 * (self.mu**2 + self.var)
        self.kls = self.klss.sum(1)
        self.rec_loss = self.rec_losses.mean()
        self.kl = self.kls.mean()
        self.loss = self.rec_loss + self.kl * self.w

        self.params = get_all_params(self.enc_m) + \
                      get_all_params(self.enc_s) + \
                      get_all_params(self.dec)
        self.updates = lasagne.updates.adam(self.loss, self.params, lr)

        print '\tgetting train func'
        self.train_func = theano.function(
            [self.inpv, self.outv, self.ep, self.w],
            [self.loss.mean(),
             self.rec_loss.mean(),
             self.kl.mean()],
            updates=self.updates)

        print '\tgetting other useful funcs'
        self.recon = theano.function([self.inpv, self.ep],
                                     self.rec_reshaped.argmax(1).reshape(
                                         (self.n, d2)))
        self.recon_ = theano.function([self.inpv, self.ep],
                                      self.rec_reshaped.reshape(
                                          (self.n, d2, 256)))
        self.project = theano.function([self.inpv, self.ep], self.z)
        self.get_mu = theano.function([self.inpv], self.mu)
        self.get_var = theano.function([self.inpv], self.var)
        self.get_klss = theano.function([self.inpv], self.klss)
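
A minimal usage sketch for the functions compiled above (not from the original source; the class
name SeqVAE, the arrays x_batch / y_batch, latent_dim, and the epoch counter are assumed names).
ep is the reparameterisation noise and w is the KL weight, often annealed from 0 to 1:

import numpy as np

vae = SeqVAE()                                             # assumed class name
eps = np.random.randn(x_batch.shape[0], latent_dim).astype('float32')
w = min(1.0, epoch / 10.0)                                 # simple warm-up schedule (assumed)
loss, rec_loss, kl = vae.train_func(x_batch, y_batch, eps, w)
recon = vae.recon(x_batch, eps)                            # (batch, d2) argmax reconstructions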
Example #2
        def step(hid_previous):
            tiled_hid_prev = T.tile(
                T.reshape(hid_previous, (-1, 1, 1, self.hid_state_size)),
                (1, C.shape[1], 1, 1))

            g = Ep_Gate(C_reshaped, tiled_hid_prev, tiled_q, self.Wb, self.W1,
                        self.W2, self.b1, self.b2)

            g = T.reshape(g, (-1, C.shape[1]))
            g = T.switch(T.eq(input_sentence_mask, 1), g, np.float32(-np.inf))
            g = nonlin.softmax(g)
            e = T.sum(T.reshape(g, (g.shape[0], g.shape[1], 1)) * C, axis=1)

            input_n = e

            hid_input = T.dot(hid_previous, W_hid_stacked)
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            hid_update_in = slice_w(input_n, 2)
            hid_update_hid = slice_w(hid_input, 2)
            hid_update = hid_update_in + resetgate * hid_update_hid

            hid_update = self.nonlinearity_hid(hid_update)

            hid = (1 - updategate) * hid_previous + updategate * hid_update

            return (hid, g)
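
In dynamic-memory-network style code a step function like this is typically driven by
theano.scan over a fixed number of episodes; a hedged sketch under that assumption
(hid_init and n_episodes are illustrative names, not from the original source):

(hids, gates), scan_updates = theano.scan(fn=step,
                                          outputs_info=[hid_init, None],
                                          n_steps=n_episodes)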
Example #3
        def step(hid_previous):
            tiled_hid_prev = T.tile(
                T.reshape(hid_previous, (-1, 1, 1, self.hid_state_size)),
                (1, C.shape[1], 1, 1))
            
            g = Ep_Gate(C_reshaped, tiled_hid_prev, tiled_q,
                        self.Wb, self.W1, self.W2, self.b1, self.b2)
            
            g = T.reshape(g, (-1, C.shape[1]))
            g = T.switch(T.eq(input_sentence_mask, 1), g, np.float32(-np.inf))
            g = nonlin.softmax(g)
            e = T.sum(T.reshape(g, (g.shape[0], g.shape[1], 1)) * C, axis=1)

            input_n = e
            
            hid_input = T.dot(hid_previous, W_hid_stacked)
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

            resetgate  = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate  = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            hid_update_in  = slice_w(input_n, 2)
            hid_update_hid = slice_w(hid_input, 2)
            hid_update     = hid_update_in + resetgate*hid_update_hid

            hid_update = self.nonlinearity_hid(hid_update)

            hid = (1 - updategate) * hid_previous + updategate * hid_update

            return (hid, g)
Example #4
    def get_output_for(self, inputs, **kwargs):
        s_hat_t = inputs[0]
        h_hat_t = inputs[1]
        # s_hat_t = s_hat_t.dimshuffle(1, 0)
        # h_hat_t = h_hat_t.dimshuffle(1, 0)
        H = inputs[2]
        # H = H.dimshuffle(2, 0, 1)
        # H_len = H.shape[-1]
        # z_t 1*none*k
        zt = T.dot(
            self.nonlinearity(
                T.dot(H, self.W_v_to_attenGate) + T.dot(
                    T.dot(h_hat_t, self.W_g_to_attenGate).dimshuffle(
                        0, 1, 'x'), T.ones((1, self.num_inputs)))),
            self.W_h_to_attenGate)[:, :, 0]
        vt = T.dot(
            self.nonlinearity(
                T.dot(s_hat_t, self.W_s_to_attenGate) +
                T.dot(h_hat_t, self.W_g_to_attenGate)), self.W_h_to_attenGate)

        alpha_hat_t = self.nonlinearity_atten(T.concatenate([zt, vt], axis=-1))
        feature = T.concatenate([H, s_hat_t.dimshuffle(0, 'x', 1)],
                                axis=1).dimshuffle(2, 0, 1)
        c_hat_t = T.sum(alpha_hat_t * feature, axis=-1)
        out = T.dot((c_hat_t.T + h_hat_t), self.W_p)

        return nonlinearities.softmax(out)
Example #5
    def _create_iter_funcs(self):
        X = T.imatrix('X')
        Y = T.imatrix('Y')
        sx0, sx1 = X.shape  # input shape
        sy0, sy1 = Y.shape  # output shape
        nt = T.iscalar('num tokens')
        inputs = [X, Y, nt]

        output_layer = self.layers_.values()[-1]

        Y_flat = T.reshape(Y, (sy0 * sy1, 1)).flatten()

        # bs x time x num_tokens
        output_train = get_output(output_layer, X, deterministic=False)
        # bs * time x num_tokens
        output_train_flat = T.reshape(
            output_train[:, :sy1, :], (sx0 * sy1, nt))
        output_train_01 = softmax(output_train_flat)
        probs_train = output_train_01[T.arange(sx0 * sy1), Y_flat]
        loss_train = -T.mean(T.log(probs_train))

        # bs x time x num_tokens
        output_valid = get_output(output_layer, X, deterministic=True)
        # bs * time x num_tokens
        output_valid_flat = T.reshape(
            output_valid[:, :sy1, :], (sx0 * sy1, nt))
        output_valid_01 = softmax(output_valid_flat)
        probs_valid = output_valid_01[T.arange(sx0 * sy1), Y_flat]
        loss_valid = T.mean(-T.log(probs_valid))

        pred_reshape = T.reshape(output_valid, (sx0 * sx1, nt))
        pred_softmax = softmax(pred_reshape)
        pred_valid = T.reshape(pred_softmax, (sx0, sx1, nt))

        all_params = get_all_params(output_layer)
        updates = self.updater(loss_train, all_params)

        train_iter = theano.function(inputs, loss_train, updates=updates)
        valid_iter = theano.function(inputs, loss_valid)
        predict_iter = theano.function([X, nt], pred_valid)

        return train_iter, valid_iter, predict_iter
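
A hedged usage sketch for the three iteration functions returned above (x_batch, y_batch,
x_valid, y_valid and vocab_size are assumed names, not from the original source; vocab_size
matches the last dimension of the network output):

train_loss = train_iter(x_batch, y_batch, vocab_size)
valid_loss = valid_iter(x_valid, y_valid, vocab_size)
probs = predict_iter(x_valid, vocab_size)   # (batch, time, vocab_size) softmax probabilities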
Example #6
        def step(hid_previous, out_previous, *args):
            input_n = T.concatenate([out_previous, q], axis=1)

            hid_input = T.dot(hid_previous, W_hid_stacked)
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            hid_update_in = slice_w(input_n, 2)
            hid_update_hid = slice_w(hid_input, 2)
            hid_update = hid_update_in + resetgate * hid_update_hid

            hid_update = self.nonlinearity_hid(hid_update)

            hid = (1 - updategate) * hid_previous + updategate * hid_update
            out = nonlin.softmax(T.dot(hid, self.W))

            return (hid, out)
Example #7
        def step(hid_previous, out_previous, *args):
            input_n = T.concatenate([out_previous, q], axis=1)

            hid_input = T.dot(hid_previous, W_hid_stacked)
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

            resetgate  = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate  = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            hid_update_in  = slice_w(input_n, 2)
            hid_update_hid = slice_w(hid_input, 2)
            hid_update     = hid_update_in + resetgate*hid_update_hid

            hid_update = self.nonlinearity_hid(hid_update)

            hid = (1 - updategate) * hid_previous + updategate * hid_update
            out = nonlin.softmax(T.dot(hid, self.W))

            return (hid, out)
Example #8
    def get_output_for(self, inputs, **kwargs):

        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        (d1, d2, d3) = input.shape

        # out = T.tensordot(input, self.W, axes=[[2], [0]])
        # b_shuffled = self.b.dimshuffle('x', 'x', 0)
        # out += b_shuffled
        # out = tanh(out)
        # out *= mask.dimshuffle(0, 1, 'x')
        # out = T.batched_dot(out, out.dimshuffle(0, 2, 1))
        q = T.tensordot(input, self.W1, axes=[[2], [0]])
        b1_shuffled = self.b1.dimshuffle('x', 'x', 0)
        q += b1_shuffled
        q = tanh(q)

        #        k = T.tensordot(input, self.W2, axes=[[2], [0]])
        # b2_shuffled = self.b2.dimshuffle('x', 'x', 0)
        # k += b2_shuffled
        # k = tanh(k)

        q *= mask.dimshuffle(0, 1, 'x')
        #        k *= mask.dimshuffle(0, 1, 'x')
        out = T.batched_dot(q, q.dimshuffle(0, 2, 1))
        #out /= np.sqrt(self.nu)
        #out *= 0.1

        out *= (1 - T.eye(d2, d2))

        matrix = softmax(out.reshape((d1 * d2, d2))).reshape((d1, d2, d2))
        matrix *= mask.dimshuffle(0, 1, 'x')
        matrix *= mask.dimshuffle(0, 'x', 1)

        return matrix
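
The layer above returns a (batch, length, length) attention matrix, row-normalised by the
softmax and with the diagonal of the similarity scores masked out before normalisation. A
hedged sketch of one way such a matrix is typically consumed (attn and h are illustrative
names, not from the original source):

# attn: output of the layer above, shape (d1, d2, d2); h: the (d1, d2, d3) input features
context = T.batched_dot(attn, h)   # (d1, d2, d3): each position becomes a weighted mix of the others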
Example #9
    def get_output_for(self, inputs, **kwargs):
        input = inputs[0]
        original_shape = input.shape

        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # reshape input
        input = input.reshape(
            (input.shape[0] * input.shape[1], input.shape[2]))

        # apply mask
        if mask is not None:
            mask = mask.reshape((mask.shape[0] * mask.shape[1], 1))
            input *= mask

        # compute g(W* ... g(W* g(W*x+b) +b) ... +b) * v
        activation = input
        for W, b in zip(self.W, self.b):
            activation = T.dot(activation, W) + b.dimshuffle('x', 0)
            activation = self.nonlinearity(activation)
        activation = T.dot(activation, self.v)

        # apply softmax - acquiring attention weights for each letter in each tweet
        activation = activation.reshape((original_shape[0], original_shape[1]))
        attention_w = nonlinearities.softmax(activation)
        attention_w = attention_w.reshape(
            (original_shape[0] * original_shape[1], 1))

        # get weighted sum of each hidden state according to attention weights
        context = input * attention_w
        context = context.reshape(original_shape)
        context = T.sum(context, axis=1)

        return context
Example #10
def safe_softmax(x, eps=1e-6):
    """ Prevents that any of the outputs become exactly 1 or 0 """
    x = softmax(x)
    x = T.maximum(x, eps)
    x = T.minimum(x, 1 - eps)
    return x
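
A minimal sketch of why the clamping matters (the symbolic variables and the loss are
assumptions, not from the original source): the clipped probabilities keep T.log finite
inside a cross-entropy term.

import theano.tensor as T

logits = T.matrix('logits')
targets = T.matrix('onehot_targets')
probs = safe_softmax(logits)
nll = -T.mean(T.sum(targets * T.log(probs), axis=1))   # never hits log(0) thanks to the eps clamp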
Example #11
class_lab = T.batched_dot(
    T.reshape(output_before_softmax_lab,
              newshape=(args.batch_size, 2, num_classes)).dimshuffle(0, 2, 1),
    T.ones(shape=(args.batch_size, 2, 1))).dimshuffle(0, 1)
class_gen = T.batched_dot(
    T.reshape(output_before_softmax_gen,
              newshape=(args.batch_size, 2, num_classes)).dimshuffle(0, 2, 1),
    T.ones(shape=(args.batch_size, 2, 1))).dimshuffle(0, 1)
loss_gen_class = T.mean(
    categorical_crossentropy(predictions=softmax(class_gen),
                             targets=labels_gen))
loss_gen_source = T.mean(
    categorical_crossentropy(predictions=softmax(source_gen),
                             targets=T.zeros(shape=(args.batch_size, ),
                                             dtype='int32')))
loss_lab_class = T.mean(
    categorical_crossentropy(predictions=softmax(class_lab), targets=labels))
loss_lab_source = T.mean(
    categorical_crossentropy(predictions=softmax(source_lab),
                             targets=T.zeros(shape=(args.batch_size, ),
                                             dtype='int32'))) + \
    T.mean(
        categorical_crossentropy(predictions=softmax(source_gen),
                                 targets=T.ones(shape=(args.batch_size, ),
                                                dtype='int32')))
weight_gen_loss = th.shared(np.float32(0.))
output_lab = ll.get_output(disc_layers[-2], x_lab, deterministic=False)
output_gen = ll.get_output(disc_layers[-2], gen_dat, deterministic=False)
m1 = T.mean(output_lab, axis=0)
m2 = T.mean(output_gen, axis=0)
feature_loss = T.mean(abs(m1 - m2))
Example #12
 def get_output(a):
     return nonlin.softmax(T.dot(a, self.W))
Example #13
 def get_output_for(self, input, **kwargs):
     activation = T.dot(input, self.C)
     if self.b is not None:
         activation = activation + self.b.dimshuffle('x', 0)
     return nonlinearities.softmax(activation)
Example #14
 def get_output(a):
     return nonlin.softmax(T.dot(a,self.W))
Example #15
    def build_model(self,
                    train_set_unlabeled,
                    train_set_labeled,
                    test_set,
                    validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound term and compile it into a training function.
        :param train_set_unlabeled: Unlabeled train set containing variables x, t.
        :param train_set_labeled: Labeled train set containing variables x, t.
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t.
        :return: train, test, validation function and dicts of arguments.
        """
        super(CSDGM, self).build_model(train_set_unlabeled, test_set,
                                       validation_set)

        sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        n = self.sh_train_x.shape[0].astype(
            theano.config.floatX)  # no. of data points
        n_l = sh_train_x_l.shape[0].astype(
            theano.config.floatX)  # no. of labeled data points

        # Define the layers for the density estimation used in the lower bound.
        l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                           self.l_qa_logvar)
        l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                           self.l_qz_logvar)
        l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

        l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
        l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                           self.l_pa_logvar)

        l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c))
        l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4))
        l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c))
        if self.x_dist == 'bernoulli':
            l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
            l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1))
            l_log_px = MeanLayer(l_log_px, axis=1)
        elif self.x_dist == 'gaussian':
            l_px_mu = ReshapeLayer(
                DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)),
                (-1, self.sym_samples, 1, self.n_l * self.n_c))
            l_px_logvar = ReshapeLayer(
                DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)),
                (-1, self.sym_samples, 1, self.n_l * self.n_c))
            l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)

        def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
            lb = log_px + log_py + (log_pz + log_pa - log_qa -
                                    log_qz) * (1.1 - self.sym_warmup)
            return lb

        # Lower bound for labeled data
        out_layers = [
            l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy
        ]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out

        # Prior p(y) expecting that all classes are evenly distributed
        py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                           log_py_l, log_px_zy_l)
        lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
        # Scale the supervised cross entropy with the beta weight
        log_qy_ax_l *= (self.sym_beta * (n / n_l))
        # Collect the lower bound term and mean over sampling dimensions
        lb_l += log_qy_ax_l.mean(axis=(1, 2))

        # Lower bound for unlabeled data
        bs_u = self.sym_x_u.shape[0]

        # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
        # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
        #               x_repeat                     t_repeat
        #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
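        # A hedged NumPy illustration of the same construction (not part of the graph;
        # assumes n_y = 3 and bs_u = 2, matching the table above):
        #   np.eye(3).reshape((3, 1, 3)).repeat(2, axis=1).reshape((-1, 3))
        #   -> [[1,0,0], [1,0,0], [0,1,0], [0,1,0], [0,0,1], [0,0,1]]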
        t_eye = T.eye(self.n_y, k=0)
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u,
                                                            axis=1).reshape(
                                                                (-1, self.n_y))
        x_u = self.sym_x_u.reshape(
            (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape(
                (-1, self.n_l, self.n_c))

        # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
        a_x_u = get_output(self.l_qa,
                           self.sym_x_u,
                           batch_norm_update_averages=True,
                           batch_norm_use_averages=False)
        a_x_u_rep = a_x_u.reshape(
            (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y,
                                                           axis=0).reshape(
                                                               (-1, self.n_a))
        out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

        # Prior p(y) expecting that all classes are evenly distributed
        py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                           log_py_u, log_px_zy_u)
        lb_u = lb_u.reshape(
            (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
        inputs = {
            self.l_x_in: self.sym_x_u,
            self.l_a_in: a_x_u.reshape((-1, self.n_a))
        }
        y_u = get_output(self.l_qy,
                         inputs,
                         batch_norm_update_averages=True,
                         batch_norm_use_averages=False).mean(axis=(1, 2))
        y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
        y_u /= T.sum(y_u, axis=1, keepdims=True)
        lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

        # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
        weight_priors = 0.0
        for p in self.trainable_model_params:
            if 'W' not in str(p):
                continue
            weight_priors += log_normal(p, 0, 1).sum()

        # Collect the lower bound and scale it with the weight priors.
        elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
        lb_labeled = -lb_l.mean()
        lb_unlabeled = -lb_u.mean()
        log_px = log_px_zy_l.mean() + log_px_zy_u.mean()
        log_pz = log_pz_l.mean() + log_pz_u.mean()
        log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean()
        log_pa = log_pa_l.mean() + log_pa_u.mean()
        log_qa = log_qa_x_l.mean() + log_qa_x_u.mean()

        grads_collect = T.grad(elbo, self.trainable_model_params)
        params_collect = self.trainable_model_params
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1,
                       sym_beta2)

        # Training function
        indices = self._srng.choice(size=[self.sym_bs_l],
                                    a=sh_train_x_l.shape[0],
                                    replace=False)
        x_batch_l = sh_train_x_l[indices]
        t_batch_l = sh_train_t_l[indices]
        x_batch_u = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape,
                                            n=1,
                                            p=x_batch_u,
                                            dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape,
                                            n=1,
                                            p=x_batch_l,
                                            dtype=theano.config.floatX)

        givens = {
            self.sym_x_l: x_batch_l,
            self.sym_x_u: x_batch_u,
            self.sym_t_l: t_batch_l
        }
        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
            self.sym_lr, sym_beta1, sym_beta2, self.sym_samples,
            self.sym_warmup
        ]
        outputs = [
            elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz, log_pa,
            log_qa
        ]
        f_train = theano.function(inputs=inputs,
                                  outputs=outputs,
                                  givens=givens,
                                  updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize_unlabeled'] = 100
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 0.1
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['inputs']['warmup'] = 0.1
        self.train_args['outputs']['lb'] = '%0.3f'
        self.train_args['outputs']['lb-l'] = '%0.3f'
        self.train_args['outputs']['lb-u'] = '%0.3f'
        self.train_args['outputs']['px'] = '%0.3f'
        self.train_args['outputs']['pz'] = '%0.3f'
        self.train_args['outputs']['qz'] = '%0.3f'
        self.train_args['outputs']['pa'] = '%0.3f'
        self.train_args['outputs']['qa'] = '%0.3f'

        # Validation and test function
        y = get_output(self.l_qy, self.sym_x_l,
                       deterministic=True).mean(axis=(1, 2))
        class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
        givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples],
                                 outputs=[class_err],
                                 givens=givens)

        # Test args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['test'] = '%0.2f%%'

        f_validate = None
        if validation_set is not None:
            givens = {
                self.sym_x_l: self.sh_valid_x,
                self.sym_t_l: self.sh_valid_t
            }
            f_validate = theano.function(inputs=[self.sym_samples],
                                         outputs=[class_err],
                                         givens=givens)
            # Default validation args. Note that these can be changed during or prior to training.
            self.validate_args['inputs']['samples'] = 1
            self.validate_args['outputs']['validation'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
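
A hedged sketch of driving the compiled functions returned above, using the default values
registered in train_args (model, the data sets, and n_batches are assumed scaffolding, not
from the original source):

f_train, f_test, f_val, train_args, test_args, val_args = model.build_model(
    train_unlabeled, train_labeled, test_set)
for i in range(n_batches):
    # argument order follows the `inputs` list above: index, batch size (unlabeled),
    # batch size (labeled), beta, learning rate, beta1, beta2, MC samples, warmup
    elbo, lb_l, lb_u, px, pz, qz, pa, qa = f_train(i, 100, 100, 0.1, 3e-4, 0.9, 0.999, 1, 0.1)
test_err, = f_test(1)   # classification error in percent, 1 MC sample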
Example #16
 def get_output_for(self, input, **kwargs):
     activation = T.dot(input, self.C)
     if self.b is not None:
         activation = activation + self.b.dimshuffle('x', 0)
     return nonlinearities.softmax(activation)
Example #17
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--num_epochs', type=int, default=1000, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--delta', type=float, default=0.0, help='weight for expectation-linear regularization')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', choices=['std', 'recurrent'], help='dropout pattern')
    parser.add_argument('--schedule', nargs='+', type=int, help='schedule for learning rate decay')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("Sequence Labeling")
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    num_units = args.num_units
    num_filters = args.num_filters
    regular = args.regular
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    delta = args.delta
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    schedule = args.schedule
    output_predict = args.output_prediction
    dropout = args.dropout
    p = 0.5

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = data_utils.create_alphabets("data/alphabets/",
                                                                                            [train_path, dev_path,
                                                                                             test_path],
                                                                                            40000)
    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())

    num_labels = pos_alphabet.size() - 1

    logger.info("Reading Data")
    data_train = data_utils.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    data_dev = data_utils.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    data_test = data_utils.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)

    num_data = sum([len(bucket) for bucket in data_train])

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    mask_nr_var = T.matrix(name='masks_nr', dtype=theano.config.floatX)
    word_var = T.imatrix(name='inputs')
    char_var = T.itensor3(name='char-inputs')

    network = build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet, dropout, num_units, num_labels,
                            grad_clipping, num_filters, p)

    logger.info("Network structure: hidden=%d, filter=%d, dropout=%s" % (num_units, num_filters, dropout))
    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    num_tokens_nr = mask_nr_var.sum(dtype=theano.config.floatX)

    # get output of bi-lstm-cnn-crf with shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(network)
    energies_train_det = lasagne.layers.get_output(network, deterministic=True)
    energies_eval = lasagne.layers.get_output(network, deterministic=True)

    loss_train_org = chain_crf_loss(energies_train, target_var, mask_var).mean()

    energy_shape = energies_train.shape
    # [batch, length, num_labels, num_labels] --> [batch*length, num_labels*num_labels]
    energies = T.reshape(energies_train, (energy_shape[0] * energy_shape[1], energy_shape[2] * energy_shape[3]))
    energies = nonlinearities.softmax(energies)
    energies_det = T.reshape(energies_train_det, (energy_shape[0] * energy_shape[1], energy_shape[2] * energy_shape[3]))
    energies_det = nonlinearities.softmax(energies_det)
    # [batch*length, num_labels*num_labels] --> [batch, length*num_labels*num_labels]
    energies = T.reshape(energies, (energy_shape[0], energy_shape[1] * energy_shape[2] * energy_shape[3]))
    energies_det = T.reshape(energies_det, (energy_shape[0], energy_shape[1] * energy_shape[2] * energy_shape[3]))

    loss_train_expect_linear = lasagne.objectives.squared_error(energies, energies_det)
    loss_train_expect_linear = loss_train_expect_linear.sum(axis=1)
    loss_train_expect_linear = loss_train_expect_linear.mean()

    loss_train = loss_train_org + delta * loss_train_expect_linear
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = chain_crf_accuracy(energies_train, target_var)
    corr_nr_train = (corr_train * mask_nr_var).sum(dtype=theano.config.floatX)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = chain_crf_accuracy(energies_eval, target_var)
    corr_nr_eval = (corr_eval * mask_nr_var).sum(dtype=theano.config.floatX)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = nesterov_momentum(loss_train, params=params, learning_rate=learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var],
                               [loss_train, loss_train_org, loss_train_expect_linear,
                                corr_train, corr_nr_train, num_tokens, num_tokens_nr], updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var],
                              [corr_eval, corr_nr_eval, num_tokens, num_tokens_nr, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: regularization: %s(%f), dropout: %s, delta: %.2f (#training data: %d, batch size: %d, clip: %.1f)..." \
        % (regular, (0.0 if regular == 'none' else gamma), dropout, delta, num_data, batch_size, grad_clipping))

    num_batches = num_data / batch_size + 1
    dev_correct = 0.0
    dev_correct_nr = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_correct_nr = 0.0
    test_total = 0
    test_total_nr = 0
    test_inst = 0
    lr = learning_rate
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_err_org = 0.0
        train_err_linear = 0.0
        train_corr = 0.0
        train_corr_nr = 0.0
        train_total = 0
        train_total_nr = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        for batch in xrange(1, num_batches + 1):
            wids, cids, pids, _, _, masks = data_utils.get_batch(data_train, batch_size)
            masks_nr = np.copy(masks)
            masks_nr[:, 0] = 0
            err, err_org, err_linear, corr, corr_nr, num, num_nr = train_fn(wids, cids, pids, masks, masks_nr)
            train_err += err * wids.shape[0]
            train_err_org += err_org * wids.shape[0]
            train_err_linear += err_linear * wids.shape[0]
            train_corr += corr
            train_corr_nr += corr_nr
            train_total += num
            train_total_nr += num_nr
            train_inst += wids.shape[0]
            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time left (estimated): %.2fs' % (
                batch, num_batches, train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst,
                train_corr * 100 / train_total, train_corr_nr * 100 / train_total_nr, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_batches * batch_size
        assert train_total == train_total_nr + train_inst
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f,  loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time: %.2fs' % (
            train_inst, train_inst, train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst,
            train_corr * 100 / train_total, train_corr_nr * 100 / train_total_nr, time.time() - start_time)

        # evaluate performance on dev data
        dev_corr = 0.0
        dev_corr_nr = 0.0
        dev_total = 0
        dev_total_nr = 0
        dev_inst = 0
        for batch in data_utils.iterate_batch(data_dev, batch_size):
            wids, cids, pids, _, _, masks = batch
            masks_nr = np.copy(masks)
            masks_nr[:, 0] = 0
            corr, corr_nr, num, num_nr, predictions = eval_fn(wids, cids, pids, masks, masks_nr)
            dev_corr += corr
            dev_corr_nr += corr_nr
            dev_total += num
            dev_total_nr += num_nr
            dev_inst += wids.shape[0]
        assert dev_total == dev_total_nr + dev_inst
        print 'dev corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%%' % (
            dev_corr, dev_total, dev_corr * 100 / dev_total, dev_corr_nr, dev_total_nr, dev_corr_nr * 100 / dev_total_nr)

        if dev_correct_nr < dev_corr_nr:
            dev_correct = dev_corr
            dev_correct_nr = dev_corr_nr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_corr = 0.0
            test_corr_nr = 0.0
            test_total = 0
            test_total_nr = 0
            test_inst = 0
            for batch in data_utils.iterate_batch(data_test, batch_size):
                wids, cids, pids, _, _, masks = batch
                masks_nr = np.copy(masks)
                masks_nr[:, 0] = 0
                corr, corr_nr, num, num_nr, predictions = eval_fn(wids, cids, pids, masks, masks_nr)
                test_corr += corr
                test_corr_nr += corr_nr
                test_total += num
                test_total_nr += num_nr
                test_inst += wids.shape[0]
            assert test_total == test_total_nr + test_inst
            test_correct = test_corr
            test_correct_nr = test_corr_nr
        print "best dev  corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            dev_correct, dev_total, dev_correct * 100 / dev_total,
            dev_correct_nr, dev_total_nr, dev_correct_nr * 100 / dev_total_nr, best_epoch)
        print "best test corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            test_correct, test_total, test_correct * 100 / test_total,
            test_correct_nr, test_total_nr, test_correct_nr * 100 / test_total_nr, best_epoch)

        if epoch in schedule:
            lr = lr * decay_rate
            updates = nesterov_momentum(loss_train, params=params, learning_rate=lr, momentum=momentum)
            train_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var],
                                       [loss_train, loss_train_org, loss_train_expect_linear,
                                        corr_train, corr_nr_train, num_tokens, num_tokens_nr], updates=updates)
Example #18
def safe_softmax(x, eps=1e-6):
    """ Prevents that any of the outputs become exactly 1 or 0 """
    x = softmax(x)
    x = T.maximum(x, eps)
    x = T.minimum(x, 1 - eps)
    return x