Exemplo n.º 1
0
    def build_trainer_with_model_parallel(self, src, src_mask, trg, trg_mask, ite, ps_device, devices,
                                          l1_reg_weight=1e-6, l2_reg_weight=1e-6):
        assert K._BACKEND == 'tensorflow'

        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)

        # compute loss and grads
        loss = self.calc_loss_with_model_parallel(src,
                                                  src_mask_3d,
                                                  trg,
                                                  trg_mask_3d,
                                                  ps_device=ps_device,
                                                  devices=devices,
                                                  l1_reg_weight=l1_reg_weight,
                                                  l2_reg_weight=l2_reg_weight)

        grads = tf.gradients(loss, self.params, colocate_gradients_with_ops=True)

        grads = grad_clip(grads, self.clip_c)
        updates = adadelta(self.params, grads)
        inps = [src, src_mask, trg, trg_mask]

        self.train_fn = K.function(inps,
                                   [loss],
                                   updates=updates)
Exemplo n.º 2
0
    def build_trainer(self, src, src_mask, trg, trg_mask, ite,
                      l1_reg_weight=1e-6,
                      l2_reg_weight=1e-6,
                      softmax_output_num_sampled=100000):

        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)
        annotations = self.encoder.apply(src, src_mask_3d)
        
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                               [1, 0, 2])

        hiddens, readout, _ = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                        mask_below=trg_mask_3d,
                                                        init_context=init_context,
                                                        c=annotations,
                                                        c_mask=src_mask_3d)
        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        self.cost = calc_loss_from_readout(readout=readout,
                                           targets=trg,
                                           targets_mask=trg_mask_3d,
                                           logisticRegressionLayer=self.logistic_layer,
                                           softmax_output_num_sampled=softmax_output_num_sampled)
        # for reconstruction

        self.L1 = sum([K.sum(K.abs(param)) for param in self.params])
        self.L2 = sum([K.sum(K.square(param)) for param in self.params])

        params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = K.gradients(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        inps = [src, src_mask, trg, trg_mask]

        self.train_fn = K.function(inps, [train_cost], updates=updates, name='train_func')
Exemplo n.º 3
0
    def build_trainer_with_data_parallel(self,
                                         src,
                                         src_mask,
                                         trg,
                                         trg_mask,
                                         ite,
                                         devices,
                                         l1_reg_weight=1e-6,
                                         l2_reg_weight=1e-6,
                                         softmax_output_num_sampled=100000):

        assert K._BACKEND == 'tensorflow'

        src_mask_3d = [K.expand_dims(mask) for mask in src_mask]
        trg_mask_3d = [K.expand_dims(mask) for mask in trg_mask]

        num_devices = len(devices)

        loss_list = []
        grads_list = []

        # TODO: group the devices by hosts, first calculate the averaged gradients for each host
        for i, device in enumerate(devices):
            with tf.device(device):
                loss = self.calc_loss(
                    src[i],
                    src_mask_3d[i],
                    trg[i],
                    trg_mask_3d[i],
                    l1_reg_weight=l1_reg_weight,
                    l2_reg_weight=l2_reg_weight,
                    softmax_output_num_sampled=softmax_output_num_sampled)

                loss_list.append(loss)
                grads = K.gradients(loss, self.params)
                grads_list.append(grads)

        avg_loss = sum(loss_list) / num_devices
        # use customized version of gradient to enable colocate_gradients with_ops
        # to ensure the gradient are computed by the same device that do the forward computation
        grads = avg_grads(grads_list)
        grads = grad_clip(grads, self.clip_c)
        updates = adadelta(self.params, grads)

        inps = src + src_mask + trg + trg_mask

        self.train_fn = K.function(inps, [avg_loss] + loss_list,
                                   updates=updates)
Exemplo n.º 4
0
    def build_trainer(self, src, src_mask, src_hist, src_hist_mask, trg,
                      trg_mask, ite):

        # added by Longyue
        # checked by Zhaopeng: sentence dim = n_steps, hist_len, batch_size (4, 3, 25)
        # hist = (bath_size, sent_num, sent_len) --.T-->
        # hist = (sent_len, sent_num, bath_size) --lookup table-->
        # (sent_len, sent_num, bath_size, word_emb) --reshape-->
        # (sent_len, sent_num*bath_size, word_emb) --word-level rnn-->
        # (sent_len, sent_num*bath_size, hidden_size) --reshape-->
        # (sent_len, sent_num, bath_size, hidden_size) --[-1]-->
        # (sent_num, bath_size, hidden_size) --sent-level rnn-->
        # (sent_num, bath_size, hidden_size) --[-1]-->
        # (bath_size, hidden_size) = cross-sent context vector

        annotations_1 = self.encoder_hist_1.apply_1(src_hist, src_hist_mask)
        annotations_1 = annotations_1[-1]  # get last hidden states
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]  # get last hidden states

        #modified by Longyue
        annotations = self.encoder.apply(src, src_mask, annotations_3)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

        #added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
        # modified by Longyue
        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask,
            hist=annotations_3)

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        # modified by Longyue
        inps = [src, src_mask, src_hist, src_hist_mask, trg, trg_mask]

        self.train_fn = theano.function(inps, [train_cost],
                                        updates=updates,
                                        name='train_function')
Exemplo n.º 5
0
    def build_trainer(self, src, src_mask, trg, trg_mask, ite):

        annotations = self.encoder.apply(src, src_mask)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask)

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        inps = [src, src_mask, trg, trg_mask]

        self.train_fn = theano.function(inps, [train_cost],
                                        updates=updates,
                                        name='train_function')
Exemplo n.º 6
0
    def build_trainer(self, src, src_mask, trg, trg_mask):
        annotations = self.encoder.apply(src, src_mask)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
        results = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                            mask_below=trg_mask,
                                            init_context=init_context,
                                            c=annotations,
                                            c_mask=src_mask)

        hiddens, ctxs, readout, alignment = results[:4]

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_results = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            inverse_hiddens, inverse_ctxs, inverse_readout, inverse_alignment = inverse_results[:
                                                                                                4]

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # train function
        inps = [src, src_mask, trg, trg_mask]
        outs = [train_cost]

        if self.with_layernorm:
            inps = [src, src_mask, trg, trg_mask]
            lr = T.scalar(name='lr')
            print 'Building optimizers...',
            self.train_fn, self.update_fn = adam(lr, self.params, grads, inps,
                                                 outs)
        else:
            # updates
            updates = adadelta(self.params, grads)

            # mode=theano.Mode(linker='vm') for ifelse
            # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch.
            self.train_fn = theano.function(inps,
                                            outs,
                                            updates=updates,
                                            name='train_function',
                                            mode=theano.Mode(linker='vm'))