def build_trainer_with_model_parallel(self, src, src_mask, trg, trg_mask, ite, ps_device, devices, l1_reg_weight=1e-6, l2_reg_weight=1e-6):
    """Build the training function with model parallelism across `devices`.

    The loss graph is partitioned over `devices` by
    `calc_loss_with_model_parallel`, with parameters served from
    `ps_device`.  Only the TensorFlow backend is supported.
    Side effect: sets `self.train_fn`.  `ite` is currently unused.
    """
    # Model parallelism relies on explicit tf.device placement.
    assert K._BACKEND == 'tensorflow'

    # Masks come in 2D; the loss expects a trailing singleton axis
    # (presumably (time, batch) -> (time, batch, 1) -- TODO confirm).
    mask_src = K.expand_dims(src_mask)
    mask_trg = K.expand_dims(trg_mask)

    loss = self.calc_loss_with_model_parallel(
        src, mask_src, trg, mask_trg,
        ps_device=ps_device,
        devices=devices,
        l1_reg_weight=l1_reg_weight,
        l2_reg_weight=l2_reg_weight)

    # colocate_gradients_with_ops keeps each gradient op on the same
    # device that ran the corresponding forward op.
    clipped_grads = grad_clip(
        tf.gradients(loss, self.params, colocate_gradients_with_ops=True),
        self.clip_c)
    updates = adadelta(self.params, clipped_grads)

    self.train_fn = K.function([src, src_mask, trg, trg_mask], [loss], updates=updates)
def build_trainer(self, src, src_mask, trg, trg_mask, ite, l1_reg_weight=1e-6, l2_reg_weight=1e-6, softmax_output_num_sampled=100000):
    """Build the single-device training function (Keras-backend variant).

    Encodes `src`, decodes `trg` teacher-forced, computes a (sampled-)
    softmax loss plus L1/L2 regularization, and compiles an adadelta
    training function.

    Side effects: sets `self.cost`, `self.L1`, `self.L2` and
    `self.train_fn`.  `ite` is currently unused.
    """
    # Masks come in 2D; downstream layers expect a trailing singleton
    # axis (presumably (time, batch, 1) -- TODO confirm layout).
    src_mask_3d = K.expand_dims(src_mask)
    trg_mask_3d = K.expand_dims(trg_mask)

    annotations = self.encoder.apply(src, src_mask_3d)
    # init_context = annotations[0, :, -self.n_hids_src:]
    # modification #1
    # mean pooling of the encoder annotations over time (masked) as the
    # decoder's initial context
    init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)

    trg_emb = self.table_trg.apply(trg)
    # shift_right assumes a 3D tensor with time steps on dimension one,
    # so move time to axis 1, shift, then move it back (teacher forcing:
    # the decoder sees the previous target token at each step).
    trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])), [1, 0, 2])

    hiddens, readout, _ = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                    mask_below=trg_mask_3d,
                                                    init_context=init_context,
                                                    c=annotations,
                                                    c_mask=src_mask_3d)

    # apply dropout
    if self.dropout > 0.:
        logger.info('Apply dropout with p = {}'.format(self.dropout))
        readout = Dropout(readout, self.dropout)

    self.cost = calc_loss_from_readout(readout=readout,
                                       targets=trg,
                                       targets_mask=trg_mask_3d,
                                       logisticRegressionLayer=self.logistic_layer,
                                       softmax_output_num_sampled=softmax_output_num_sampled)

    # L1/L2 regularization over all parameters; generator expressions
    # avoid building throwaway lists and match the sibling trainers.
    self.L1 = sum(K.sum(K.abs(param)) for param in self.params)
    self.L2 = sum(K.sum(K.square(param)) for param in self.params)
    params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

    # train cost
    train_cost = self.cost + params_regular

    # gradients, with clipping applied before the update rule
    grads = K.gradients(train_cost, self.params)
    grads = grad_clip(grads, self.clip_c)

    # updates
    updates = adadelta(self.params, grads)

    # train function
    inps = [src, src_mask, trg, trg_mask]
    self.train_fn = K.function(inps, [train_cost], updates=updates, name='train_func')
def build_trainer_with_data_parallel(self, src, src_mask, trg, trg_mask, ite, devices, l1_reg_weight=1e-6, l2_reg_weight=1e-6, softmax_output_num_sampled=100000):
    """Build the training function with data parallelism over `devices`.

    Each device gets its own shard of the batch (`src`, `src_mask`,
    `trg`, `trg_mask` are lists, one entry per device), computes its own
    loss and gradients, and the gradients are averaged before a single
    adadelta update.  Only the TensorFlow backend is supported.

    Side effect: sets `self.train_fn`, which outputs the averaged loss
    followed by each per-device loss.  `ite` is currently unused.
    """
    assert K._BACKEND == 'tensorflow'
    # Each per-device mask gets a trailing singleton axis -- presumably
    # (time, batch) -> (time, batch, 1); TODO confirm.
    src_mask_3d = [K.expand_dims(mask) for mask in src_mask]
    trg_mask_3d = [K.expand_dims(mask) for mask in trg_mask]
    num_devices = len(devices)
    loss_list = []
    grads_list = []
    # TODO: group the devices by hosts, first calculate the averaged gradients for each host
    for i, device in enumerate(devices):
        # Pin both the forward loss and its gradients to this device.
        with tf.device(device):
            loss = self.calc_loss(
                src[i],
                src_mask_3d[i],
                trg[i],
                trg_mask_3d[i],
                l1_reg_weight=l1_reg_weight,
                l2_reg_weight=l2_reg_weight,
                softmax_output_num_sampled=softmax_output_num_sampled)
            loss_list.append(loss)
            grads = K.gradients(loss, self.params)
            grads_list.append(grads)
    # Averaged loss is reported only; updates come from averaged grads.
    avg_loss = sum(loss_list) / num_devices
    # use customized version of gradient to enable colocate_gradients with_ops
    # to ensure the gradient are computed by the same device that do the forward computation
    grads = avg_grads(grads_list)
    grads = grad_clip(grads, self.clip_c)
    updates = adadelta(self.params, grads)
    # Inputs are the concatenation of the per-device shards.
    inps = src + src_mask + trg + trg_mask
    self.train_fn = K.function(inps, [avg_loss] + loss_list, updates=updates)
def build_trainer(self, src, src_mask, src_hist, src_hist_mask, trg, trg_mask, ite):
    """Build the Theano training function with cross-sentence history.

    A two-level RNN summarizes the history sentences (`src_hist`) into a
    single context vector, which conditions the encoder, the decoder's
    initial context, and the decoder pipeline.  Optionally adds a
    reconstruction cost (target -> source) when
    `self.with_reconstruction` is set.

    Side effects: sets `self.cost`, `self.L1`, `self.L2`,
    `self.train_fn` (and `self.reconstruction_cost` when reconstruction
    is enabled).  `ite` is currently unused.
    """
    # added by Longyue
    # checked by Zhaopeng: sentence dim = n_steps, hist_len, batch_size (4, 3, 25)
    # hist = (bath_size, sent_num, sent_len) --.T-->
    # hist = (sent_len, sent_num, bath_size) --lookup table-->
    # (sent_len, sent_num, bath_size, word_emb) --reshape-->
    # (sent_len, sent_num*bath_size, word_emb) --word-level rnn-->
    # (sent_len, sent_num*bath_size, hidden_size) --reshape-->
    # (sent_len, sent_num, bath_size, hidden_size) --[-1]-->
    # (sent_num, bath_size, hidden_size) --sent-level rnn-->
    # (sent_num, bath_size, hidden_size) --[-1]-->
    # (bath_size, hidden_size) = cross-sent context vector
    annotations_1 = self.encoder_hist_1.apply_1(src_hist, src_hist_mask)
    annotations_1 = annotations_1[-1]  # get last hidden states
    annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
    annotations_3 = annotations_2[-1]  # get last hidden states

    # modified by Longyue: encoder also conditions on the history vector
    annotations = self.encoder.apply(src, src_mask, annotations_3)
    # init_context = annotations[0, :, -self.n_hids_src:]
    # modification #1
    # mean pooling of annotations over time (masked)
    init_context = (annotations * src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

    # added by Longyue: append the cross-sentence context to init_context
    init_context = concatenate([init_context, annotations_3], axis=annotations_3.ndim - 1)

    trg_emb = self.table_trg.apply(trg)
    # Teacher forcing: shift target embeddings one step right in time.
    trg_emb_shifted = T.zeros_like(trg_emb)
    trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])

    # modified by Longyue
    hiddens, readout, alignment = self.decoder.run_pipeline(
        state_below=trg_emb_shifted,
        mask_below=trg_mask,
        init_context=init_context,
        c=annotations,
        c_mask=src_mask,
        hist=annotations_3)

    # apply dropout
    # NOTE(review): here dropout is applied when self.dropout < 1.0,
    # i.e. self.dropout looks like a keep-probability -- confirm, since
    # the Keras-backend trainer uses `> 0.` instead.
    if self.dropout < 1.0:
        logger.info('Apply dropout with p = {}'.format(self.dropout))
        readout = Dropout(self.trng, readout, 1, self.dropout)

    p_y_given_x = self.logistic_layer.get_probs(readout)

    # Per-sentence-normalized negative log-likelihood (divide by batch size).
    self.cost = self.logistic_layer.cost(p_y_given_x, trg, trg_mask) / trg.shape[1]

    # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

    # added by Zhaopeng Tu, 2016-07-12
    # for reconstruction
    if self.with_reconstruction:
        # now hiddens is the annotations
        inverse_init_context = (hiddens * trg_mask[:, :, None]).sum(0) / trg_mask.sum(0)[:, None]

        src_emb = self.table_src.apply(src)
        src_emb_shifted = T.zeros_like(src_emb)
        src_emb_shifted = T.set_subtensor(src_emb_shifted[1:], src_emb[:-1])
        # Inverse decoder reconstructs the source from decoder hiddens.
        inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
            state_below=src_emb_shifted,
            mask_below=src_mask,
            init_context=inverse_init_context,
            c=hiddens,
            c_mask=trg_mask)

        # apply dropout
        if self.dropout < 1.0:
            # logger.info('Apply dropout with p = {}'.format(self.dropout))
            inverse_readout = Dropout(self.srng, inverse_readout, 1, self.dropout)

        p_x_given_y = self.inverse_logistic_layer.get_probs(inverse_readout)

        self.reconstruction_cost = self.inverse_logistic_layer.cost(p_x_given_y, src, src_mask) / src.shape[1]

        # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
        self.cost += self.reconstruction_cost * self.reconstruction_weight

    # L1/L2 regularization with fixed 1e-6 weights.
    self.L1 = sum(T.sum(abs(param)) for param in self.params)
    self.L2 = sum(T.sum(param**2) for param in self.params)

    params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
    # params_regular = theano.printing.Print('params_regular:')(params_regular)

    # train cost
    train_cost = self.cost + params_regular

    # gradients
    grads = T.grad(train_cost, self.params)

    # apply gradient clipping here
    grads = grad_clip(grads, self.clip_c)

    # updates
    updates = adadelta(self.params, grads)

    # train function
    # modified by Longyue: history inputs added
    inps = [src, src_mask, src_hist, src_hist_mask, trg, trg_mask]

    self.train_fn = theano.function(inps, [train_cost], updates=updates, name='train_function')
def build_trainer(self, src, src_mask, trg, trg_mask, ite):
    """Build the Theano training function (encoder-decoder + optional
    reconstruction).

    Encodes `src`, decodes `trg` teacher-forced, and optionally adds a
    reconstruction cost (target hiddens -> source) when
    `self.with_reconstruction` is set.

    Side effects: sets `self.cost`, `self.L1`, `self.L2`,
    `self.train_fn` (and `self.reconstruction_cost` when reconstruction
    is enabled).  `ite` is currently unused.
    """
    annotations = self.encoder.apply(src, src_mask)
    # init_context = annotations[0, :, -self.n_hids_src:]
    # modification #1
    # mean pooling of annotations over time (masked) as initial context
    init_context = (annotations * src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

    trg_emb = self.table_trg.apply(trg)
    # Teacher forcing: shift target embeddings one step right in time.
    trg_emb_shifted = T.zeros_like(trg_emb)
    trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
    hiddens, readout, alignment = self.decoder.run_pipeline(
        state_below=trg_emb_shifted,
        mask_below=trg_mask,
        init_context=init_context,
        c=annotations,
        c_mask=src_mask)

    # apply dropout
    # NOTE(review): dropout applied when self.dropout < 1.0, so
    # self.dropout looks like a keep-probability here -- confirm.
    if self.dropout < 1.0:
        logger.info('Apply dropout with p = {}'.format(self.dropout))
        readout = Dropout(self.trng, readout, 1, self.dropout)

    p_y_given_x = self.logistic_layer.get_probs(readout)

    # Negative log-likelihood normalized by batch size (trg.shape[1]).
    self.cost = self.logistic_layer.cost(p_y_given_x, trg, trg_mask) / trg.shape[1]

    # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

    # added by Zhaopeng Tu, 2016-07-12
    # for reconstruction
    if self.with_reconstruction:
        # now hiddens is the annotations
        inverse_init_context = (hiddens * trg_mask[:, :, None]).sum(0) / trg_mask.sum(0)[:, None]

        src_emb = self.table_src.apply(src)
        src_emb_shifted = T.zeros_like(src_emb)
        src_emb_shifted = T.set_subtensor(src_emb_shifted[1:], src_emb[:-1])
        # Inverse decoder reconstructs the source from decoder hiddens.
        inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
            state_below=src_emb_shifted,
            mask_below=src_mask,
            init_context=inverse_init_context,
            c=hiddens,
            c_mask=trg_mask)

        # apply dropout
        if self.dropout < 1.0:
            # logger.info('Apply dropout with p = {}'.format(self.dropout))
            inverse_readout = Dropout(self.srng, inverse_readout, 1, self.dropout)

        p_x_given_y = self.inverse_logistic_layer.get_probs(inverse_readout)

        self.reconstruction_cost = self.inverse_logistic_layer.cost(p_x_given_y, src, src_mask) / src.shape[1]

        # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
        self.cost += self.reconstruction_cost * self.reconstruction_weight

    # L1/L2 regularization with fixed 1e-6 weights.
    self.L1 = sum(T.sum(abs(param)) for param in self.params)
    self.L2 = sum(T.sum(param**2) for param in self.params)

    params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
    # params_regular = theano.printing.Print('params_regular:')(params_regular)

    # train cost
    train_cost = self.cost + params_regular

    # gradients
    grads = T.grad(train_cost, self.params)

    # apply gradient clipping here
    grads = grad_clip(grads, self.clip_c)

    # updates
    updates = adadelta(self.params, grads)

    # train function
    inps = [src, src_mask, trg, trg_mask]

    self.train_fn = theano.function(inps, [train_cost], updates=updates, name='train_function')
def build_trainer(self, src, src_mask, trg, trg_mask):
    """Build the Theano training function (layer-norm/adam variant).

    Like the base trainer but the decoder pipeline returns a 4-tuple
    prefix (hiddens, ctxs, readout, alignment), and the optimizer is
    chosen by `self.with_layernorm`: adam (with a learning-rate input
    and a separate update function) when set, adadelta otherwise.

    Side effects: sets `self.cost`, `self.L1`, `self.L2`,
    `self.train_fn` (and `self.update_fn`, `self.reconstruction_cost`
    depending on the flags).
    """
    annotations = self.encoder.apply(src, src_mask)
    # init_context = annotations[0, :, -self.n_hids_src:]
    # modification #1
    # mean pooling of annotations over time (masked) as initial context
    init_context = (annotations * src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

    trg_emb = self.table_trg.apply(trg)
    # Teacher forcing: shift target embeddings one step right in time.
    trg_emb_shifted = T.zeros_like(trg_emb)
    trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
    results = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                        mask_below=trg_mask,
                                        init_context=init_context,
                                        c=annotations,
                                        c_mask=src_mask)
    hiddens, ctxs, readout, alignment = results[:4]

    # apply dropout
    # NOTE(review): dropout applied when self.dropout < 1.0, so
    # self.dropout looks like a keep-probability here -- confirm.
    if self.dropout < 1.0:
        logger.info('Apply dropout with p = {}'.format(self.dropout))
        readout = Dropout(self.trng, readout, 1, self.dropout)

    p_y_given_x = self.logistic_layer.get_probs(readout)

    # Negative log-likelihood normalized by batch size (trg.shape[1]).
    self.cost = self.logistic_layer.cost(p_y_given_x, trg, trg_mask) / trg.shape[1]

    # self.cost = theano.printing.Print('likilihood cost:')(self.cost)

    # added by Zhaopeng Tu, 2016-07-12
    # for reconstruction
    if self.with_reconstruction:
        # now hiddens is the annotations
        inverse_init_context = (hiddens * trg_mask[:, :, None]).sum(0) / trg_mask.sum(0)[:, None]

        src_emb = self.table_src.apply(src)
        src_emb_shifted = T.zeros_like(src_emb)
        src_emb_shifted = T.set_subtensor(src_emb_shifted[1:], src_emb[:-1])
        # Inverse decoder reconstructs the source from decoder hiddens.
        inverse_results = self.inverse_decoder.run_pipeline(
            state_below=src_emb_shifted,
            mask_below=src_mask,
            init_context=inverse_init_context,
            c=hiddens,
            c_mask=trg_mask)
        inverse_hiddens, inverse_ctxs, inverse_readout, inverse_alignment = inverse_results[:4]

        # apply dropout
        if self.dropout < 1.0:
            # logger.info('Apply dropout with p = {}'.format(self.dropout))
            inverse_readout = Dropout(self.srng, inverse_readout, 1, self.dropout)

        p_x_given_y = self.inverse_logistic_layer.get_probs(inverse_readout)

        self.reconstruction_cost = self.inverse_logistic_layer.cost(p_x_given_y, src, src_mask) / src.shape[1]

        # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
        self.cost += self.reconstruction_cost * self.reconstruction_weight

    # L1/L2 regularization with fixed 1e-6 weights.
    self.L1 = sum(T.sum(abs(param)) for param in self.params)
    self.L2 = sum(T.sum(param**2) for param in self.params)

    params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
    # params_regular = theano.printing.Print('params_regular:')(params_regular)

    # train cost
    train_cost = self.cost + params_regular

    # gradients
    grads = T.grad(train_cost, self.params)

    # apply gradient clipping here
    grads = grad_clip(grads, self.clip_c)

    # train function
    inps = [src, src_mask, trg, trg_mask]
    outs = [train_cost]

    if self.with_layernorm:
        # adam path: expose a symbolic learning rate; adam returns a
        # pair of compiled functions (forward/grad-accum and update).
        inps = [src, src_mask, trg, trg_mask]
        lr = T.scalar(name='lr')
        print 'Building optimizers...',
        self.train_fn, self.update_fn = adam(lr, self.params, grads, inps, outs)
    else:
        # updates
        updates = adadelta(self.params, grads)

        # mode=theano.Mode(linker='vm') for ifelse
        # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch.
        self.train_fn = theano.function(inps, outs, updates=updates, name='train_function', mode=theano.Mode(linker='vm'))