def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        label_smooth_eps, ):
    enc_inputs = make_all_inputs(encoder_data_input_fields +
                                 encoder_util_input_fields)
    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        enc_inputs, )
    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] +
                                 decoder_util_input_fields)
    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        dec_inputs,
        enc_output, )
    # Padding indices do not contribute to the total loss; the weights are used
    # to cancel padding positions when calculating the loss.
    label, weights = make_all_inputs(label_data_input_fields)
    if label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)
    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num
def forward(self, enc_inputs, dec_inputs, label, weights):
    """
    forward
    :param enc_inputs:
    :param dec_inputs:
    :param label:
    :param weights:
    :return:
    """
    enc_output = self._wrap_encoder_layer(enc_inputs)
    predict = self._wrap_decoder_layer(dec_inputs, enc_output)
    if self._label_smooth_eps:
        label_out = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=self._trg_vocab_size),
            epsilon=self._label_smooth_eps)
    else:
        # Without label smoothing, use the hard index labels directly.
        label_out = label
    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label_out,
        soft_label=True if self._label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num
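# --- Illustrative sketch (not from the original sources): the criteria in this
# collection build soft targets with layers.one_hot followed by
# layers.label_smooth. With the default uniform prior, label smoothing computes
# (1 - eps) * one_hot + eps / depth; a NumPy sketch with made-up numbers:
import numpy as np

def smooth_uniform(one_hot, eps):
    """Uniform label smoothing over all classes."""
    return (1.0 - eps) * one_hot + eps / one_hot.shape[-1]

one_hot = np.eye(4, dtype="float32")[[2]]   # label index 2, depth 4
print(smooth_uniform(one_hot, 0.1))         # [[0.025 0.025 0.925 0.025]]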
def forward(self, inputs, is_infer=False):
    """ Run model main forward. """
    outputs = {}
    if is_infer:
        self.generation_caches = [{
            "k": layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, self.d_key * self.n_head],
                dtype=self.dtype,
                value=0),
            "v": layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, self.d_value * self.n_head],
                dtype=self.dtype,
                value=0),
        } for i in range(self.n_layer)]
    else:
        self.generation_caches = None

    latent_embeddings = layers.create_parameter(
        shape=[self.emb_size, self.latent_type_size],
        dtype=self.dtype,
        attr=fluid.ParamAttr(
            name=self.latent_emb_name, initializer=self.param_initializer))

    if is_infer:
        latent_id = inputs["latent_id"]
        weights = layers.one_hot(latent_id, self.latent_type_size)
    else:
        logits, recognition_checkpoints = self._recognition_network(
            token_ids=inputs["token_ids"],
            type_ids=inputs["type_ids"],
            pos_ids=inputs["pos_ids"],
            role_ids=inputs.get("role_ids", None),
            recognition_mask=inputs["recognition_mask"], )
        outputs["post_probs"] = layers.softmax(logits)
        weights = self._gumbel_softmax(logits)
        outputs["checkpoints"] = recognition_checkpoints

    latent_emb = layers.matmul(
        x=weights, y=latent_embeddings, transpose_y=True)
    outputs["enc_out"], generation_checkpoints = self._generation_network(
        token_ids=inputs["token_ids"],
        type_ids=inputs["type_ids"],
        pos_ids=inputs["pos_ids"],
        role_ids=inputs.get("role_ids", None),
        generation_mask=inputs["generation_mask"],
        aux_emb=layers.unsqueeze(latent_emb, axes=[1]),
        gather_idx=inputs.get("parent_idx", None), )

    if not is_infer:
        outputs["checkpoints"].extend(generation_checkpoints)
    return outputs
def std_gen_interpolate(batch_size=8, seed=None, out_path='data/out',
                        levels=None, interpolate_mode=0):
    default_levels = "y;z0;z11;z12;z21;z22;z31;z32;z41;z42;z51;z52;z61;z62"
    if levels is None:
        levels = default_levels
    default_levels = default_levels.split(';')
    img_save_dir = os.path.join('/tmp', out_path + '.dir')
    os.system(f'rm -rf {img_save_dir}')
    os.system(f'mkdir {img_save_dir} -p')
    with dg.no_grad():
        model_cache.train_mode = False
        model_cache.initialized = False
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G
        x_np = rds.rng.randn(batch_size, 140).astype('float32')
        y_np = rds.rng.randint(0, 1000, size=[batch_size]).astype('int64')
        x = dg.to_variable(x_np)
        y_cls = dg.to_variable(y_np)
        y_hot = layers.one_hot(layers.unsqueeze(y_cls, [1]), depth=1000)
        y_embed = G.embed_y(y_hot)
        x = layers.concat([x, x[:1]], 0)
        y_embed = layers.concat([y_embed, y_embed[:1]], 0)
        levels = levels.split(';')
        # Hold the per-level latent slices in explicit dicts instead of writing
        # into locals(), which only works by CPython implementation detail.
        full = {}
        cur = {}
        for level in default_levels:
            if len(level) == 1:
                full[level] = y_embed
                cur[level] = y_embed[:1]
            if len(level) >= 2:
                idx = int(level[1]) * 20
                full[level] = x[:, idx:idx + 20]
                cur[level] = x[:1, idx:idx + 20]
        imgs = []
        for i in range(batch_size):
            for j in range(40):
                alpha = j / 40
                if interpolate_mode == 1:
                    # smoothstep easing
                    alpha = alpha**2 * (3 - 2 * alpha)
                for level in levels:
                    cur[level] = (1 - alpha) * full[level][i:i + 1] \
                        + alpha * full[level][i + 1:i + 2]
                inputs = [cur[level] for level in default_levels[1:]]
                img_pd = G(inputs, cur[default_levels[0]], True)
                img = np.uint8(img_pd.numpy().clip(0, 1) * 255)[0].transpose(
                    [1, 2, 0])
                imgs.append(Image.fromarray(img))
                stdout.write(f'{i*40+j+1}/{40*batch_size}\r')
                stdout.flush()
        print('')
        for i, img in enumerate(imgs):
            img.save(os.path.join(img_save_dir, str(i).zfill(5) + '.png'))
        imgs[0].save(out_path + '.gif', save_all=True, append_images=imgs[1:],
                     duration=40, loop=0)
        out_path = out_path + '.mp4'
        os.system(f'ffmpeg -r 40 -i {img_save_dir}/%05d.png -hide_banner '
                  f'-loglevel warning -nostats -c:v libx264 -crf 23 -y {out_path}')
        os.system(f'rm -rf {img_save_dir}')
def _collect_metrics(self, inputs, outputs):
    """ Calculate loss function by using inputs and outputs. """
    metrics = {}

    tgt_len = layers.reduce_sum(
        layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
    tgt_len.stop_gradient = True

    label = inputs["tgt_token"][:, 1:]
    if self.label_smooth > 0:
        one_hot_label = layers.one_hot(label, self.num_token_embeddings)
        smooth_label = layers.label_smooth(one_hot_label,
                                           epsilon=self.label_smooth,
                                           dtype=self._dtype)
        # `_forward` stores the decoder distribution under "dec_probs".
        nll = layers.cross_entropy(outputs["dec_probs"], smooth_label,
                                   soft_label=True,
                                   ignore_index=self.padding_idx)
    else:
        nll = layers.cross_entropy(outputs["dec_probs"], label,
                                   ignore_index=self.padding_idx)
    nll = layers.reduce_sum(nll, dim=1)
    token_nll = layers.reduce_sum(nll) / tgt_len
    nll = layers.reduce_mean(nll)
    metrics["nll"] = nll
    metrics["token_nll"] = token_nll
    loss = nll

    if self.num_latent > 0 and self.with_bow:
        bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
        bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
        if self.label_smooth > 0:
            bow = layers.cross_entropy(bow_probs, smooth_label,
                                       soft_label=True,
                                       ignore_index=self.padding_idx)
        else:
            bow = layers.cross_entropy(bow_probs, label,
                                       ignore_index=self.padding_idx)
        bow = layers.reduce_sum(bow, dim=1)
        token_bow = layers.reduce_sum(bow) / tgt_len
        bow = layers.reduce_mean(bow)
        metrics["bow"] = bow
        metrics["token_bow"] = token_bow
        loss = loss + bow

    if self.num_latent > 0 and self.use_discriminator:
        dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                     layers.log(1.0 - outputs["neg_probs"]))
        dis = layers.reduce_mean(dis)
        metrics["dis"] = dis
        loss = loss + dis * self.dis_ratio

    metrics["loss"] = loss
    metrics["token_num"] = tgt_len
    return metrics
def test_label_smooth(self):
    program = Program()
    with program_guard(program):
        label = layers.data(name="label", shape=[1], dtype="float32")
        one_hot_label = layers.one_hot(input=label, depth=10)
        smooth_label = layers.label_smooth(
            label=one_hot_label, epsilon=0.1, dtype="float32")
        self.assertIsNotNone(smooth_label)
    print(str(program))
def __call__(self, predict, label, weights):
    if self.label_smooth_eps:
        label_out = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=predict.shape[-1]),
            epsilon=self.label_smooth_eps)
    else:
        # Without label smoothing, use the hard index labels directly.
        label_out = label
    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label_out,
        soft_label=True if self.label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, token_num
def forward(self, outputs, labels):
    predict, (label, weights) = outputs[0], labels
    if self.label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=predict.shape[-1]),
            epsilon=self.label_smooth_eps)
    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if self.label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return avg_cost
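# --- Illustrative sketch (not from the original sources): the `weights` tensor
# above zeroes out padding positions, so sum_cost / token_num is the average
# loss per real token. A NumPy sketch with made-up values:
import numpy as np

cost = np.array([2.0, 1.0, 0.5, 3.0], dtype="float32")     # per-position loss
weights = np.array([1.0, 1.0, 1.0, 0.0], dtype="float32")  # 0.0 marks padding

sum_cost = float((cost * weights).sum())   # 3.5, the padded position is cancelled
token_num = float(weights.sum())           # 3.0 real tokens
avg_cost = sum_cost / token_num            # ~1.167
print(sum_cost, token_num, avg_cost)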
def build_model(self, enc_input, dec_input, tgt_label, label_weights):
    """Build the model with source encoding and target decoding"""
    enc_word_output, enc_sen_output = self.encode(enc_input)
    dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

    predict_token_idx = layers.argmax(dec_output, axis=-1)
    correct_token_idx = layers.cast(
        layers.equal(tgt_label,
                     layers.reshape(predict_token_idx, shape=[-1, 1])),
        dtype='float32')
    weighted_correct = layers.elementwise_mul(
        x=correct_token_idx, y=label_weights, axis=0)
    sum_correct = layers.reduce_sum(weighted_correct)
    sum_correct.stop_gradient = True

    # Padding indices do not contribute to the total loss; the weights are
    # used to cancel padding positions when calculating the loss.
    if self._label_smooth_eps:
        # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removes
        # the enforcement that the last dimension of label must be 1.
        tgt_label = layers.label_smooth(
            label=layers.one_hot(
                input=tgt_label, depth=self.voc_size),
            epsilon=self._label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=dec_output,
        label=tgt_label,
        soft_label=True if self._label_smooth_eps else False)
    weighted_cost = layers.elementwise_mul(x=cost, y=label_weights, axis=0)
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(label_weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num

    graph_vars = {
        "loss": avg_cost,
        "sum_correct": sum_correct,
        "token_num": token_num,
    }
    for k, v in graph_vars.items():
        v.persistable = True
    return graph_vars
def std_gen(batch_size=8, seed=None):
    with dg.no_grad():
        model_cache.train_mode = False
        model_cache.initialized = False
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G
        x_np = rds.rng.randn(batch_size, 140).astype('float32')
        y_np = rds.rng.randint(0, 1000, size=[batch_size]).astype('int64')
        x = dg.to_variable(x_np)
        y = dg.to_variable(y_np)
        y_hot = layers.one_hot(layers.unsqueeze(y, [1]), depth=1000)
        img_pd = G(x, y_hot)
        img = np.uint8(img_pd.numpy().clip(0, 1) * 255)
        imgs = []
        for i in range(len(img)):
            imgs.append(Image.fromarray(img[i].transpose([1, 2, 0])))
        return imgs
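# --- Illustrative sketch (not from the original sources): the generator code
# feeds class ids of shape [batch] through layers.unsqueeze(y, [1]) to get
# [batch, 1] and then layers.one_hot(..., depth=1000) to get [batch, 1000].
# The same shape flow in NumPy, with depth=5 for readability:
import numpy as np

y = np.array([3, 0, 2], dtype="int64")                  # [batch]
y_unsqueezed = y[:, None]                               # [batch, 1]
y_hot = np.eye(5, dtype="float32")[y_unsqueezed[:, 0]]  # [batch, 5]
print(y_unsqueezed.shape, y_hot.shape)                  # (3, 1) (3, 5)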
def renorm_gen_interpolate(batch_size=8, seed=None, out_path='data/out.gif'):
    with dg.no_grad():
        model_cache.train_mode = True
        model_cache.initialized = True
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G
        x_np = rds.rng.randn(batch_size, 140).astype('float32')
        y_np = rds.rng.randint(0, 1000, size=[batch_size]).astype('int64')
        x = dg.to_variable(x_np)
        y = dg.to_variable(y_np)
        y_hot = layers.one_hot(layers.unsqueeze(y, [1]), depth=1000)
        y_embed = G.embed_y(y_hot)
        G(x, y_embed, True)
        model_cache.train_mode = False
        model_cache.initialized = True
        x = layers.concat([x, x[:1]], 0)
        y_embed = layers.concat([y_embed, y_embed[:1]], 0)
        imgs = []
        for i in range(batch_size):
            for j in range(40):
                alpha = j / (40 - 1)
                _x = (1 - alpha) * x[i:i + 1] + alpha * x[i + 1:i + 2]
                _y_embed = (1 - alpha) * y_embed[i:i + 1] \
                    + alpha * y_embed[i + 1:i + 2]
                img_pd = G(_x, _y_embed, True)
                img = np.uint8(img_pd.numpy().clip(0, 1) * 255)[0].transpose(
                    [1, 2, 0])
                imgs.append(Image.fromarray(img))
                stdout.write(f'{i*40+j+1}/{40*batch_size}\r')
                stdout.flush()
        print('')
        imgs[0].save(out_path, save_all=True, append_images=imgs[1:],
                     duration=40, loop=0)
        return Image.open(out_path)
def transformer(src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                is_test=False,
                model_input=None):
    """ transformer main """
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    enc_inputs = (model_input.src_word, model_input.src_pos,
                  model_input.src_slf_attn_bias)
    dec_inputs = (model_input.trg_word, model_input.trg_pos,
                  model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias)
    label = model_input.lbl_word
    weights = model_input.lbl_weight

    enc_output = wrap_encoder(src_vocab_size,
                              max_length,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              enc_inputs,
                              bos_idx=bos_idx)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        dec_inputs,
        enc_output, )

    # Padding indices do not contribute to the total loss; the weights are used
    # to cancel padding positions when calculating the loss.
    if label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    res = [sum_cost, avg_cost, predict, token_num]
    return res
def inference(self, model, inputs, outputs):
    """
    Run inference.

    Args:
        inputs(dict): Its key is input name(str) and its value is a Variable.
        model(object): A generate model. Need to implement `_generation_network`
            and `_calc_logits`.

    Returns:
        dict(str:Variable): Its key is output name(str) and its value is a Variable.
    """
    # prepare while loop
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
    min_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)

    ids = layers.array_write(
        layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
    pos_biases = layers.array_write(
        layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
    scores = layers.array_write(inputs["init_score"], step_idx)
    tgt_generation_mask = layers.array_write(
        inputs["tgt_generation_mask"], step_idx)
    parent_idx = inputs["parent_idx"]

    if self.decoding_strategy == "beam_search":
        beam_size = self.beam_size
    else:
        beam_size = 1

    eos_penalty = np.zeros(self.vocab_size, dtype="float32")
    eos_penalty[self.eos_id] = -1e9
    eos_penalty = layers.assign(eos_penalty)

    token_penalty = np.zeros(self.vocab_size, dtype="float32")
    token_penalty[self.unk_id] = -1e9
    if self.mask_id >= 0:
        token_penalty[self.mask_id] = -1e9
    token_penalty = layers.assign(token_penalty)

    # start while loop
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)

        tmp_tgt_generation_mask = layers.array_read(
            tgt_generation_mask, i=step_idx)
        dtype = tmp_tgt_generation_mask.dtype
        append_mask = layers.fill_constant_batch_size_like(
            input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
        tmp_tgt_generation_mask = layers.concat(
            [tmp_tgt_generation_mask, append_mask], axis=2)
        pre_mask = tmp_tgt_generation_mask = layers.gather(
            input=tmp_tgt_generation_mask, index=parent_idx)

        pre_sent = layers.fill_constant_batch_size_like(
            input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)

        if self.continuous_position:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0) + pos_bias
        else:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)

        if self.use_role:
            pre_role = layers.fill_constant_batch_size_like(
                input=pre_mask, value=0, shape=[-1, 1, 1],
                dtype=pre_ids.dtype)
        else:
            pre_role = None

        dec_out, _ = model._generation_network(
            token_ids=pre_ids,
            type_ids=pre_sent,
            pos_ids=pre_pos,
            role_ids=pre_role,
            generation_mask=tmp_tgt_generation_mask,
            gather_idx=parent_idx)
        logits = model._calc_logits(dec_out)

        # ignore unk and mask token
        if self.ignore_unk:
            logits = layers.elementwise_add(logits, token_penalty, axis=1)

        # min dec length
        min_len_cond = layers.less_than(x=step_idx, y=min_len)

        def min_len_penalty():
            """Plus minimum length penalty."""
            return layers.elementwise_add(logits, eos_penalty, axis=1)

        def no_penalty():
            """No penalty."""
            return logits

        logits = layers.case([(min_len_cond, min_len_penalty)],
                             default=no_penalty)

        # get probs
        probs = layers.softmax(logits / self.temperature)

        if self.decoding_strategy == "beam_search":
            topk_scores, topk_indices = layers.topk(input=probs, k=beam_size)
        else:
            if self.decoding_strategy.startswith("sampling"):
                sampling_ids = layers.sampling_id(probs, dtype="int")
            elif self.decoding_strategy.startswith("topk_sampling"):
                topk_probs, _ = layers.topk(input=probs, k=self.topk)
                ge_cond = layers.cast(
                    layers.greater_equal(
                        probs, layers.unsqueeze(topk_probs[:, -1], [1])),
                    "float32")
                old_probs = probs
                probs = probs * ge_cond / layers.reduce_sum(
                    topk_probs, dim=-1, keep_dim=True)
                sampling_ids = layers.sampling_id(probs, dtype="int")
                probs = old_probs
            else:
                raise ValueError(self.decoding_strategy)

            sampling_scores = layers.one_hot(
                layers.unsqueeze(sampling_ids, [1]), probs.shape[1])
            sampling_scores = sampling_scores * probs - (
                1 - sampling_scores) * 1e3
            topk_scores, topk_indices = layers.topk(
                input=sampling_scores, k=1)

        pre_len = layers.cast(step_idx, "float32")
        layers.increment(x=step_idx, value=1.0, in_place=True)
        cur_len = layers.cast(step_idx, "float32")

        # update scores
        if self.length_average:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_len,
                axis=0) / cur_len
        elif self.length_penalty > 0:
            pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
            cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_lp,
                axis=0) / cur_lp
        else:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores, axis=0)

        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)

        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)

        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=self.eos_id)

    predictions = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "token_ids": inputs["token_ids"],
        "data_id": inputs["data_id"]
    }
    return predictions
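# --- Illustrative sketch (not from the original sources): the topk_sampling
# branch keeps only probabilities that are at least as large as the k-th
# largest one and renormalizes by the kept top-k mass (ties at the threshold
# are all kept, mirroring the greater_equal mask above). In NumPy:
import numpy as np

def topk_filter(probs, k):
    """Zero out entries below the k-th largest probability, then renormalize."""
    topk_probs = np.sort(probs, axis=-1)[:, ::-1][:, :k]         # [batch, k]
    threshold = topk_probs[:, -1:]                               # k-th largest per row
    keep = (probs >= threshold).astype(probs.dtype)              # like ge_cond
    return probs * keep / topk_probs.sum(axis=-1, keepdims=True)

probs = np.array([[0.5, 0.2, 0.2, 0.1]], dtype="float32")
print(topk_filter(probs, k=2))   # [[0.714... 0.286... 0.286... 0.]]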
def forward_transformer(src_vocab_size,
                        trg_vocab_size,
                        max_length,
                        n_layer,
                        n_head,
                        d_key,
                        d_value,
                        d_model,
                        d_inner_hid,
                        prepostprocess_dropout,
                        attention_dropout,
                        relu_dropout,
                        preprocess_cmd,
                        postprocess_cmd,
                        weight_sharing,
                        embedding_sharing,
                        label_smooth_eps,
                        use_py_reader=False,
                        is_test=False,
                        params_type="normal",
                        all_data_inputs=None):
    """
    transformer
    """
    if embedding_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields + \
        dense_bias_input_fields

    if use_py_reader:
        all_inputs = all_data_inputs
    else:
        all_inputs = make_all_inputs(data_input_names)

    enc_inputs_len = len(encoder_data_input_fields)
    dec_inputs_len = len(decoder_data_input_fields[:-1])
    enc_inputs = all_inputs[0:enc_inputs_len]
    dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]
    real_label = all_inputs[enc_inputs_len + dec_inputs_len]
    weights = all_inputs[enc_inputs_len + dec_inputs_len + 1]
    reverse_label = all_inputs[enc_inputs_len + dec_inputs_len + 2]

    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        embedding_sharing,
        enc_inputs,
        params_type=params_type)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        embedding_sharing,
        dec_inputs,
        enc_output,
        is_train=True if not is_test else False,
        params_type=params_type)

    # Padding indices do not contribute to the total loss; the weights are used
    # to cancel padding positions when calculating the loss.
    if label_smooth_eps:
        # Manual label smoothing: spread label_smooth_eps over the non-target
        # classes instead of the whole vocabulary.
        label = layers.one_hot(input=real_label, depth=trg_vocab_size)
        label = label * (1 - label_smooth_eps) + (1 - label) * (
            label_smooth_eps / (trg_vocab_size - 1))
        label.stop_gradient = True
    else:
        label = real_label

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    sum_cost.persistable = True
    token_num = layers.reduce_sum(weights)
    token_num.persistable = True
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num

    sen_count = layers.shape(dec_inputs[0])[0]
    batch_predict = layers.reshape(
        predict, shape=[sen_count, -1, ModelHyperParams.trg_vocab_size])
    # batch_label = layers.reshape(real_label, shape=[sen_count, -1])
    batch_weights = layers.reshape(weights, shape=[sen_count, -1, 1])

    return sum_cost, avg_cost, token_num, batch_predict, cost, sum_cost, \
        real_label, batch_weights
def _init_train(self): instances = self.instances Backbone = self.Backbone bb_conf = self.bb_conf bb_name = self.bb_name dev_count = self.dev_count num_instances = len(instances) mrs = self.mrs # set first_target/main task instance main_inst = None for inst in instances: if inst.is_target: main_inst = inst inst.is_first_target = True break main_conf = main_inst.config if not os.path.exists(main_conf['save_path']): os.makedirs(main_conf['save_path']) # prepare backbone train_backbone = Backbone(bb_conf, phase='train') pred_backbone = Backbone(bb_conf, phase='pred') # create reader, task # then check i/o across reader, backbone and task_layer task_attrs = [] pred_task_attrs = [] for inst in instances: train_reader = inst.Reader(inst.config, phase='train') inst.reader['train'] = train_reader train_parad = inst.Paradigm(inst.config, phase='train', backbone_config=bb_conf) inst.task_layer['train'] = train_parad task_attr_from_reader = _encode_inputs( train_parad.inputs_attrs['reader'], inst.name) task_attrs.append(task_attr_from_reader) _check_io(train_backbone.inputs_attr, train_reader.outputs_attr, in_name=bb_name + '_backbone', out_name='reader.train') _check_io(train_parad.inputs_attrs['reader'], train_reader.outputs_attr, in_name='task_paradigm.train.reader', out_name='reader.train') _check_io(train_parad.inputs_attrs['backbone'], train_backbone.outputs_attr, in_name='task_paradigm.train.backbone', out_name=bb_name + '_backbone') if inst.is_target: if 'pred_file' not in inst.config: inst.config['pred_file'] = '' pred_reader = inst.Reader(inst.config, phase='pred') pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=bb_conf) # inst.reader['pred'] = pred_reader # 这里创建的reader是个假reader,只是为了读取output_attr而已,所以不做保存 inst.task_layer['pred'] = pred_parad # 框架有巨坑,先这样写吧 task_attr_from_reader = _encode_inputs( pred_parad.inputs_attrs['reader'], inst.name) pred_task_attrs.append(task_attr_from_reader) # task_attr = pred_parad.inputs_attrs['reader'] _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name + '_backbone', out_name='reader.pred') _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred') _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name + '_backbone') # merge reader input attrs from backbone and task_instances joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs( train_backbone.inputs_attr, task_attrs) pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs( pred_backbone.inputs_attr, pred_task_attrs, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False) # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN] if DEBUG: print('----- for debug -----') print('joint input names:') print(joint_input_names) print('joint input shape and dtypes:') print(joint_shape_and_dtypes) # load data for inst in instances: print(inst.name + ": preparing data...") inst.reader['train'].load_data() # merge dataset iterators and create net input vars iterators = [] prefixes = [] mrs = [] for inst in instances: iterators.append(inst.reader['train'].iterator()) prefixes.append(inst.name) mrs.append(inst.mix_ratio) joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE) input_attrs = [[ i, j, k ] for i, (j, k) in 
zip(joint_input_names, joint_shape_and_dtypes)] pred_input_attrs = [[i, j, k] for i, ( j, k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)] net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3) # build backbone and task layers # 不指定scope名字会挂,框架有坑 train_prog = fluid.default_main_program() train_init_prog = fluid.default_startup_program() # 别用unique_name.guard了,没用的,无法作用到param_attr里的name上 # with fluid.unique_name.guard("backbone-"): bb_output_vars = train_backbone.build(net_inputs, scope_name='__paddlepalm_') assert sorted(bb_output_vars.keys()) == sorted( train_backbone.outputs_attr.keys()) # for block in train_init_prog.blocks: # for var in block.vars: # print(var) # 会挂 # 这里是否有必要新建一个program?是的,被坑死了 pred_prog = fluid.Program() pred_init_prog = fluid.Program() with fluid.program_guard(main_program=pred_prog, startup_program=pred_init_prog): # with fluid.unique_name.guard(): pred_net_inputs = create_net_inputs(pred_input_attrs) # 别用unique_name.guard了,没用的,无法作用到param_attr里的name上 # with fluid.unique_name.guard("backbone-"): pred_bb_output_vars = pred_backbone.build( pred_net_inputs, scope_name='__paddlepalm_') fluid.framework.switch_main_program(train_prog) fluid.framework.switch_startup_program(train_init_prog) # pred_backbone = train_backbone # pred_bb_output_vars = bb_output_vars task_output_vars = {} for inst in instances: task_inputs = {'backbone': bb_output_vars} task_inputs_from_reader = _decode_inputs(net_inputs, inst.name) task_inputs['reader'] = task_inputs_from_reader scope = inst.task_reuse_scope + '/' with fluid.unique_name.guard(scope): output_vars = inst.build_task_layer(task_inputs, phase='train', scope=scope) output_vars = { inst.name + '/' + key: val for key, val in output_vars.items() } old = len(task_output_vars) # for debug task_output_vars.update(output_vars) assert len(task_output_vars) - old == len( output_vars) # for debug # # prepare predict vars for saving inference model if inst.is_target: # task_attr = inst.task_layer['pred'].inputs_attrs['reader'] # _input_names, _shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, task_attr, insert_taskid=False) # pred_input_attrs = [[i, j, k] for i, (j,k) in zip(_input_names, _shape_and_dtypes)] with fluid.program_guard(pred_prog, pred_init_prog): # pred_net_inputs = create_net_inputs(pred_input_attrs) # 这里同时建立了pred阶段的backbone计算图,不知道是否会造成额外的显存开销(paddle不会计算运行路径) cur_inputs = _decode_inputs(pred_net_inputs, inst.name) inst.pred_input = cur_inputs pred_task_inputs = { 'backbone': pred_bb_output_vars, 'reader': cur_inputs } scope = inst.task_reuse_scope + '/' # 注意,这里不加上fluid.unique_name.guard会挂 with fluid.unique_name.guard(scope): inst.build_task_layer(pred_task_inputs, phase='pred', scope=scope) bb_fetches = {k: v.name for k, v in bb_output_vars.items()} task_fetches = {k: v.name for k, v in task_output_vars.items()} # fetches = bb_fetches.copy() # 注意!框架在多卡时无法fetch变长维度的tensor,这里加入bb的out后会挂 # fetches.update(task_fetches) fetches = task_fetches fetches['__task_id'] = net_inputs['__task_id'].name # compute loss task_id_var = net_inputs['__task_id'] task_id_vec = layers.one_hot(task_id_var, num_instances) losses = fluid.layers.concat( [task_output_vars[inst.name + '/loss'] for inst in instances], axis=0) loss = layers.reduce_sum(task_id_vec * losses) main_reader = main_inst.reader['train'] num_examples = main_reader.num_examples for inst in instances: max_train_steps = int( main_conf['num_epochs'] * inst.mix_ratio * (num_examples // 
main_conf['batch_size'] // dev_count)) if inst.is_target: print('{}: expected train steps {}.'.format( inst.name, max_train_steps)) inst.steps_pur_epoch = inst.reader[ 'train'].num_examples // main_conf['batch_size'] // dev_count inst.expected_train_steps = max_train_steps global_max_train_steps = int( main_conf['num_epochs'] * sum(mrs) * (num_examples // main_conf['batch_size'] // dev_count)) print( 'Estimated overall train steps {}.'.format(global_max_train_steps)) if 'warmup_proportion' in main_conf and main_conf[ 'warmup_proportion'] > 0: warmup_steps = int(global_max_train_steps * main_conf['warmup_proportion']) print('Warmup steps: ' + str(warmup_steps)) else: warmup_steps = 0 # steps_pur_epoch = num_examples // main_conf['batch_size'] // dev_count # build optimizer # 其实也完全可以支持每个任务用它自己的optimizer if 'optimizer' in main_conf: optim_mod = importlib.import_module(OPTIMIZER_DIR + '.' + main_conf['optimizer']) optimize = getattr(optim_mod, OPTIMIZE_METHOD) optimize(loss, main_conf, max_train_steps, warmup_steps, fluid.default_main_program()) loss.persistable = True if main_conf.get('use_ema', False): assert 'ema_decay' in main_conf, "ema_decay should be set when use_ema is enabled." ema = fluid.optimizer.ExponentialMovingAverage( main_conf['ema_decay']) ema.update() # prepare for train self.train_backbone = train_backbone self.train_program = fluid.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=loss.name) self.saver_program = fluid.default_main_program() self.main_inst = main_inst self.fetches = fetches self.has_init_train = True self.has_init_pred = True # self.max_train_steps = max_train_steps # self.steps_pur_epoch = steps_pur_epoch self.exe.run(fluid.default_startup_program()) print("\nRandomly initialize parameters...\n")
def transformer(src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                use_py_reader=False,
                is_test=False):
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields

    if use_py_reader:
        all_inputs, reader = make_all_py_reader_inputs(data_input_names,
                                                       is_test)
    else:
        all_inputs = make_all_inputs(data_input_names)
    # print("all inputs", all_inputs)

    enc_inputs_len = len(encoder_data_input_fields)
    dec_inputs_len = len(decoder_data_input_fields[:-1])
    enc_inputs = all_inputs[0:enc_inputs_len]
    dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]
    label = all_inputs[-2]
    weights = all_inputs[-1]

    enc_output = wrap_encoder(src_vocab_size, 64, n_layer, n_head, d_key,
                              d_value, d_model, d_inner_hid,
                              prepostprocess_dropout, attention_dropout,
                              relu_dropout, preprocess_cmd, postprocess_cmd,
                              weight_sharing, enc_inputs)

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        dec_inputs,
        enc_output, )

    # Padding indices do not contribute to the total loss; the weights are used
    # to cancel padding positions when calculating the loss.
    if label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num, reader if use_py_reader else None
def _build_decoder(self,
                   z_mean=None,
                   z_log_var=None,
                   enc_output=None,
                   mode='train',
                   beam_size=10):
    dec_input = layers.dropout(self.tar_emb,
                               dropout_prob=self.dec_dropout_in,
                               dropout_implementation="upscale_in_train")

    # `output_layer` will be used within BeamSearchDecoder
    output_layer = lambda x: layers.fc(x,
                                       size=self.tar_vocab_size,
                                       num_flatten_dims=len(x.shape) - 1,
                                       name="output_w")

    # `sample_output_layer` samples an id from the logits distribution instead
    # of argmax(logits); it will be used within BeamSearchDecoder
    sample_output_layer = lambda x: layers.unsqueeze(
        layers.one_hot(
            layers.unsqueeze(
                layers.sampling_id(
                    layers.softmax(layers.squeeze(output_layer(x), [1])),
                    dtype='int'), [1]),
            depth=self.tar_vocab_size), [1])

    if mode == 'train':
        latent_z = self._sampling(z_mean, z_log_var)
    else:
        latent_z = layers.gaussian_random_batch_size_like(
            self.tar, shape=[-1, self.latent_size])

    dec_first_hidden_cell = layers.fc(latent_z,
                                      2 * self.hidden_size * self.num_layers,
                                      name='fc_hc')
    dec_first_hidden, dec_first_cell = layers.split(dec_first_hidden_cell, 2)
    if self.num_layers > 1:
        dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
        dec_first_cell = layers.split(dec_first_cell, self.num_layers)
    else:
        dec_first_hidden = [dec_first_hidden]
        dec_first_cell = [dec_first_cell]
    dec_initial_states = [
        [h, c] for h, c in zip(dec_first_hidden, dec_first_cell)
    ]

    dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                           self.param_attr_initializer,
                           self.param_attr_scale, self.dec_dropout_out)

    if mode == 'train':
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=dec_input,
                            initial_states=dec_initial_states,
                            sequence_length=self.tar_sequence_length)
        dec_output = output_layer(dec_output)
        return dec_output
    elif mode == 'greedy':
        start_token = 1
        end_token = 2
        max_length = 100
        beam_search_decoder = BeamSearchDecoder(
            dec_cell,
            start_token,
            end_token,
            beam_size=1,
            embedding_fn=self.tar_embeder,
            output_fn=output_layer)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=dec_initial_states,
                                    max_step_num=max_length)
        return outputs
    elif mode == 'sampling':
        start_token = 1
        end_token = 2
        max_length = 100
        beam_search_decoder = BeamSearchDecoder(
            dec_cell,
            start_token,
            end_token,
            beam_size=1,
            embedding_fn=self.tar_embeder,
            output_fn=sample_output_layer)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=dec_initial_states,
                                    max_step_num=max_length)
        return outputs
    else:
        print("mode not supported", mode)
def model(): """model""" user_phone_brand_id = layers.data(name='user_phone_brand', shape=[1], dtype='int64') user_gender_id = layers.data(name='user_gender', shape=[1], dtype='int64') user_age_id = layers.data(name='user_age', shape=[1], dtype='int64') user_status_id = layers.data(name='user_status', shape=[1], dtype="int64") user_trade_id = fluid.layers.data(name='user_trade', shape=[1], dtype='int64') user_cater_id = fluid.layers.data(name='user_cater', shape=[1], dtype='int64') user_income_id = fluid.layers.data(name='user_income', shape=[1], dtype='int64') user_city_id = fluid.layers.data(name='user_city', shape=[1], dtype='int64') user_click_id = fluid.layers.data(name='user_click', shape=[1], dtype='int64') user_b_click_id = fluid.layers.data(name='user_b_click', shape=[1], dtype='int64') user_c_click_id = fluid.layers.data(name='user_c_click', shape=[1], dtype='int64') user_d_click_id = fluid.layers.data(name='user_d_click', shape=[1], dtype='int64') week_id = layers.data(name='week', shape=[1], dtype="int64") hour_id = layers.data(name='hour', shape=[1], dtype='int64') content_b_c_d_id = layers.data(name='content_b_c_d', shape=[1], dtype='int64') content_tags_id = layers.data(name='content_tags', shape=[1], dtype='int64', lod_level=1) content_subtags_id = layers.data(name='content_subtags', shape=[1], dtype='int64', lod_level=1) user_content_tag_click_id = layers.data(name='user_content_tag_click', shape=[1], dtype='int64') user_content_subtag_click_id = layers.data(name='user_content_subtag_click', shape=[1], dtype='int64') content_pctr_discrete_id = layers.data(name='content_pctr_discrete', shape=[1], dtype='int64') # dnn_score_discrete_id = layers.data(name='dnn_score_discrete', shape=[1], dtype='int64') content_pctr = layers.data(name='content_pctr', shape=[1], dtype='float32') # dnn_score = layers.data(name='dnn_score', shape=[1], dtype='float32') # content_emb = layers.data(name='content_emb', shape=[64], dtype='float32') # user_emb = layers.data(name='user_emb', shape=[64], dtype='float32') user_click_tags_id = layers.data( name='user_click_tags_id', shape=[1], dtype='int64', lod_level=1) user_click_subtags_id = layers.data( name='user_click_subtags_id', shape=[1], dtype='int64', lod_level=1) candidate_title_word = layers.data(name='candidate_title', shape=[1], dtype='int64', lod_level=1) candidate_subtitle_word = layers.data(name='candidate_subtitle', shape=[1], dtype='int64', lod_level=1) candidate_title_len_id = layers.data(name='candidate_title_len', shape=[1], dtype='int64') candidate_subtitle_len_id = layers.data(name='candidate_subtitle_len', shape=[1], dtype='int64') click_title_list = layers.data(name='click_title_list', shape=[1], dtype='int64', lod_level=2) click_subtitle_list = layers.data(name='click_subtitle_list', shape=[1], dtype='int64', lod_level=2) click_title_len_list = layers.data(name='click_title_len_list', shape=[1], dtype='int64', lod_level=1) click_subtitle_len_list = layers.data(name='click_subtitle_len_list', shape=[1], dtype='int64', lod_level=1) label = layers.data(name='label', shape=[1], dtype='int64') # dnn_score_discrete_id.name, dnn_score.name, content_emb.name,user_emb.name, load_list = [user_phone_brand_id, user_gender_id, user_age_id, user_status_id, user_trade_id, user_cater_id, user_income_id, user_city_id, user_click_id, user_b_click_id, user_c_click_id, user_d_click_id, week_id, hour_id, content_b_c_d_id, content_tags_id, content_subtags_id, user_content_tag_click_id, user_content_subtag_click_id, content_pctr_discrete_id, 
content_pctr, user_click_tags_id, user_click_subtags_id, candidate_title_word, candidate_subtitle_word, candidate_title_len_id, candidate_subtitle_len_id, click_title_list, click_subtitle_list, click_title_len_list, click_subtitle_len_list, label] feed_order = [x.name for x in load_list] user_phone_brand_emb = layers.embedding( input=user_phone_brand_id, dtype='float32', size=[7, EMB_LEN], param_attr='user_phone_brand_emb', is_sparse=True) user_gender_emb = layers.embedding( input=user_gender_id, dtype='float32', size=[3, EMB_LEN], param_attr='user_gender_emb', is_sparse=True) user_age_emb = layers.embedding( input=user_age_id, dtype='float32', size=[8, EMB_LEN], param_attr='user_age_emb', is_sparse=True) user_status_emb = layers.embedding( input=user_status_id, dtype='float32', size=[3, EMB_LEN], is_sparse=True, param_attr='user_status_emb') user_trade_emb = layers.embedding( input=user_trade_id, dtype='float32', size=[24, EMB_LEN], is_sparse=True, param_attr='user_trade_emb') user_cater_emb = layers.embedding( input=user_cater_id, dtype='float32', size=[4, EMB_LEN], is_sparse=True, param_attr='user_cater_emb') user_income_emb = layers.embedding( input=user_income_id, dtype='float32', size=[6, EMB_LEN], is_sparse=True, param_attr='user_income_emb') user_city_emb = layers.embedding( input=user_city_id, dtype='float32', size=[4000, EMB_LEN], is_sparse=True, param_attr='user_city_emb') user_click_emb = layers.embedding( input=user_click_id, dtype='float32', size=[6, EMB_LEN], is_sparse=True, param_attr='user_click_emb') user_b_click_emb = layers.embedding( input=user_b_click_id, dtype='float32', size=[6, EMB_LEN], is_sparse=True, param_attr='user_b_click_emb') user_c_click_emb = layers.embedding( input=user_c_click_id, dtype='float32', size=[6, EMB_LEN], is_sparse=True, param_attr='user_c_click_emb') user_d_click_emb = layers.embedding( input=user_d_click_id, dtype='float32', size=[6, EMB_LEN], is_sparse=True, param_attr='user_d_click_emb') week_emb = layers.embedding( input=week_id, dtype='float32', size=[8, EMB_LEN], is_sparse=True, param_attr='week_emb') hour_emb = layers.embedding( input=hour_id, dtype='float32', size=[24, EMB_LEN], is_sparse=True, param_attr='hour_emb') content_b_c_d_emb = layers.embedding( input=content_b_c_d_id, dtype='float32', size=[3, EMB_LEN], is_sparse=True, param_attr='content_b_c_d_emb') content_tags_emb = layers.embedding( input=content_tags_id, size=[11, EMB_LEN], dtype='float32', is_sparse=True, param_attr=fluid.ParamAttr( name="content_tags_emb", learning_rate=0.5, regularizer=fluid.regularizer.L2Decay(1.0)) ) content_tags_emb_avg = fluid.layers.sequence_pool(input=content_tags_emb, pool_type='average') content_subtags_emb = layers.embedding( input=content_subtags_id, size=[65, EMB_LEN], dtype='float32', is_sparse=True, param_attr=fluid.ParamAttr( name="content_subtags_emb", learning_rate=0.5, regularizer=fluid.regularizer.L2Decay(1.0)) ) content_subtags_emb_avg = fluid.layers.sequence_pool( input=content_subtags_emb, pool_type='average') user_content_tag_click_emb = layers.embedding( input=user_content_tag_click_id, dtype='float32', size=[11 * 6, EMB_LEN], is_sparse=True, param_attr='user_content_tag_click_emb') user_content_subtag_click_emb = layers.embedding( input=user_content_subtag_click_id, dtype='float32', size=[65 * 6, EMB_LEN], is_sparse=True, param_attr='user_content_subtag_click_emb') content_pctr_discrete_emb = layers.embedding( input=content_pctr_discrete_id, dtype='float32', size=[55, EMB_LEN], is_sparse=True, 
param_attr='content_pctr_discrete_emb') # dnn_score_discrete_emb = layers.embedding( # input=dnn_score_discrete_id, dtype='float32', # size=[21, EMB_LEN], is_sparse=True, param_attr='dnn_score_discrete_emb') user_click_tags_id_emb = layers.embedding( input=user_click_tags_id, size=[11 * 6, EMB_LEN], dtype='float32', is_sparse=True, param_attr="user_content_tag_click_emb") user_click_tags_id_emb_avg = fluid.layers.sequence_pool( input=user_click_tags_id_emb, pool_type='average') user_click_subtags_id_emb = layers.embedding( input=user_click_subtags_id, size=[65 * 6, EMB_LEN], dtype='float32', is_sparse=True, param_attr="user_content_subtag_click_emb") user_click_subtags_id_emb_avg = fluid.layers.sequence_pool( input=user_click_subtags_id_emb, pool_type='average') # 候选内容feature生成 cand_title_emb = layers.embedding(input=candidate_title_word, size=[19962, EMB_LEN], dtype='float32', is_sparse=False, param_attr='word_embedding') cand_title_conv_pool = nets.sequence_conv_pool( input=cand_title_emb, num_filters=NUM_FILTERS, filter_size=3, act="relu", pool_type="average", param_attr='title_emb_conv', bias_attr='title_emb_conv_b') cand_subtitle_emb = layers.embedding(input=candidate_subtitle_word, size=[19962, EMB_LEN], dtype='float32', is_sparse=False, param_attr='word_embedding') cand_subtitle_conv_pool = nets.sequence_conv_pool( input=cand_subtitle_emb, num_filters=NUM_FILTERS, filter_size=3, act="relu", pool_type="average", param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b') cand_title_len_emb = layers.embedding(input=candidate_title_len_id, size=[100, EMB_LEN], dtype='float32', is_sparse=True, param_attr='title_len_emb') cand_subtitle_len_emb = layers.embedding(input=candidate_subtitle_len_id, size=[100, EMB_LEN], dtype='float32', is_sparse=True, param_attr='subtitle_len_emb') cand_title_inf = layers.concat( input=[cand_title_conv_pool, cand_subtitle_conv_pool, cand_title_len_emb, cand_subtitle_len_emb], axis=-1) cand_title_feature = layers.fc( input=cand_title_inf, size=32, act="relu", param_attr='title_feature_list') #共享参数 # 用户历史点击内容feature生成 click_title_emb = layers.embedding(input=click_title_list, size=[19962, EMB_LEN], dtype='float32', is_sparse=False, param_attr='word_embedding') click_title_drnn = fluid.layers.DynamicRNN() with click_title_drnn.block(): title_emb = click_title_drnn.step_input(click_title_emb) click_title_conv_pool = nets.sequence_conv_pool( input=title_emb, num_filters=NUM_FILTERS, filter_size=3, act="relu", pool_type="average", param_attr='title_emb_conv', bias_attr='title_emb_conv_b') click_title_drnn.output(click_title_conv_pool) click_title_conv_pool_list = click_title_drnn() click_subtitle_emb = layers.embedding(input=click_subtitle_list, size=[19962, EMB_LEN], dtype='float32', is_sparse=False, param_attr='word_embedding') click_subtitle_drnn = fluid.layers.DynamicRNN() with click_subtitle_drnn.block(): subtitle_emb = click_subtitle_drnn.step_input(click_subtitle_emb) click_subtitle_conv_pool = nets.sequence_conv_pool( input=subtitle_emb, num_filters=NUM_FILTERS, filter_size=3, act="relu", pool_type="average", param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b') click_subtitle_drnn.output(click_subtitle_conv_pool) click_subtitle_conv_pool_list = click_subtitle_drnn() click_title_len_emb_list = layers.embedding(input=click_title_len_list, size=[100, EMB_LEN], dtype='float32', is_sparse=True, param_attr='title_len_emb') click_subtitle_len_emb_list = layers.embedding(input=click_subtitle_len_list, size=[100, EMB_LEN], dtype='float32', 
is_sparse=True, param_attr='subtitle_len_emb') click_title_inf_list = layers.concat( input=[click_title_conv_pool_list, click_subtitle_conv_pool_list, click_title_len_emb_list, click_subtitle_len_emb_list], axis=-1) click_title_feature_list = layers.fc( input=click_title_inf_list, size=32, act="relu", param_attr='title_feature_list') #共享参数 user_click_title_feature = layers.sequence_pool(input=click_title_feature_list, pool_type="average") user_emb_feature = layers.concat( input=[user_phone_brand_emb, user_gender_emb, user_age_emb, user_status_emb, user_trade_emb, user_cater_emb, user_income_emb, user_city_emb, user_click_emb, user_b_click_emb, user_c_click_emb, user_d_click_emb], axis=1) content_emb_feature = layers.concat( input=[content_b_c_d_emb, content_tags_emb_avg, content_subtags_emb_avg, content_pctr_discrete_emb, cand_title_feature], axis=1) cross_emb_feature = layers.concat( input=[user_content_tag_click_emb, user_content_subtag_click_emb, user_click_tags_id_emb_avg, user_click_subtags_id_emb_avg, user_click_title_feature], axis=1) env_emb_feature = layers.concat( input=[week_emb, hour_emb], axis=1) combined_features = layers.concat(input=[ user_emb_feature, content_emb_feature, cross_emb_feature, env_emb_feature], axis=1) fc1 = layers.fc(input=combined_features, size=200, act='relu', param_attr='fc1', bias_attr='fc1_b') fc2 = layers.fc(input=fc1, size=200, act="relu", param_attr='fc2', bias_attr='fc2_b') fc3 = layers.fc(input=fc2, size=200, act="relu", param_attr='fc3', bias_attr='fc3_b') content_pctr_discrete_id_one_hot = layers.one_hot( content_pctr_discrete_id, 55, allow_out_of_range=False) final_layer = layers.concat(input=[fc3, content_pctr, content_pctr_discrete_id_one_hot], axis=1) predict = layers.fc( input=final_layer, size=2, act="softmax", param_attr='final_predict', bias_attr='final_predict_b') auc = fluid.layers.auc( input=predict, label=label, num_thresholds=2 ** 12) cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.reduce_mean(cost) loader = fluid.io.DataLoader.from_generator( feed_list=load_list, capacity=256, use_double_buffer=True, iterable=True) return {'predict': predict, 'avg_cost': avg_cost, 'feed_order': feed_order, 'loader': loader, 'auc': auc}
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, emb_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden and init_cell:
            h0 = layers.squeeze(
                layers.slice(
                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
            c0 = layers.squeeze(
                layers.slice(
                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter(
        [vocab_size, emb_size], dtype="float32", name="softmax_weight")
    softmax_bias = layers.create_parameter(
        [vocab_size], dtype="float32", name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
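# --- Illustrative sketch (not from the original sources): in the full-softmax
# branch above, softmax_with_cross_entropy on a one-hot soft label equals plain
# cross-entropy against the integer label. A NumPy check with made-up logits:
import numpy as np

logits = np.array([[2.0, 0.5, -1.0]], dtype="float32")
y = 0

log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
hard_ce = -log_probs[0, y]                      # cross-entropy with the index label
one_hot = np.eye(3, dtype="float32")[[y]]
soft_ce = -(one_hot * log_probs).sum()          # soft-label form with a one-hot target
print(hard_ce, soft_ce)                         # identical up to rounding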
def _forward(self, inputs, is_training):
    """ Real forward process of model in different mode(train/test). """
    outputs = {}

    src_token = inputs["src_token"]
    src_mask = inputs["src_mask"]
    src_pos = inputs["src_pos"]
    src_type = inputs["src_type"]
    src_turn = inputs["src_turn"]

    tgt_token = inputs["tgt_token"][:, :-1]
    tgt_mask = inputs["tgt_mask"][:, :-1]
    tgt_pos = inputs["tgt_pos"][:, :-1]
    tgt_type = inputs["tgt_type"][:, :-1]
    tgt_turn = inputs["tgt_turn"][:, :-1]

    input_mask = layers.concat([src_mask, tgt_mask], axis=1)
    input_mask.stop_gradient = True
    src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
    tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn)
    embed = layers.concat([src_embed, tgt_embed], axis=1)
    embed = self.embed_layer_norm(embed)

    batch_size = src_token.shape[0]
    src_len = src_token.shape[1]
    tgt_len = tgt_token.shape[1]

    if self.num_latent > 0:
        post_embed, post_probs, post_logits = self._posteriori_network(
            input_mask, embed, batch_size, src_len, tgt_len)
        outputs["post_logits"] = post_logits

        if self.use_discriminator:
            pos_probs, neg_probs = self._discriminator_network(
                input_mask, embed, batch_size, src_len, tgt_len, post_embed)
            outputs["pos_probs"] = pos_probs
            outputs["neg_probs"] = neg_probs

        if is_training:
            z = F.gumbel_softmax(post_logits, self.tau)
        else:
            indices = layers.argmax(post_logits, axis=1)
            z = layers.one_hot(F.unsqueeze(indices, [1]), self.num_latent)
        latent_embeddings = self.latent_embeddings
        latent_embed = layers.matmul(z, latent_embeddings)
        outputs["latent_embed"] = latent_embed
    else:
        latent_embed = None

    latent_embed, dec_probs = self._generation_network(
        input_mask, embed, batch_size, src_len, tgt_len, latent_embed)
    outputs["dec_probs"] = dec_probs

    if self.num_latent > 0 and self.with_bow:
        if self.two_layer_predictor:
            latent_embed = self.pre_bow_predictor(latent_embed)
        bow_logits = self.bow_predictor(latent_embed)
        bow_probs = layers.softmax(bow_logits)
        outputs["bow_probs"] = bow_probs

    return outputs
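# --- Illustrative sketch (not from the original sources): during training the
# latent variable above is a Gumbel-softmax relaxation, while inference uses
# the hard argmax one-hot. A minimal NumPy version of the standard
# Gumbel-softmax sample (not the project's own implementation):
import numpy as np

def gumbel_softmax(logits, tau=0.67, rng=np.random):
    """Soft, differentiable stand-in for a one-hot sample from softmax(logits)."""
    gumbel = -np.log(-np.log(rng.uniform(1e-9, 1.0, size=logits.shape)))
    y = (logits + gumbel) / tau
    y = np.exp(y - y.max(axis=-1, keepdims=True))
    return y / y.sum(axis=-1, keepdims=True)

logits = np.array([[1.0, 2.0, 0.5]], dtype="float32")
print(gumbel_softmax(logits))               # close to one-hot for small tau
print(np.eye(3)[[int(np.argmax(logits))]])  # the hard one-hot used at inference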
def transformer(model_input,
                src_vocab_size,
                trg_vocab_size,
                max_length,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                prepostprocess_dropout,
                attention_dropout,
                relu_dropout,
                preprocess_cmd,
                postprocess_cmd,
                weight_sharing,
                label_smooth_eps,
                bos_idx=0,
                is_test=False):
    if weight_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be same for weight sharing."
        )

    enc_inputs = (model_input.src_word, model_input.src_pos,
                  model_input.src_slf_attn_bias)
    dec_inputs = (model_input.trg_word, model_input.trg_pos,
                  model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias)
    label = model_input.lbl_word
    weights = model_input.lbl_weight

    enc_output = wrap_encoder(
        enc_inputs,
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        bos_idx=bos_idx)

    predict = wrap_decoder(
        dec_inputs,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        weight_sharing,
        enc_output=enc_output)

    # Padding indices do not contribute to the total loss; the weights are used
    # to cancel padding positions when calculating the loss.
    if label_smooth_eps:
        # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removes
        # the enforcement that the last dimension of label must be 1.
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0)
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num
    return sum_cost, avg_cost, predict, token_num