def test_softmax(self):
    """Build a data -> fc -> softmax graph and check softmax returns a value.

    The resulting program is printed for inspection.
    """
    main_program = Program()
    with program_guard(main_program):
        feature = layers.data(name='data', shape=[10], dtype='float32')
        hidden = layers.fc(input=feature, size=20)
        probabilities = layers.softmax(hidden)
        self.assertIsNotNone(probabilities)
    print(str(main_program))
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate, is_test=False):
    """
    Scaled Dot-Product Attention

    Scales ``q`` by ``d_key ** -0.5``, multiplies it with the transpose of
    ``k``, optionally adds ``attn_bias``, applies softmax and (optionally)
    dropout, and finally multiplies the weights with ``v``.

    Args:
        q: query tensor.
        k: key tensor (multiplied with ``transpose_y=True``).
        v: value tensor.
        attn_bias: additive bias for the attention scores, or ``None`` to
            skip the addition.
        d_key: key dimension used for the scaling factor.
        dropout_rate: dropout probability; a falsy value disables dropout.
        is_test: forwarded to ``layers.dropout``.

    Returns:
        The attention output ``softmax(...) @ v``.
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    # Test against None explicitly: evaluating a tensor in boolean context is
    # not a reliable "was a bias supplied" check (a multi-element tensor may
    # refuse to be converted to bool).
    if attn_bias is not None:
        product += attn_bias
    weights = layers.softmax(product, use_cudnn=True)
    if dropout_rate:
        weights = layers.dropout(
            weights,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=is_test)
    out = layers.matmul(weights, v)
    return out
def forward(self, inputs, labels=None, logits_softmax=False):
    """Forward pass of the GRU text classifier.

    Args:
        inputs: token ids fed to ``self.embedding``.
        labels: optional class labels. A rank-1 tensor is reshaped to
            ``[-1, 1]`` before the loss is computed.
        logits_softmax: if true, the returned logits are passed through
            softmax first.

    Returns:
        ``logits`` when ``labels`` is None, otherwise ``(loss, logits)`` where
        ``loss`` is the mean softmax cross entropy.
    """
    emb = self.embedding(inputs)
    hid_fc1 = self._hid_fc1(emb)
    gru_forward_tanh = L.tanh(self._gru_forward(hid_fc1))
    if self.bi_direction:
        gru_backward_tanh = L.tanh(self._gru_backward(hid_fc1))
        encoded_vector = L.concat(
            input=[gru_forward_tanh, gru_backward_tanh], axis=2)
        encoded_vector = L.reduce_max(encoded_vector, dim=1)
    else:
        encoded_vector = L.reduce_max(gru_forward_tanh, dim=1)

    hid_fc_2 = self._hid_fc2(encoded_vector)
    raw_logits = self._output_fc(hid_fc_2)

    # Softmax is applied only to the value handed back to the caller.
    output = L.softmax(raw_logits) if logits_softmax else raw_logits

    # Without labels there is nothing to compute a loss against.
    if labels is None:
        return output

    if len(labels.shape) == 1:
        labels = L.reshape(labels, [-1, 1])

    # softmax_with_cross_entropy applies softmax itself, so it must always be
    # fed the raw logits; feeding the already-normalised output would apply
    # softmax twice when logits_softmax is set.
    loss = L.softmax_with_cross_entropy(raw_logits, labels)
    loss = L.reduce_mean(loss)
    return loss, output
def forward(self, *args, **kwargs):
    """Run the parent forward pass and optionally softmax the logits.

    Args:
        logits_softmax (optional, boolean): keyword argument consumed here
            (not forwarded to the parent); if true, the logits are returned
            after softmax.

    Returns:
        ``logits`` alone when the parent returns ``None`` as loss, otherwise
        the pair ``(loss, logits)``.
    """
    apply_softmax = kwargs.pop("logits_softmax", False)
    parent_outputs = super(ErnieModelCustomized, self).forward(*args, **kwargs)
    loss, logits = parent_outputs
    if apply_softmax:
        logits = L.softmax(logits, use_cudnn=True)
    if loss is not None:
        return loss, logits
    return logits
def compute_mog_loss(self, y, t):
    """Negative log-likelihood of the target under a mixture of Gaussians.

    Args:
        y (Variable): shape(B, T, C_output), dtype float32, the parameters of
            the output distribution: the concatenation of the mixture logits,
            the means and the log standard deviations, each of shape
            (B, T, n_mixture).
        t (Variable): shape(B, T), dtype float32, the target audio.

    Returns:
        Variable: the mean of the per-sample negative log-likelihood.
    """
    n_mixture = self.output_dim // 3

    # The first context_size steps are excluded from the loss.
    skip = self.context_size
    y = y[:, skip:, :]
    t = t[:, skip:]

    mixture_logits, mean, log_std = F.split(y, 3, dim=2)
    # 100.0 only serves as a large upper bound for the clip.
    log_std = F.clip(log_std, min=self.log_scale_min, max=100.)
    inv_std = F.exp(-log_std)
    mixture_prob = F.softmax(mixture_logits, axis=-1)

    t = F.unsqueeze(t, axes=[-1])
    if n_mixture > 1:
        t = F.expand(t, [1, 1, n_mixture])

    normalized = inv_std * (t - mean)
    exponent = F.exp(-0.5 * normalized * normalized)
    component_pdf = 1.0 / math.sqrt(2.0 * math.pi) * inv_std * exponent
    weighted_pdf = mixture_prob * component_pdf

    # Sum over the mixture axis to get the density of each sample.
    density = F.reduce_sum(weighted_pdf, dim=-1)
    per_sample_loss = -F.log(density + 1e-9)
    return F.reduce_mean(per_sample_loss)
def forward(self, inputs, labels=None, logits_softmax=False):
    """Forward pass of the TextCNN classifier.

    Args:
        inputs: token ids fed to ``self.embedding``.
        labels: optional class labels. A rank-1 tensor is reshaped to
            ``[-1, 1]`` before the loss is computed.
        logits_softmax: if true, the returned logits are passed through
            softmax first.

    Returns:
        ``logits`` when ``labels`` is None, otherwise ``(loss, logits)`` where
        ``loss`` is the mean softmax cross entropy.
    """
    emb = self.embedding(inputs)
    conv_pool_res = self.textcnn(emb)
    hid_fc = self._hid_fc(conv_pool_res)
    raw_logits = self._output_fc(hid_fc)

    # Softmax is applied only to the value handed back to the caller.
    output = L.softmax(raw_logits) if logits_softmax else raw_logits

    # Without labels there is nothing to compute a loss against.
    if labels is None:
        return output

    # Bring the labels into the [N, 1] layout used for the loss call.
    if len(labels.shape) == 1:
        labels = L.reshape(labels, [-1, 1])

    # softmax_with_cross_entropy applies softmax itself, so it must always be
    # fed the raw logits; feeding the already-normalised output would apply
    # softmax twice when logits_softmax is set.
    loss = L.softmax_with_cross_entropy(raw_logits, labels)
    loss = L.reduce_mean(loss)
    return loss, output
def _dot_product_relative(q, k, v, bias, dropout=0.1, cache=None, params_type="normal"):
    """Dot-product attention with relative position embeddings.

    Args:
        q, k, v: query, key and value tensors; ``k`` is indexed on axes 1, 2
            and 3, so it must have at least four dimensions.
        bias: additive bias for the attention logits, or ``None``.
        dropout: dropout probability applied to the attention weights.
        cache: only checked against ``None``; the result is forwarded as the
            ``cache`` flag of the embedding generator.
        params_type: ``"fixed"`` or ``"new"`` selects a dedicated parameter
            name prefix; any other value uses the default prefix.

    Returns:
        The attention output.
    """
    depth_constant = int(k.shape[3])
    heads = layers.shape(k)[1]  # not referenced below
    length = layers.shape(k)[2]
    max_relative_position = 4

    prefix_by_type = {
        "fixed": "fixed_relative_positions_",
        "new": "new_relative_positions_",
    }
    pre_name = prefix_by_type.get(params_type, "relative_positions_")
    use_cache = cache is not None

    relations_keys = generate_relative_positions_embeddings(
        length,
        depth_constant,
        max_relative_position,
        name=pre_name + "keys",
        cache=use_cache)
    relations_values = generate_relative_positions_embeddings(
        length,
        depth_constant,
        max_relative_position,
        name=pre_name + "values",
        cache=use_cache)

    logits = _relative_attention_inner(q, k, relations_keys, True)
    if bias is not None:
        logits += bias

    weights = layers.softmax(logits, name="attention_weights")
    weights = layers.dropout(weights, dropout_prob=float(dropout))
    return _relative_attention_inner(weights, v, relations_values, False)
def attention(self, hidden, encoder_output, encoder_output_proj, encoder_padding_mask):
    """Compute the attention context c_i (Bahdanau-style attention).

    Args:
        hidden: decoder state, projected by a bias-free fc layer.
        encoder_output: encoder states that are weighted and summed.
        encoder_output_proj: projected encoder states added to the projected
            decoder state.
        encoder_padding_mask: optional additive mask for the scores.

    Returns:
        The context: ``encoder_output`` weighted by the attention scores and
        summed over axis 1.
    """
    projected_hidden = layers.fc(hidden, size=self.hidden_size, bias_attr=False)
    decoder_state_proj = layers.unsqueeze(projected_hidden, [1])

    # NOTE(review): the expand factor is read from decoder_state_proj's own
    # axis 1 (the axis just inserted by unsqueeze) - confirm this is intended
    # rather than the encoder length.
    expand_times = [1, layers.shape(decoder_state_proj)[1], 1]
    expanded_state = layers.expand(decoder_state_proj, expand_times)
    mixed_state = fluid.layers.elementwise_add(encoder_output_proj, expanded_state)

    raw_scores = layers.fc(
        input=mixed_state, size=1, num_flatten_dims=2, bias_attr=False)
    attn_scores = layers.squeeze(raw_scores, [2])
    if encoder_padding_mask is not None:
        attn_scores = layers.elementwise_add(attn_scores, encoder_padding_mask)
    attn_scores = layers.softmax(attn_scores)

    weighted_output = layers.elementwise_mul(encoder_output, attn_scores, axis=0)
    return layers.reduce_sum(weighted_output, dim=1)
def forward(self, x):
    """Apply self-attention over the spatial positions and add the input back.

    Args:
        x: 4-D input unpacked as ``(b, c, h, w)``.

    Returns:
        ``self.gamma * attention + x``.
    """
    b, c, h, w = x.shape
    num_positions = h * w

    query = reshape(self.conv_query(x), (b, -1, num_positions))
    query = transpose(query, (0, 2, 1))

    key = reshape(self.conv_key(x), (b, -1, num_positions))

    value = reshape(self.conv_value(x), (b, -1, num_positions))
    value = transpose(value, (0, 2, 1))

    similarity = softmax(bmm(query, key))  # [h*w, h*w]
    similarity = transpose(similarity, (0, 2, 1))

    attended = bmm(similarity, value)  # [h*w, c]
    # NOTE(review): the [h*w, c] result is reshaped to (b, c, h, w) without a
    # transpose in between - confirm this layout is intended.
    attended = reshape(attended, (b, c, h, w))

    return self.gamma * attended + x
def forward(self, queries, keys, values, attn_bias, cache=None):
    """Multi-head attention: project, attend, merge heads, project out.

    Args:
        queries, keys, values, cache: forwarded to ``self._prepare_qkv``.
        attn_bias: additive bias for the attention scores, or ``None``.

    Returns:
        The output of ``self.proj_fc`` applied to the merged heads.
    """
    q, k, v = self._prepare_qkv(queries, keys, values, cache)

    # Scaled dot product; the scale is folded into the matmul via alpha.
    scores = layers.matmul(
        x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
    if attn_bias is not None:
        scores += attn_bias
    weights = layers.softmax(scores)
    if self.dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=self.dropout_rate, is_test=False)
    context = layers.matmul(weights, v)

    # Merge the last two axes after swapping axes 1 and 2.
    context = layers.transpose(context, perm=[0, 2, 1, 3])
    merged_width = context.shape[2] * context.shape[3]
    context = layers.reshape(x=context, shape=[0, 0, merged_width])

    return self.proj_fc(context)
def forward(self, queries, keys, values, attn_bias, past_cache):
    """Multi-head attention with an optional key/value cache.

    Args:
        queries, keys, values: rank-3 inputs (asserted).
        attn_bias: additive bias for the attention scores, or ``None``.
        past_cache: ``None`` or a ``(cached_k, cached_v)`` pair that is
            concatenated in front of the new keys/values along axis 1.

    Returns:
        ``(out, cache)`` where ``cache`` is the pair of freshly projected
        ``(k, v)`` taken before the past cache is concatenated.
    """
    assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3

    def split_heads(tensor):
        # -> [batch, head, seq, dim]
        head_dim = tensor.shape[-1] // self.n_head
        tensor = L.reshape(tensor, [0, 0, self.n_head, head_dim])
        return L.transpose(tensor, [0, 2, 1, 3])

    q = self.q(queries)
    k = self.k(keys)
    v = self.v(values)

    cache = (k, v)
    if past_cache is not None:
        cached_k, cached_v = past_cache
        k = L.concat([cached_k, k], 1)
        v = L.concat([cached_v, v], 1)

    q = split_heads(q)
    k = split_heads(k)
    v = split_heads(v)

    q = L.scale(q, scale=self.d_key ** -0.5)
    score = L.matmul(q, k, transpose_y=True)
    if attn_bias is not None:
        score += attn_bias
    score = L.softmax(score, use_cudnn=True)
    score = self.dropout(score)

    merged = L.matmul(score, v)
    merged = L.transpose(merged, [0, 2, 1, 3])
    merged = L.reshape(merged, [0, 0, merged.shape[2] * merged.shape[3]])

    return self.o(merged), cache
def forward(self, queries, keys, values, attn_bias, cache=None):
    """Multi-head attention with an optional, in-place updated cache.

    Args:
        queries: query input.
        keys: key input; ``None`` falls back to ``queries``.
        values: value input; ``None`` falls back to ``keys``.
        attn_bias: additive bias for the attention scores, or ``None``.
        cache: ``None`` or a dict with entries ``"k"`` and ``"v"``; the new
            keys/values are appended along axis 2 and written back.

    Returns:
        The output of ``self.proj_fc`` applied to the merged heads.
    """
    if keys is None:
        keys = queries
    if values is None:
        values = keys

    def split_heads(tensor, head_dim):
        tensor = layers.reshape(x=tensor, shape=[0, 0, self.n_head, head_dim])
        return layers.transpose(x=tensor, perm=[0, 2, 1, 3])

    q = self.q_fc(queries)
    k = self.k_fc(keys)
    v = self.v_fc(values)

    q = split_heads(q, self.d_key)
    k = split_heads(k, self.d_key)
    v = split_heads(v, self.d_value)

    if cache is not None:
        k = layers.concat([cache["k"], k], axis=2)
        v = layers.concat([cache["v"], v], axis=2)
        cache["k"] = k
        cache["v"] = v

    # Scaled dot product; the scale is folded into the matmul via alpha.
    scores = layers.matmul(
        x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
    if attn_bias is not None:
        scores += attn_bias
    weights = layers.softmax(scores)
    if self.dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=self.dropout_rate, is_test=False)

    merged = layers.matmul(weights, v)
    merged = layers.transpose(merged, perm=[0, 2, 1, 3])
    merged_width = merged.shape[2] * merged.shape[3]
    merged = layers.reshape(x=merged, shape=[0, 0, merged_width])
    return self.proj_fc(merged)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention

    Besides computing the attention output, the softmax weights are stored in
    ``self.model['blocks'][curr_block_id]['multi_head_attention']['softmax']``
    (``self`` and ``curr_block_id`` come from the enclosing scope).

    Args:
        q, k, v: query, key and value tensors.
        attn_bias: additive bias for the attention scores, or ``None`` to
            skip the addition.
        d_key: key dimension used for the scaling factor.
        dropout_rate: dropout probability; dropout is applied only when this
            is truthy and ``self.is_training`` is set.

    Returns:
        The attention output ``weights @ v``.
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    # Test against None explicitly: evaluating a tensor in boolean context is
    # not a reliable "was a bias supplied" check.
    if attn_bias is not None:
        product += attn_bias
    weights = layers.softmax(product)

    # memorize the weights in block (stored before dropout is applied)
    self.model['blocks'][curr_block_id]['multi_head_attention'][
        'softmax'] = weights

    if dropout_rate and self.is_training:
        weights = layers.dropout(
            weights,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)
    out = layers.matmul(weights, v)
    return out
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention

    Example shapes:
        q: (-1, 16, 80, 64)
        k: (-1, 16, 80, 64)
        v: (-1, 16, 80, 64)
        attn_bias: (-1, 16, 80, 80)

    Args:
        q, k, v: query, key and value tensors.
        attn_bias: additive bias for the attention scores, or ``None`` to
            skip the addition.
        d_key: key dimension used for the scaling factor.
        dropout_rate: dropout probability; a falsy value disables dropout.
            The dropout seed is taken from ``seed`` in the enclosing scope.

    Returns:
        The attention output, e.g. (-1, 16, 80, 64) for the shapes above.
    """
    scaled_q = layers.scale(x=q, scale=d_key ** -0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    # Test against None explicitly: evaluating a tensor in boolean context is
    # not a reliable "was a bias supplied" check.
    if attn_bias is not None:
        product += attn_bias
    weights = layers.softmax(product)
    if dropout_rate:
        weights = layers.dropout(
            weights,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            seed=seed,
            is_test=False)
    out = layers.matmul(weights, v)
    return out
def forward(self, outputs, target_sizes):
    """Convert raw model outputs into per-image score/label/box results.

    Parameters:
        outputs: raw outputs of the model; the entries ``"pred_logits"`` and
            ``"pred_boxes"`` are read.
        target_sizes: tensor of dimension [batch_size x 2] containing the size
            of each image. For evaluation, this must be the original image
            size (before any data augmentation). For visualization, this
            should be the image size after data augment, but before padding.

    Returns:
        A list with one dict per image holding the numpy arrays ``'scores'``,
        ``'labels'`` and ``'boxes'``.
    """
    out_logits = outputs["pred_logits"]
    out_bbox = outputs["pred_boxes"]

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = L.softmax(out_logits, -1)  # [bs, num_queries, num_classes + 1]
    labels = L.argmax(prob[:, :, :], axis=-1)  # [bs, num_queries]
    scores = L.reduce_max(prob, dim=-1)  # [bs, num_queries]

    # convert to [x0, y0, x1, y1] format
    bs, num_queries, _ = out_bbox.shape
    flat_boxes = L.reshape(out_bbox, (-1, 4))
    corner_boxes = box_ops.box_cxcywh_to_xyxy(flat_boxes)
    boxes = L.reshape(corner_boxes, (bs, num_queries, 4))

    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h = target_sizes[:, 0]
    img_w = target_sizes[:, 1]
    scale_fct = L.stack([img_w, img_h, img_w, img_h], 1)  # [bs, 4]
    scale_fct = L.expand(L.unsqueeze(scale_fct, [1]), (1, num_queries, 1))
    boxes = boxes * scale_fct

    results = []
    per_image = zip(scores.numpy(), labels.numpy(), boxes.numpy())
    for image_scores, image_labels, image_boxes in per_image:
        results.append({
            'scores': image_scores,
            'labels': image_labels,
            'boxes': image_boxes
        })
    return results
def forward(self, queries, keys=None, values=None, mask=None):
    """Multi-head scaled dot-product attention with an optional mask.

    Args:
        queries: query input.
        keys: key input; ``None`` falls back to ``queries``.
        values: value input; ``None`` falls back to ``keys``.
        mask: optional mask; positions where it is 0 have 1e10 subtracted
            from their score before softmax.

    Returns:
        The merged heads, reshaped to a last axis of
        ``self.d_value * self.n_head``.
    """
    if keys is None:
        keys = queries
    if values is None:
        values = keys

    def split_heads(tensor, head_dim):
        tensor = layers.reshape(tensor, shape=[0, 0, self.n_head, head_dim])
        return layers.transpose(tensor, [0, 2, 1, 3])

    # NOTE(review): keys and values are projected with q_proj as well -
    # confirm this weight sharing is intended.
    q = self.q_proj(queries)
    k = self.q_proj(keys)
    v = self.q_proj(values)

    q = split_heads(q, self.d_key)
    k = split_heads(k, self.d_key)
    v = split_heads(v, self.d_value)

    scaled_q = layers.scale(x=q, scale=self.d_key ** -0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if mask is not None:
        transposed_mask = layers.transpose(
            layers.unsqueeze(mask, 1), [0, 1, 3, 2])
        product -= (1 - transposed_mask) * 1e10

    weights = layers.softmax(product)
    if self.dropout_rate:
        weights = layers.dropout(
            weights,
            dropout_prob=self.dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=not self.training)

    context = layers.matmul(weights, v)
    context = layers.transpose(context, [0, 2, 1, 3])
    return layers.reshape(context, [0, 0, self.d_value * self.n_head])
def knowledge_seq2seq(config):
    """Build the knowledge seq2seq network for training or beam-search decoding.

    Args:
        config: configuration object. The attributes read here are
            embed_size, hidden_size, num_layers, bidirectional, batch_size,
            vocab_size, run_type, use_posterior, and depending on the branch
            use_bow, max_len, dropout, beam_size, bos_id, max_dec_len,
            length_average, unk_id and eos_id.

    Returns:
        ``[bow_loss, kl_loss, nll_loss, final_loss]`` when
        ``config.run_type == "train"``;
        ``(final_score, final_ids, final_index)`` when
        ``config.run_type == "test"``; for any other run_type the function
        falls off the end and returns None.
    """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    # Input placeholders.
    enc_input = layers.data(name="enc_input", shape=[1], dtype='int64', lod_level=1) #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input", shape=[1], dtype='int64', lod_level=1) #goal_input --> x
    cue_input = layers.data(name="cue_input", shape=[1], dtype='int64', lod_level=1) #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask', shape=[-1, 1], dtype='float32')
    tar_input = layers.data(name='tar_input', shape=[1], dtype='int64', lod_level=1) #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    # A bidirectional encoder gets half the hidden size per direction.
    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")

    # Fuse the last hidden states of both encoders and project them back.
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask', shape=[-1], dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    # Prior attention: the bridge output queries the encoded knowledge.
    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden, axes=[0], starts=[0], ends=[1])
    cue_memory = layers.reshape(cue_memory, shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attention (posterior: the target/goal fusion queries the knowledge)
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out, axes=[0], starts=[0], ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query, shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query, cue_memory, mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask", shape=[-1, 1], dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out, pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size, hidden_size,
                        num_layers=num_layers, dropout=0.0, name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size, hidden_size,
                            num_layers=num_layers, dropout=0.0, name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        # NOTE(review): bow_loss is assigned only when config.use_bow is set,
        # but it is used unconditionally in final_loss and the return value
        # below - confirm training always enables use_bow.
        if config.use_bow:
            bow_logits = fc(knowledge, hidden_size, hidden_size, name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits, hidden_size, tgt_vocab_size, name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label', shape=[-1, config.max_len], dtype='int64')
            bow_mask = layers.data(name="bow_mask", shape=[-1, config.max_len], dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits, bow_label, soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            # Masked sum over positions, then mean over the batch.
            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input", shape=[-1, 1, 1], dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        # NOTE(review): weight_target, target_query and target_att (used
        # below) are assigned only inside the config.use_posterior branch -
        # confirm training always enables use_posterior.
        dec_knowledge = weight_target

        # NOTE(review): knowledge_goal_out is not referenced after it is
        # built here.
        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                        enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                        init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label', shape=[-1, 1], dtype='int64')
        target_mask = layers.data(name='target_mask', shape=[-1, 1], dtype='float32')

        decoder_logits = layers.reshape(decoder_logits, shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        # Masked sum over positions, then mean over the batch.
        nll_loss = layers.cross_entropy(decoder_logits, target_label, soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL term between the posterior attention (gradient stopped) and the
        # prior attention; 1e-10 keeps the logarithms finite.
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) - prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        # Weight fed at run time; applied to both the KL and the NLL term.
        kl_and_nll_factor = layers.data(name='kl_and_nll_factor', shape=[1], dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id, dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        # Only the first entry of every group of beam_size starts at score 0;
        # all other entries start at -INF.
        init_score_np = np.ones([beam_size * batch_size], dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        # Per-entry offset (sample index * beam_size) added to the beam index.
        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
        # Expand the encoder memory, its mask, the knowledge and the initial
        # hidden state by beam_size.
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask, shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge, shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge, shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden, shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        # The decoding loop is unrolled in Python: one set of ops per step.
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit, dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out, dropout_prob=config.dropout, is_test=True)
            rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            # Accumulate scores; with length_average the result is the running
            # mean over the i steps instead of the running sum.
            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i), (pre_score * (1.0 - 1.0 / i)), axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output, pre_score, axis=0)

            log_softmax_output = layers.reshape(log_softmax_output, shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output, k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            # Split the flat top-k index into token id (remainder) and source
            # beam (quotient).
            vocab_var = layers.fill_constant([1], dtype='int64', value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            # Entries that just produced EOS or UNK get a large penalty added
            # to the score carried into the next step.
            eos_ids = layers.fill_constant([beam_size * batch_size], dtype='int64', value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size], dtype='int64', value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids), dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids), dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update the state carried into the next step
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            # Reorder every per-beam state according to the selected beams.
            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        # Stack the per-step results into [max_decode_len, beam_size * batch_size].
        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
def _get_bboxes_single(self, cls_scores, bbox_preds, mlvl_points, img_shape, scale_factor, rescale=False, with_nms=True):
    """Decode the predictions of one image on all FPN levels and run NMS.

    Args:
        cls_scores: per-level class scores, each [80, h, w] per the original
            comments (the class axis is reshaped to ``self.num_classes``).
        bbox_preds: per-level box predictions, each [4, h, w].
        mlvl_points: per-level grid points, each [h*w, 3] holding
            (cell top-left x, cell top-left y, cell side length).
        img_shape: image shape; index 0 bounds y and index 1 bounds x when
            the boxes are clipped.
        scale_factor: divisor applied to the boxes when ``rescale`` is true.
        rescale: whether to divide the boxes by ``scale_factor``.
        with_nms: not referenced in this method.

    Returns:
        The output of ``matrix_nms`` / ``multiclass_nms`` depending on
        ``nms_type`` in ``self.nms_cfg``, or ``None`` for any other type.
    """
    nms_cfg = self.nms_cfg
    assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
    mlvl_bboxes = []
    mlvl_scores = []
    # Walk over every FPN output level.
    for i_lvl, (cls_score, bbox_pred, points) in enumerate(
            zip(cls_scores, bbox_preds, mlvl_points)):
        cls_score = L.transpose(cls_score, [1, 2, 0])  # [h, w, 80]
        cls_score = L.reshape(cls_score, (-1, self.num_classes))  # [h*w, 80]
        if self.use_sigmoid_cls:
            scores = L.sigmoid(cls_score)  # [h*w, 80]
        else:
            scores = L.softmax(cls_score)
        bbox_pred = L.transpose(bbox_pred, [1, 2, 0])  # [h, w, 4]
        bbox_pred = L.reshape(bbox_pred, (-1, 4))  # [h*w, 4]

        # Keep only the nms_top_k best-scoring cells of this level.
        nms_top_k = nms_cfg.get('nms_top_k', -1)
        if nms_top_k > 0 and scores.shape[0] > nms_top_k:
            if self.use_sigmoid_cls:
                max_scores = L.reduce_max(scores, dim=1)
            else:
                # remind that we set FG labels to [0, num_class-1]
                # since mmdet v2.0
                # BG cat_id: num_class
                # The background column (last one) is therefore excluded when
                # ranking cells. This branch used to leave max_scores
                # unassigned, so the topk call below raised a NameError.
                max_scores = L.reduce_max(scores[:, :-1], dim=1)
            _, topk_inds = L.topk(max_scores, k=nms_top_k)
            scores = L.gather(scores, topk_inds)  # [M, 80]
            points = L.gather(points, topk_inds)  # [M, 3] cell xy and side length
            bbox_pred = L.gather(bbox_pred, topk_inds)  # [M, 4]

        # [M, 4] cell top-left xy repeated twice.
        bbox_pos_center = L.concat([points[:, :2], points[:, :2]], axis=1)

        # [M, 4] final boxes (x1y1x2y2) = bbox_pred * level stride + cell top-left.
        bboxes = bbox_pred * self.fpn_stride[i_lvl] + bbox_pos_center
        x1 = L.clip(bboxes[:, 0], 0.0, img_shape[1])
        y1 = L.clip(bboxes[:, 1], 0.0, img_shape[0])
        x2 = L.clip(bboxes[:, 2], 0.0, img_shape[1])
        y2 = L.clip(bboxes[:, 3], 0.0, img_shape[0])
        bboxes = paddle.stack([x1, y1, x2, y2], axis=-1)  # [M, 4]
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)

    mlvl_scores = L.concat(mlvl_scores, axis=0)  # [M2, 80] scores of all levels
    mlvl_bboxes = L.concat(mlvl_bboxes, axis=0)  # [M2, 4] boxes (x1y1x2y2) of all levels
    if rescale:
        scale_factor_ = paddle.to_tensor(scale_factor)
        mlvl_bboxes /= scale_factor_  # [M2, 4] boxes (x1y1x2y2)

    pred_scores = L.unsqueeze(mlvl_scores, axes=0)  # [1, M2, 80]
    pred_boxes = L.unsqueeze(mlvl_bboxes, axes=0)  # [1, M2, 4], final coordinates
    pred_scores = L.transpose(pred_scores, perm=[0, 2, 1])  # [1, 80, M2], final scores

    # nms: work on a copy so that popping nms_type leaves self.nms_cfg intact.
    pred = None
    i = 0
    nms_cfg = copy.deepcopy(self.nms_cfg)
    nms_type = nms_cfg.pop('nms_type')
    if nms_type == 'matrix_nms':
        pred = fluid.layers.matrix_nms(pred_boxes[i:i+1, :, :], pred_scores[i:i+1, :, :], background_label=-1, **nms_cfg)
    elif nms_type == 'multiclass_nms':
        pred = fluid.layers.multiclass_nms(pred_boxes[i:i+1, :, :], pred_scores[i:i+1, :, :], background_label=-1, **nms_cfg)
    return pred
def beam_search():
    """Build the beam-search decoding loop as a ``layers.While`` block.

    All inputs (start_tokens, init_scores, enc_output, trg_src_attn_bias, the
    model hyper-parameters, the attention shape variables, ...) are taken
    from the enclosing scope.

    Returns:
        The ``(finished_ids, finished_scores)`` pair produced by
        ``layers.beam_search_decode``.
    """
    max_len = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=max_out_len)
    step_idx = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=0)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(start_tokens, step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwritten at each step.
    # caches contains states of history steps to reduce redundant
    # computation in decoder.
    caches = [{
        "k": layers.fill_constant_batch_size_like(
            input=start_tokens,
            shape=[-1, 0, d_model],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant_batch_size_like(
            input=start_tokens,
            shape=[-1, 0, d_model],
            dtype=enc_output.dtype,
            value=0)
    } for i in range(n_layer)]
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # sequence_expand can gather sequences according to lod thus can be
        # used in beam search to sift states corresponding to selected ids.
        pre_src_attn_bias = layers.sequence_expand(
            x=trg_src_attn_bias, y=pre_scores)
        pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
        pre_caches = [{
            "k": layers.sequence_expand(
                x=cache["k"], y=pre_scores),
            "v": layers.sequence_expand(
                x=cache["v"], y=pre_scores),
        } for cache in caches]
        # Position input: a [-1, 1] tensor of ones multiplied by step_idx + 1.
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_enc_output,  # can't use pre_ids here since it has lod
                value=1,
                shape=[-1, 1],
                dtype=pre_ids.dtype),
            y=layers.increment(
                x=step_idx, value=1.0, in_place=False),
            axis=0)
        logits = wrap_decoder(
            trg_vocab_size,
            max_in_len,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate,
            weight_sharing,
            dec_inputs=(
                pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
                slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
                src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
            enc_output=pre_enc_output,
            caches=pre_caches)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(logits), k=beam_size)
        # Accumulated score = log-probability of the candidate + previous score.
        accu_scores = layers.elementwise_add(
            x=layers.log(topk_scores),
            y=layers.reshape(
                pre_scores, shape=[-1]),
            axis=0)
        # beam_search op uses lod to distinguish branches.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # update states
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        layers.assign(pre_enc_output, enc_output)
        for i in range(n_layer):
            layers.assign(pre_caches[i]["k"], caches[i]["k"])
            layers.assign(pre_caches[i]["v"], caches[i]["v"])
        # Add the per-step deltas to the self-attention softmax shape variables.
        layers.assign(
            layers.elementwise_add(
                x=slf_attn_pre_softmax_shape,
                y=attn_pre_softmax_shape_delta),
            slf_attn_pre_softmax_shape)
        layers.assign(
            layers.elementwise_add(
                x=slf_attn_post_softmax_shape,
                y=attn_post_softmax_shape_delta),
            slf_attn_post_softmax_shape)
        # Continue while the step limit is not reached and selected_ids is
        # not empty; the result is written into the loop condition.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=eos_idx)
    return finished_ids, finished_scores
# Build the network: embed the story, match it against the question encoding,
# run an LSTM over the result and predict the answer word.

# Input placeholders.
input_sequence = layers.data(name = "story", dtype = "int64", shape = [-1, story_maxlen, 1])
# NOTE(review): `question` is not referenced by any statement in this span.
question = layers.data(name = "query", dtype = "int64", shape = [-1, query_maxlen, 1])
true_answer = layers.data(name = "true_answer", dtype = "int64", shape = [-1, 1])

# Embeddings, each followed by dropout with probability 0.3.
input_encoder_m = layers.embedding(input = input_sequence, size = [vocab_size, 64])
input_encoder_m = layers.dropout(input_encoder_m, 0.3)
# NOTE(review): input_encoder_c is not referenced by any later statement in
# this span.
input_encoder_c = layers.embedding(input = input_sequence, size = [vocab_size, query_maxlen])
input_encoder_c = layers.dropout(input_encoder_c, 0.3)
# NOTE(review): the question encoder embeds input_sequence (the story), not
# `question` - confirm this is intended.
question_encoder = layers.embedding(input = input_sequence, size = [vocab_size, 64])
question_encoder = layers.dropout(question_encoder, 0.3)

# Element-wise match between the two encodings, normalised over the last axis.
match = layers.elementwise_mul(input_encoder_m, question_encoder)
response = layers.softmax(match, axis = -1)

# Concatenate, run the LSTM, then classify over the vocabulary.
answer = layers.concat([response, question_encoder], axis = -1)
_, _, answer = basic_lstm(answer, None, None, 32)  # only the third return value is kept
answer = layers.transpose(answer, perm = (1, 0, 2))
answer = layers.reshape(answer, shape = [-1, 32])
answer = layers.dropout(answer, 0.3)
answer = layers.fc(answer, size = vocab_size, act = "softmax")

# Mean cross entropy, minimised with Adam.
loss = layers.cross_entropy(answer, true_answer)
loss = layers.reduce_mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate = 0.01)
optimizer.minimize(loss)
def forward(self, ref_image, ref_label, label, k):
    """Encode the reference image to get features for weight generation.

    Note: when ``self.concat_ref_label`` is set the label branch is never
    initialised, so ``self.mul_ref_label`` must not be set at the same time
    (the code would hit an undefined ``x_label``).

    Args:
        ref_image ((NxK)x3xHxW): Reference images.
        ref_label ((NxK)xCxHxW): Reference labels.
        label (NxCxHxW): Target label.
        k (int): Number of reference images.

    Returns:
        (tuple):
          - x (NxC2xH2xW2): Encoded features from reference images for the
            main branch (as input to the decoder).
          - encoded_ref (list of Variable): Encoded features from reference
            images for the weight generation branch, ordered from the last
            up-sampling output to the encoder bottleneck.
          - attention (Nx(KxH1xW1)x(H1xW1)): Attention maps, or None when
            the attention module was not run.
          - atn_vis (1x1xH1xW1): Visualization for attention scores, or None.
          - ref_idx (Nx1): Index for which image to use from the reference
            images, or None.
    """
    if self.concat_ref_label:
        # Concatenate reference label map and image together for encoding.
        concat_ref = L.concat([ref_image, ref_label], axis=1)
        x = self.ref_img_first(concat_ref)
    elif self.mul_ref_label:
        x = self.ref_img_first(ref_image)
        x_label = self.ref_label_first(ref_label)
    else:
        x = self.ref_img_first(ref_image)

    atn = atn_vis = ref_idx = None
    for i in range(self.num_downsamples):
        x = getattr(self, 'ref_img_down_' + str(i))(x)
        if self.mul_ref_label:
            x_label = getattr(self, 'ref_label_down_' + str(i))(x_label)

        # The attention module only runs with several references, and only
        # at the configured down-sampling level.
        if k > 1 and i == self.num_downsample_atn - 1:
            x, atn, atn_vis = self.attention_module(x, label, ref_label)
            if self.mul_ref_label:
                # Re-use the attention map computed for the image branch.
                x_label, _, _ = self.attention_module(
                    x_label, None, None, atn)

            atn_sum = L.reshape(atn, (label.shape[0], k, -1))  # [b, k, h*w*h*w]
            atn_sum = L.reduce_sum(atn_sum, dim=2)
            ref_idx = L.argmax(atn_sum, axis=1)

    # Get all corresponding layers in the encoder output for generating
    # weights in corresponding layers.
    encoded_image_ref = [x]
    if self.mul_ref_label:
        encoded_ref_label = [x_label]

    for i in reversed(range(self.num_downsamples)):
        conv = getattr(self, 'ref_img_up_' + str(i))(encoded_image_ref[-1])
        encoded_image_ref.append(conv)
        if self.mul_ref_label:
            conv_label = getattr(self, 'ref_label_up_' + str(i))(
                encoded_ref_label[-1])
            encoded_ref_label.append(conv_label)

    if self.mul_ref_label:
        encoded_ref = []
        for conv, conv_label in zip(encoded_image_ref, encoded_ref_label):
            b, c, h, w = conv.shape
            conv_label = L.softmax(conv_label, axis=1)
            # The two operands are reshaped to (b, 1, c, h*w) and
            # (b, c, 1, h*w); the product relies on broadcasting rather
            # than an explicit expand.
            conv_label = L.reshape(conv_label, (b, 1, c, h * w))
            conv = L.reshape(conv, (b, c, 1, h * w))
            conv_prod = conv * conv_label  # (b, c, c, h * w)
            conv_prod = L.reduce_sum(conv_prod, dim=3,
                                     keep_dim=True)  # (b, c, c, 1)
            encoded_ref.append(conv_prod)
    else:
        encoded_ref = encoded_image_ref

    # Reverse so the last up-sampling output comes first.
    encoded_ref = encoded_ref[::-1]

    return x, encoded_ref, atn, atn_vis, ref_idx
def scaled_dot_product_attention_with_sen_norm(q, k, v, attn_bias, d_key, dropout_rate, attn_s):
    """
    Scaled dot-product attention over blocks, re-weighted by sentence-level
    attention, followed by the output projection.

    This function reads several names it does not define (``key_s_len``,
    ``batch_size``, ``n_head``, ``query_len``, ``d_value``, ``d_model``,
    ``dropout_seed``, ``name``, ``param_initializer``,
    ``__combine_heads_word``); they presumably come from an enclosing
    scope that is not visible here.

    Shapes below are taken from the original annotations and are not
    checked by the code.

    :param q: (batch_size, n_head, tgt_len, dim_per_head)
    :param k: (batch_size, n_blocks, n_head, n_tokens, dim_per_head)
    :param v: (batch_size, n_blocks, n_head, n_tokens, dim_per_head)
    :param attn_bias: (batch_size, n_blocks, n_head, tgt_len, n_tokens);
        added to the raw scores when truthy
    :param d_key: per-head key size; queries are scaled by d_key ** -0.5
    :param dropout_rate: dropout probability applied to the combined
        attention weights; skipped when falsy
    :param attn_s: [batch, n_heads, query_len, key_s_len] sentence-level
        weights multiplied into the token-level weights
    :return: tuple ``(proj_out, attn_w)`` - the projected attention output
        and the combined attention weights
    """
    # Insert a block axis at position 1 and repeat the queries `key_s_len`
    # times along it.
    # (batch_size, n_block, n_head, tgt_len, dim_per_head)
    q = layers.expand(layers.unsqueeze(q, axes=[1]), expand_times=[1, key_s_len, 1, 1, 1])

    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    # (batch_size, n_block, n_head, tgt_len, n_token)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias:
        product += attn_bias  # (batch_size, n_block, n_head, tgt_len, n_token)
    weights = layers.softmax(
        product)  # (batch_size, n_block, n_head, tgt_len, n_token)

    # Move the block axis next to the token axis.
    # (batch_size, n_head, tgt_len, n_block, n_token)
    attn_w = layers.transpose(weights, perm=[0, 2, 3, 1, 4])
    # Scale each block's token weights by `attn_s` (given a trailing
    # singleton axis so it broadcasts over tokens).
    # (batch_size, n_head, tgt_len, n_block, n_token)
    attn_w = layers.elementwise_mul(attn_w, layers.unsqueeze(attn_s, axes=[-1]), axis=0)
    # Merge the block and token axes.
    # (batch_size, n_head, tgt_len, n_block*n_token)
    attn_w = layers.reshape(attn_w, shape=[batch_size, n_head, query_len, -1])

    if dropout_rate:
        # NOTE(review): `is_test` is hard-coded to False, so dropout stays
        # active whenever `dropout_rate` is truthy — confirm this is wanted
        # at inference time.
        attn_w = layers.dropout(  # (batch_size, n_head, tgt_len, n_block*n_token)
            attn_w,
            dropout_prob=dropout_rate,
            seed=dropout_seed,
            dropout_implementation="upscale_in_train",
            is_test=False)

    # Give the values the same (head, block*token) layout as the weights.
    values_w = layers.transpose(v, perm=[0, 2, 1, 3, 4])
    # (batch_size, n_head, n_block*n_token, dim_per_head)
    values_w = layers.reshape(values_w, shape=[batch_size, n_head, -1, d_value])

    out = layers.matmul(
        attn_w, values_w)  # (batch_size, n_head, tgt_len, dim_per_head)

    # Project back to the model size.
    combine_out = __combine_heads_word(
        out)  # (batch_size, query_len, emb_dim)

    proj_out = layers.fc(
        input=combine_out,  # (batch_size, tgt_len, model_dim)
        size=d_model,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(name=name + '_word_fc.w_0',
                                   initializer=param_initializer),
        bias_attr=name + '_word_fc.b_0')
    return proj_out, attn_w
def _build_decoder(self, z_mean=None, z_log_var=None, enc_output=None, mode='train', beam_size=10):
    """Build the decoder sub-graph for training or for step-wise decoding.

    Args:
        z_mean: Passed to ``self._sampling`` when ``mode == 'train'``.
        z_log_var: Passed to ``self._sampling`` when ``mode == 'train'``.
        enc_output: Not read by this method.
        mode: ``'train'``, ``'greedy'`` or ``'sampling'``.
        beam_size: Not read by this method; decoding always uses a beam
            of 1.

    Returns:
        For ``'train'`` the projected decoder outputs; for ``'greedy'`` and
        ``'sampling'`` the first value returned by ``dynamic_decode``; for
        any other mode a message is printed and None is returned.
    """
    dec_input = layers.dropout(
        self.tar_emb,
        dropout_prob=self.dec_dropout_in,
        dropout_implementation="upscale_in_train")

    def output_layer(x):
        # Vocabulary projection; also used inside BeamSearchDecoder.
        return layers.fc(x,
                         size=self.tar_vocab_size,
                         num_flatten_dims=len(x.shape) - 1,
                         name="output_w")

    def sample_output_layer(x):
        # Samples an id from the softmax distribution instead of taking
        # argmax(logits), then returns it as a one-hot tensor with the
        # step axis restored.
        step_logits = layers.squeeze(output_layer(x), [1])
        sampled_id = layers.sampling_id(layers.softmax(step_logits),
                                        dtype='int')
        one_hot_id = fluid.one_hot(layers.unsqueeze(sampled_id, [1]),
                                   depth=self.tar_vocab_size)
        return layers.unsqueeze(one_hot_id, [1])

    if mode == 'train':
        latent_z = self._sampling(z_mean, z_log_var)
    else:
        latent_z = layers.gaussian_random_batch_size_like(
            self.tar, shape=[-1, self.latent_size])

    # One projection yields the initial hidden and cell states of every
    # layer; split it first into (hidden, cell), then per layer.
    dec_first_hidden_cell = layers.fc(
        latent_z, 2 * self.hidden_size * self.num_layers, name='fc_hc')
    dec_first_hidden, dec_first_cell = layers.split(dec_first_hidden_cell, 2)
    if self.num_layers > 1:
        dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
        dec_first_cell = layers.split(dec_first_cell, self.num_layers)
    else:
        dec_first_hidden = [dec_first_hidden]
        dec_first_cell = [dec_first_cell]
    dec_initial_states = [
        [hidden, cell]
        for hidden, cell in zip(dec_first_hidden, dec_first_cell)
    ]

    dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                           self.param_attr_initializer,
                           self.param_attr_scale, self.dec_dropout_out)

    if mode == 'train':
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=dec_input,
                            initial_states=dec_initial_states,
                            sequence_length=self.tar_sequence_length)
        return output_layer(dec_output)

    if mode not in ('greedy', 'sampling'):
        print("mode not supprt", mode)
        return None

    # 'greedy' and 'sampling' differ only in the per-step output function.
    start_token = 1
    end_token = 2
    max_length = 100
    step_output_fn = output_layer if mode == 'greedy' else sample_output_layer
    beam_search_decoder = BeamSearchDecoder(dec_cell,
                                            start_token,
                                            end_token,
                                            beam_size=1,
                                            embedding_fn=self.tar_embeder,
                                            output_fn=step_output_fn)
    outputs, _ = dynamic_decode(beam_search_decoder,
                                inits=dec_initial_states,
                                max_step_num=max_length)
    return outputs
def infilling_decode(self):
    """Build the beam-search decoding graph.

    Declares the feed variables, wraps them in a non-iterable data loader,
    creates an ``ErnieModel`` in decoding mode and unrolls a ``While`` loop
    that, at every step, feeds the previously selected token together with
    a placeholder token (``self.attn_id``) and picks the next beam
    candidates.

    Returns:
        tuple: ``(pyreader, graph_vars)`` where ``graph_vars`` maps
        ``"finished_ids"``, ``"finished_scores"`` and ``"data_ids"`` to
        variables that have been marked persistable.
    """
    # Dialog inputs carry one more embedding id tensor than other tasks.
    if self.task_type == "dialog":
        emb_num = 4
    else:
        emb_num = 3
    # Embedding id inputs followed by the square input mask.
    input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                   [[-1, self.max_seq_len, self.max_seq_len]]
    input_dtypes = ['int64'] * emb_num + ['float32']
    input_lod_levels = [0] * emb_num + [0]

    # Six decoding inputs are appended; they are unpacked below as
    # tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids.
    shapes = input_shapes + [[-1, self.max_seq_len, 1],
                             [-1, self.max_seq_len, 1], [-1, 1], [-1],
                             [-1, 1, self.max_seq_len], [-1, 1]]
    dtypes = input_dtypes + [
        'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
    ]
    lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

    inputs = self.to_ternsor(shapes, dtypes, lod_levels)
    pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                  capacity=50,
                                                  iterable=False)

    emb_ids = {}
    for key, value in zip(self.emb_keys, inputs[:emb_num]):
        emb_ids[key] = value

    input_mask = inputs[emb_num]
    tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
        -6:]

    ernie = ErnieModel(emb_ids=emb_ids,
                       input_mask=input_mask,
                       config=self.ernie_config,
                       use_fp16=self.use_fp16,
                       task_type=self.task_type,
                       decoding=True,
                       gather_idx=parent_idx)

    # Loop counters: `step_idx` starts at 0 and `pos_idx` at 1; both are
    # incremented together at the end of each step.
    max_len = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=self.max_dec_len,
                                   force_cpu=True)
    step_idx = layers.fill_constant(shape=[1],
                                    dtype=tgt_ids.dtype,
                                    value=0,
                                    force_cpu=True)
    pos_idx = layers.fill_constant(shape=[1],
                                   dtype=tgt_ids.dtype,
                                   value=1,
                                   force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    # Tensor arrays indexed by step; slot 0 holds the initial values.
    ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
    pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)),
                                    step_idx)
    scores = layers.array_write(init_scores, step_idx)
    tgt_masks = layers.array_write(tgt_input_mask, step_idx)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        # Re-order per-beam state according to the parents chosen in the
        # previous step.
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)
        tmp_mask = layers.array_read(tgt_masks, i=step_idx)

        def gen_batch_like(value,
                           dtype="int64",
                           shape=[-1, 1, 1],
                           is_scalar=True):
            # Build a tensor whose batch size follows `parent_idx`: either
            # filled with the Python scalar `value`, or (is_scalar=False)
            # a tensor of ones multiplied by the variable `value`.
            if is_scalar:
                return layers.fill_constant_batch_size_like(
                    input=parent_idx, value=value, shape=shape, dtype=dtype)
            else:
                return layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=parent_idx, value=1, shape=shape, dtype=dtype),
                    y=value,
                    axis=0)

        tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
        append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
        append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
        # `tmp_mask` grows by one column of ones (stored back into
        # `tgt_masks` at the end of the step). The two query rows then get
        # one further column: 0 for the previous-token row, 1 for the
        # placeholder row.
        tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
        pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
        cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

        cur_ids = gen_batch_like(self.attn_id)
        pre_pos = gen_batch_like(step_idx, is_scalar=False)
        cur_pos = gen_batch_like(pos_idx, is_scalar=False)
        if self.continuous_position:
            pre_pos = pre_pos + pos_bias
            cur_pos = cur_pos + pos_bias

        # Each step feeds two positions: the previous token and the
        # placeholder token.
        dec_emb_ids = {
            "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
            "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
        }
        if self.task_type == "dialog":
            role_ids = gen_batch_like(0)
            turn_ids = gen_batch_like(0)
            dec_emb_ids["role_embedding"] = layers.concat(
                [role_ids, role_ids], axis=1)
            dec_emb_ids["turn_embedding"] = layers.concat(
                [turn_ids, turn_ids], axis=1)
        else:
            sent_ids = gen_batch_like(self.tgt_type_id)
            dec_emb_ids["sent_embedding"] = layers.concat(
                [sent_ids, sent_ids], axis=1)
        dec_mask = layers.concat([pre_mask, cur_mask], axis=1)

        dec_out = ernie.encode(dec_emb_ids,
                               dec_mask,
                               parent_idx,
                               remove_query=True)
        # Only positions from index 1 onward are scored.
        fc_out = self.cal_logit(dec_out[:, 1:, :], None)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(fc_out), k=self.beam_size)
        # Length penalty ((5 + len) / 6) ** length_penalty for the previous
        # and current lengths.
        pre_lenpen = layers.pow(
            (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
            self.length_penalty)
        cur_lenpen = layers.pow(
            (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
            self.length_penalty)
        # Undo the previous penalty, add the new log-probability, then
        # apply the current penalty.
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores * pre_lenpen,
                                             axis=0) / cur_lenpen
        # Copy the LoD of `pre_ids` onto the candidates for beam_search.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=pos_idx, value=1.0, in_place=True)
        # Store this step's results at the (already incremented) index.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)

        layers.assign(gather_idx, parent_idx)
        # Continue while under the length limit and some beam is alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

    graph_vars = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "data_ids": data_ids
    }

    for k, v in graph_vars.items():
        v.persistable = True

    return pyreader, graph_vars
# Prediction script fragment: optionally restore a checkpoint, run the model
# over the batched test features and pickle the per-example probabilities.
# NOTE(review): block nesting below was reconstructed from collapsed source;
# confirm against the original file.

# Restore weights only when a checkpoint path was supplied.
if args.init_checkpoint is not None:
    print('loading checkpoint from %s' % args.init_checkpoint)
    sd, _ = FD.load_dygraph(args.init_checkpoint)
    model.set_dict(sd)

test_batch_data = batchify(test_features, args.bsz, args.max_seqlen)
if args.debug:
    # Dump the first batch and the tokens of its first five rows.
    print(len(test_batch_data))
    print(test_batch_data[0])
    token_ids, seg_ids, labels = test_batch_data[0]
    for r1, r2 in zip(token_ids[:5], seg_ids[:5]):
        print(r1)
        print(r2)
        print(convert_ids_to_tokens(tokenizer.vocab, r1))

y_pred = []
# NOTE(review): `_switch_tracer_mode_guard_` is a private API; it is entered
# here with is_train=False for the duration of the prediction loop.
with FD.base._switch_tracer_mode_guard_(is_train=False):
    model.eval()
    for step, d in enumerate(tqdm(test_batch_data, desc='predicting')):
        # The third element of each batch is ignored at prediction time.
        ids, sids, _ = d
        ids, sids = FD.to_variable(ids), FD.to_variable(sids)
        _, logits = model(ids, sids)
        #print('\n'.join(map(str, logits.numpy().tolist())))
        # Store softmax probabilities over the last axis as plain lists.
        y_pred += L.softmax(logits, -1).numpy().tolist()
        # In debug mode stop once more than five predictions are collected.
        if args.debug and len(y_pred) > 5:
            break

print(len(y_pred), y_pred[:5])
print(test_segs[:5])

with open(args.save_path, 'wb') as f:
    pickle.dump({'segs': test_segs, 'y_pred': y_pred}, f)
def forward(self, outputs, targets):
    """Match predictions to ground-truth boxes.

    Args:
        outputs: A dict that contains at least these entries:
            "pred_logits": Tensor of dim [batch_size, num_queries,
                num_classes] with the classification logits.
            "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with
                the predicted box coordinates.
        targets: A list of targets (len(targets) == batch_size), where each
            target is a dict containing:
            "labels": Tensor of dim [num_target_boxes] (where
                num_target_boxes is the number of ground-truth objects in
                the target) containing the class labels.
            "boxes": Tensor of dim [num_target_boxes, 4] containing the
                target box coordinates.

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j)
        where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets
              (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    with dg.no_grad():
        pred_logits = outputs["pred_logits"]
        bs, num_queries, num_classes = pred_logits.shape

        # Flatten the batch so the cost matrices are computed in one go.
        flat_logits = L.reshape(
            pred_logits,
            [-1, num_classes])  # [batch_size * num_queries, num_classes]
        class_prob = L.softmax(
            flat_logits,
            axis=-1)  # [batch_size * num_queries, num_classes]
        pred_boxes = L.reshape(outputs["pred_boxes"],
                               [-1, 4])  # [batch_size * num_queries, 4]

        # Concatenate the target labels and boxes of every image.
        target_labels = L.concat([t["labels"] for t in targets])
        target_labels = target_labels.astype("int64")
        target_boxes = L.concat([t["boxes"] for t in targets])
        target_boxes = target_boxes.astype("float32")

        # Classification cost. Contrary to the loss this is not the NLL but
        # the approximation 1 - proba[target class]; the constant 1 does
        # not change the matching and is omitted.
        picked_prob = class_prob.numpy()[:, target_labels.numpy()]
        cost_class = dg.to_variable(
            -picked_prob)  # [batch_size * num_queries, num_all_target_boxes]

        # L1 cost between every predicted box and every target box.
        num_all_target_boxes = target_boxes.shape[0]
        pred_grid = L.expand(
            L.unsqueeze(pred_boxes, [1]),
            [1, num_all_target_boxes, 1])
        target_grid = L.expand(
            L.unsqueeze(target_boxes, [0]),
            [bs * num_queries, 1, 1])
        # Both grids: [batch_size * num_queries, num_all_target_boxes, 4]
        cost_bbox = F.loss.l1_loss(pred_grid, target_grid, reduction='none')
        cost_bbox = L.reduce_mean(
            cost_bbox,
            -1)  # [batch_size * num_queries, num_all_target_boxes]

        # GIoU cost between boxes.
        cost_giou = -generalied_box_iou(box_cxcywh_to_xyxy(pred_boxes),
                                        box_cxcywh_to_xyxy(target_boxes))

        # Final cost matrix.
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = L.reshape(
            C, [bs, num_queries,
                -1])  # [batch_size, num_queries, num_all_target_boxes]

        # Split the target axis per image and solve each assignment on the
        # slice that belongs to that image.
        sizes = [len(t["boxes"]) for t in targets]
        matches = []
        for batch_idx, cost_slice in enumerate(L.split(C, sizes, dim=-1)):
            rows, cols = linear_sum_assignment(cost_slice[batch_idx].numpy())
            matches.append((dg.to_variable(rows.astype("int64")),
                            dg.to_variable(cols.astype("int64"))))
        return matches
def KL(pred, target):
    """Return ``L.kldiv_loss`` between two sets of logits.

    ``pred`` is turned into log-probabilities (log of its softmax) and
    ``target`` into probabilities (softmax) before the loss is evaluated.
    """
    log_pred_prob = L.log(L.softmax(pred))
    target_prob = L.softmax(target)
    return L.kldiv_loss(log_pred_prob, target_prob)
def inference(self, model, inputs, outputs):
    """
    Run inference.

    Builds a ``While`` loop that generates one token per step using either
    beam search or a sampling strategy, selected by
    ``self.decoding_strategy``.

    Args:
        model(object): A generate model. Need to implement
            `_generation_network` and `_calc_logits`.
        inputs(dict): Its key is input name(str) and its value is a
            Variable. The keys read here are "tgt_ids", "tgt_pos",
            "init_score", "tgt_generation_mask", "parent_idx",
            "token_ids" and "data_id".
        outputs: Not read by this method.

    Returns:
        dict(str:Variable): Its key is output name(str) and its value is a
        Variable. Keys: "finished_ids", "finished_scores", "token_ids",
        "data_id".

    Raises:
        ValueError: If the decoding strategy is neither "beam_search" nor
            a name starting with "sampling" or "topk_sampling".
    """
    # prepare while loop
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
    min_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)

    # Tensor arrays indexed by step; slot 0 holds the initial values.
    ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
    pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
    scores = layers.array_write(inputs["init_score"], step_idx)
    tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
    parent_idx = inputs["parent_idx"]

    # Sampling strategies go through the beam-search ops with a beam of 1.
    if self.decoding_strategy == "beam_search":
        beam_size = self.beam_size
    else:
        beam_size = 1

    # Additive logit penalties: -1e9 at the EOS id (applied while below the
    # minimum length) and at the UNK / MASK ids (applied if
    # `self.ignore_unk`).
    eos_penalty = np.zeros(self.vocab_size, dtype="float32")
    eos_penalty[self.eos_id] = -1e9
    eos_penalty = layers.assign(eos_penalty)

    token_penalty = np.zeros(self.vocab_size, dtype="float32")
    token_penalty[self.unk_id] = -1e9
    if self.mask_id >= 0:
        token_penalty[self.mask_id] = -1e9
    token_penalty = layers.assign(token_penalty)

    # start while loop
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        # Re-order per-beam state according to the parents chosen in the
        # previous step.
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)

        tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
        dtype = tmp_tgt_generation_mask.dtype

        # Extend the mask by one column of ones, then re-order it by parent.
        append_mask = layers.fill_constant_batch_size_like(
            input=pre_ids,
            value=1.0,
            shape=[-1, 1, 1],
            dtype=dtype)
        tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
        pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)

        # Type ids: constant 1 for every generated position.
        pre_sent = layers.fill_constant_batch_size_like(
            input=pre_mask,
            value=1,
            shape=[-1, 1, 1],
            dtype=pre_ids.dtype)

        # Position ids: the step index, plus `pos_bias` when positions are
        # continuous.
        if self.continuous_position:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias
        else:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype), y=step_idx, axis=0)

        if self.use_role:
            pre_role = layers.fill_constant_batch_size_like(
                input=pre_mask,
                value=0,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype)
        else:
            pre_role = None

        dec_out, _ = model._generation_network(
            token_ids=pre_ids,
            type_ids=pre_sent,
            pos_ids=pre_pos,
            role_ids=pre_role,
            generation_mask=tmp_tgt_generation_mask,
            gather_idx=parent_idx)
        logits = model._calc_logits(dec_out)

        # ignore unk and mask token
        if self.ignore_unk:
            logits = layers.elementwise_add(logits, token_penalty, axis=1)

        # min dec length
        min_len_cond = layers.less_than(x=step_idx, y=min_len)
        def min_len_penalty():
            """Plus minimum length penalty."""
            return layers.elementwise_add(logits, eos_penalty, axis=1)
        def no_penalty():
            """No penalty."""
            return logits
        logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)

        # get probs
        probs = layers.softmax(logits / self.temperature)

        if self.decoding_strategy == "beam_search":
            topk_scores, topk_indices = layers.topk(
                input=probs, k=beam_size)
        else:
            if self.decoding_strategy.startswith("sampling"):
                sampling_ids = layers.sampling_id(probs, dtype="int")
            elif self.decoding_strategy.startswith("topk_sampling"):
                # Zero out everything below the k-th largest probability,
                # renormalise by the top-k mass, sample, then restore the
                # unfiltered probabilities for scoring.
                topk_probs, _ = layers.topk(input=probs, k=self.topk)
                ge_cond = layers.cast(
                    layers.greater_equal(
                        probs,
                        layers.unsqueeze(topk_probs[:, -1], [1])),
                    "float32")
                old_probs = probs
                probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
                sampling_ids = layers.sampling_id(probs, dtype="int")
                probs = old_probs
            else:
                raise ValueError(self.decoding_strategy)

            # Keep the sampled id's probability and push every other entry
            # to -1e3 so that top-1 returns exactly the sampled id.
            sampling_scores = layers.one_hot(
                layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
            )
            sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
            topk_scores, topk_indices = layers.topk(
                input=sampling_scores, k=1)

        # `pre_len` is captured before the in-place increment, `cur_len`
        # after it.
        pre_len = layers.cast(step_idx, "float32")
        layers.increment(x=step_idx, value=1.0, in_place=True)
        cur_len = layers.cast(step_idx, "float32")

        # update scores
        if self.length_average:
            # Running mean of log-probabilities.
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
        elif self.length_penalty > 0:
            # Length penalty ((5 + len) / 6) ** length_penalty.
            pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
            cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
        else:
            # Plain sum of log-probabilities.
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores, axis=0)
        # Copy the LoD of `pre_ids` onto the candidates for beam_search.
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)

        # Store this step's results at the (already incremented) index.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)

        layers.assign(gather_idx, parent_idx)

        # Continue while under the length limit and some beam is alive.
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=self.eos_id)

    predictions = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "token_ids": inputs["token_ids"],
        "data_id": inputs["data_id"]
    }
    return predictions
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 embedding_sharing,
                 dec_inputs=None,
                 enc_output=None,
                 caches=None,
                 is_train=True,
                 params_type="normal"):
    """
    Assemble every layer needed by the decoder and return its predictions.

    When `dec_inputs` is None the decoder inputs (including `enc_output`)
    are created from `decoder_data_input_fields`, and the returned
    predictions are passed through a softmax; otherwise the raw projection
    is returned.

    `params_type` must be "fixed", "normal" or "new"; it selects the name
    prefix of the shared embedding parameter used for the output
    projection when `weight_sharing` is enabled.
    """
    if dec_inputs is not None:
        (trg_word, _reverse_trg_word, trg_pos, trg_slf_attn_bias,
         trg_src_attn_bias) = dec_inputs
    else:
        # This is used to implement independent decoder program in
        # inference.
        (trg_word, _reverse_trg_word, trg_pos, trg_slf_attn_bias,
         trg_src_attn_bias, enc_output) = make_all_inputs(
             decoder_data_input_fields)

    # The same embedding table name is used for the input embedding and,
    # with weight sharing, for the output projection.
    emb_param_name = (word_emb_param_names[0]
                      if embedding_sharing else word_emb_param_names[1])

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        max_length,
        prepostprocess_dropout,
        word_emb_param_name=emb_param_name,
        training=is_train,
        params_type=params_type)
    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        caches=caches)

    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
    dec_output = layers.reshape(
        dec_output, shape=[-1, dec_output.shape[-1]], inplace=True)

    assert params_type == "fixed" or params_type == "normal" or params_type == "new"
    pre_name = ("fixed_forwardfixed_forward" if params_type == "fixed" else
                "new_forwardnew_forward" if params_type == "new" else
                "forwardforward")

    if weight_sharing:
        # Project with the transposed embedding table.
        shared_weight = fluid.default_main_program().global_block().var(
            pre_name + emb_param_name)
        predict = layers.matmul(x=dec_output, y=shared_weight,
                                transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
                            size=trg_vocab_size,
                            bias_attr=False)

    if dec_inputs is None:
        # Return probs for independent decoder program.
        predict = layers.softmax(predict)
    return predict
def net(self, class_dim=5, CAM=False):
    """Create second stage model

    Args:
        class_dim: dim of multi-class vector
        CAM: whether to create the CAM heatmap

    Returns:
        * A list containing 4/5 tensors / ops:
            - loss, cross-entropy loss tensor
            - accuracy, accuracy metric tensor
            - predict, model output tensor activated by softmax
            - hacked_img_id, img_id tensor
            - cam_heatmap, only if CAM == True, class activation map tensor
        * reader, reader op to feed data into placeholder

    Raises:
        ValueError: if `self.main_arch` is not a supported backbone.
    """
    self.input_feature = fluid.data(name='{}_input'.format(self.name),
                                    shape=[-1] + self.data_shape,
                                    dtype='uint8')
    self.label = fluid.data(name='{}_label'.format(self.name),
                            shape=[-1, 1],
                            dtype='int64')
    self.img_id = fluid.data(name='{}_img_id'.format(self.name),
                             shape=[-1, 1],
                             dtype='int64')

    # Lesion Net
    lesion = lesionnet.LesionNet()

    # Backbone
    arch = self.main_arch
    if arch in ResNetModels:
        model = resnet.__dict__[arch]()
    elif arch in DenseNetModels:
        model = densenet.__dict__[arch]()
    elif arch == "inception":
        model = inception.InceptionV4()
    else:
        raise ValueError("Model {} is not supported.".format(
            self.main_arch))

    # uint8 NHWC -> float32 NCHW, scaled by 1/255.
    as_float = FL.cast(self.input_feature, "float32")
    inp = FL.transpose(as_float, perm=[0, 3, 1, 2]) / 255.

    # Element wise mul of lesion prob maps and input image
    lesion_probs = lesion.net(inp, class_dim=4)
    # bs, 4, 16, 16

    lesion_probs = FL.split(lesion_probs, num_or_sections=4, dim=1)
    # probs, bs*1*16*16 4
    image = FL.image_resize(inp, out_shape=(512, 512), resample="BILINEAR")

    weighted_images = []
    for prob_map in lesion_probs:
        weight = FL.image_resize(prob_map,
                                 out_shape=(512, 512),
                                 resample="NEAREST")
        # bs, 1, 512, 512
        weighted = FL.elementwise_mul(
            image, FL.expand(weight + 1., expand_times=[1, 3, 1, 1]))
        # W + 1., bs, 3, 512, 512
        weighted_images.append(weighted)
    image = FL.concat(weighted_images, axis=1)
    # bs, 3*4, 512, 512
    image.stop_gradient = True

    lesion_pos_prob = 1. - lesion_probs[0]

    main_arch_out = model.net(image,
                              class_dim=class_dim,
                              lesion_map=lesion_pos_prob,
                              CAM=CAM)

    # With CAM enabled the backbone also returns heatmaps.
    if CAM:
        logit, heatmaps = main_arch_out
    else:
        logit = main_arch_out

    predict = FL.softmax(logit)
    accuracy = self.create_acc_op(predict, self.label)
    loss = self.create_loss_op(predict, self.label)
    reader = self.create_reader_op(
        [self.img_id, self.input_feature, self.label])

    # This is a hack
    hacked_img_id = FL.cast(self.img_id, "int32")

    results = [loss, accuracy, predict, hacked_img_id]
    if CAM:
        results.append(self.create_cam_op(predict, class_dim, heatmaps))
    return results, reader
def soft_cross_entropy(inp, target):
    """Cross entropy between `inp` logits and soft targets from `target` logits.

    `target` is turned into a distribution with a softmax over the last
    axis, `inp` into log-probabilities with a log-softmax over the last
    axis; the result is the negated mean of their product summed over the
    last axis.
    """
    log_probs = L.log_softmax(inp, axis=-1)
    soft_targets = L.softmax(target, axis=-1)
    summed = L.reduce_sum(log_probs * soft_targets, dim=-1)
    return -1. * L.mean(summed)