def _mask_finished_probs(self, probs, finished):
    """mask finished beams. it makes
        1. all finished beams' probs to be -inf, except end_token which is 0
        2. unfinished beams remain unchanged

    Args:
        probs (Variable): with shape [batch_size, vocab_size]
        finished (Variable): with shape [batch_size]

    Returns: Variable

    Raises: NULL
    """
    # initialize the no-end mask: -INF everywhere except 0 at end_token
    noend_array = [-INF] * self._vocab_size
    noend_array[self._end_token] = 0
    self._noend_mask_tensor = layers.assign(np.array(noend_array, "float32"))

    finished = layers.cast(finished, dtype=probs.dtype)
    # finished --> 0; not finished --> -1
    not_finished = fluider.increment(finished, value=-1)
    # shape = [batch_size, vocab_size]
    finished_expanded = layers.expand(layers.unsqueeze(finished, [1]), [1, self._vocab_size])
    probs = layers.elementwise_mul(finished_expanded, self._noend_mask_tensor, axis=-1) - \
            layers.elementwise_mul(probs, not_finished, axis=0)
    return probs
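# Hedged example (not part of the original code): a minimal NumPy sketch of the
# masking arithmetic above, assuming INF = 1e9, vocab_size = 4 and end_token = 3.
import numpy as np

INF = 1e9
probs = np.log(np.full((2, 4), 0.25, dtype="float32"))   # two beams, uniform log-probs
finished = np.array([1.0, 0.0], dtype="float32")         # beam 0 finished, beam 1 not
noend_mask = np.array([-INF, -INF, -INF, 0.0], dtype="float32")

masked = finished[:, None] * noend_mask - probs * (finished - 1.0)[:, None]
# masked[0] == [-INF, -INF, -INF, 0]   (finished beam: only end_token survives)
# masked[1] == probs[1]                (unfinished beam: unchanged)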
def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None, bias_attr=None):
    """
    Replace the built-in layer_norm op with this function.
    """
    helper = LayerHelper('layer_norm', **locals())
    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
    r_stdev = layers.rsqrt(variance + epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)

    param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
    param_dtype = norm_x.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))

    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
    out = layers.elementwise_add(x=out, y=bias, axis=-1)
    return out
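# Hedged example: a minimal NumPy check of the normalization computed above
# (scale = 1, bias = 0, begin_norm_axis = 1); illustrative only.
import numpy as np

x = np.random.rand(2, 8).astype("float32")
mean = x.mean(axis=1, keepdims=True)
var = ((x - mean) ** 2).mean(axis=1, keepdims=True)
norm_x = (x - mean) / np.sqrt(var + 1e-12)
# each row of norm_x now has ~zero mean and ~unit variance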
def attn_flow(q_enc, p_enc, p_ids_name, args):
    """Bidirectional Attention layer"""
    tag = p_ids_name + "__"
    drnn = layers.DynamicRNN()
    with drnn.block():
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_mul = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t_sum = layers.reduce_sum(input=s_t_mul, dim=1, keep_dim=True)
        s_t_re = layers.reshape(s_t_sum, shape=[-1, 0])
        s_t = layers.sequence_softmax(input=s_t_re)
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
        b_t = layers.sequence_pool(input=s_t_sum, pool_type='max')
        drnn.output(u_expr, b_t)
    U_expr, b = drnn()
    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')

    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)
    h_u = layers.elementwise_mul(x=p_enc, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=p_enc, y=H_expr, axis=0)
    g = layers.concat(input=[p_enc, U_expr, h_u, h_h], axis=1)
    return dropout(g, args)
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size, para_name, args):
    """Util function for pointer network"""

    def linear(inputs, para_name, args):
        return layers.fc(input=inputs,
                         size=size,
                         param_attr=fluid.ParamAttr(name=para_name + '_w'),
                         bias_attr=fluid.ParamAttr(name=para_name + '_b'))

    input_cat = layers.concat([hidden_t_prev, x_t], axis=1)
    forget_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_f', args))
    input_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_i', args))
    output_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_o', args))
    cell_tilde = layers.tanh(x=linear(input_cat, para_name + '_lstm_c', args))

    cell_t = layers.sums(input=[
        layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
        layers.elementwise_mul(x=input_gate, y=cell_tilde)
    ])

    hidden_t = layers.elementwise_mul(x=output_gate, y=layers.tanh(x=cell_t))

    return hidden_t, cell_t
def _create_mask(self, input_mask, append_head=False, auto_regressive=False):
    """
    Create attention mask.

    @param : input_mask
    @type : Variable(shape: [batch_size, max_seq_len])

    @param : append_head
    @type : bool

    @param : auto_regressive
    @type : bool
    """
    input_mask = fluid.layers.unsqueeze(input=input_mask, axes=[2])
    seq_len = input_mask.shape[1]

    input_mask = layers.cast(input_mask, self._dtype)
    mask1 = layers.expand(input_mask, [1, 1, seq_len])
    mask2 = layers.transpose(mask1, [0, 2, 1])
    mask = layers.elementwise_mul(mask1, mask2)

    if append_head:
        # prepend one extra position on both mask axes
        mask = layers.concat([mask[:, :1, :], mask], axis=1)
        mask = layers.concat([mask[:, :, :1], mask], axis=2)
        seq_len += 1

    if auto_regressive:
        seq_mask = self.sequence_mask[:seq_len, :seq_len]
        mask = layers.elementwise_mul(mask, seq_mask)

    mask = 1 - mask
    return mask
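# Hedged example: a NumPy sketch of how the pairwise mask above is formed from a
# per-token mask (1 = real token, 0 = padding); illustrative only.
import numpy as np

input_mask = np.array([[1, 1, 1, 0]], dtype="float32")    # [batch, seq_len]
mask1 = np.repeat(input_mask[:, :, None], 4, axis=2)      # [batch, seq_len, seq_len]
mask2 = mask1.transpose(0, 2, 1)
pair_mask = mask1 * mask2      # 1 only where both query and key are real tokens
attn_blocker = 1 - pair_mask   # 1 marks positions that should be blocked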
def mask_probs(probs, finished, noend_mask_tensor):
    finished = layers.cast(finished, dtype=probs.dtype)
    probs = layers.elementwise_mul(
        layers.expand(layers.unsqueeze(finished, [2]),
                      [1, 1, self.trg_vocab_size]),
        noend_mask_tensor,
        axis=-1) - layers.elementwise_mul(probs, (finished - 1), axis=0)
    return probs
def get_single_direction_output(rnn_input, encode_hidden, unit_list, mask=None, direc_index=0):
    rnn = StaticRNN()
    with rnn.step():
        step_input = rnn.step_input(rnn_input)
        if mask:
            step_mask = rnn.step_input(mask)

        for i in range(num_layers):
            if init_hidden:
                pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
            else:
                pre_hidden = rnn.memory(batch_ref=rnn_input,
                                        shape=[-1, hidden_size],
                                        ref_batch_dim_idx=1)
            encode_h = encode_hidden[i]
            pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
            new_hidden = unit_list[i](step_input, pre_encode_hidden)

            if mask:
                new_hidden = layers.elementwise_mul(
                    new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
            rnn.update_memory(pre_hidden, new_hidden)
            rnn.step_output(new_hidden)

            step_input = new_hidden
            if dropout_prob is not None and dropout_prob > 0.0:
                step_input = layers.dropout(step_input, dropout_prob=dropout_prob)

        rnn.step_output(step_input)
    rnn_out = rnn()

    last_hidden_array = []
    all_hidden_array = []  # added to collect all hidden states
    rnn_output = rnn_out[-1]
    for i in range(num_layers):
        last_hidden = rnn_out[i]
        all_hidden_array.append(last_hidden)
        last_hidden = last_hidden[-1]
        last_hidden_array.append(last_hidden)

    all_hidden_array = layers.concat(all_hidden_array, axis=0)
    all_hidden_array = layers.reshape(
        all_hidden_array,
        shape=[num_layers, input.shape[0], -1, hidden_size])
    last_hidden_output = layers.concat(last_hidden_array, axis=0)
    last_hidden_output = layers.reshape(last_hidden_output,
                                        shape=[num_layers, -1, hidden_size])

    return rnn_output, last_hidden_output, all_hidden_array
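# Hedged example: NumPy sketch of the masked hidden-state update used above.
# Where step_mask == 1 the new hidden state is kept; where it is 0 (padding)
# the previous hidden state is carried forward. Illustrative only.
import numpy as np

new_hidden = np.array([[1.0, 1.0], [2.0, 2.0]], dtype="float32")
pre_hidden = np.array([[9.0, 9.0], [8.0, 8.0]], dtype="float32")
step_mask = np.array([1.0, 0.0], dtype="float32")   # sample 1 is padding at this step

out = new_hidden * step_mask[:, None] - pre_hidden * (step_mask - 1.0)[:, None]
# out == [[1., 1.], [8., 8.]]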
def _birnn_encoder(self, inputs, input_len, name_lens, name_pos, name_tok_len):
    """forward

    Args:
        inputs (Variable): shape=[batch_size, max_seq_len, hidden_size]
        input_len (Variable): shape=[batch_size]
        name_lens (Variable): shape=[batch_size]
        name_pos (Variable): shape=[batch_size, max_name_len, max_tokens]
        name_tok_len (Variable): shape=[batch_size, max_name_len]

    Returns: TODO

    Raises: NULL
    """
    rnn_output, rnn_final_state = self._rnn_encoder.forward(inputs, input_len)

    max_name_len = name_pos.shape[1]
    name_begin = name_pos[:, :, 0]

    name_repr_mask = layers.sequence_mask(name_lens, max_name_len, dtype=name_tok_len.dtype)
    len_delta = layers.elementwise_mul(name_tok_len - 1, name_repr_mask, axis=0)
    name_end = name_begin + len_delta

    if self._bidirectional:
        name_fwd_repr_gathered = nn_utils.batch_gather_2d(
            rnn_output, name_end)[:, :, :self._hidden_size]
        name_bwd_repr_gathered = nn_utils.batch_gather_2d(
            rnn_output, name_begin)[:, :, self._hidden_size:]
        name_repr_gathered = layers.concat(
            input=[name_fwd_repr_gathered, name_bwd_repr_gathered], axis=-1)
        new_hidden_size = self._hidden_size * 2
    else:
        name_repr_gathered = layers.gather_nd(rnn_output, name_end)
        new_hidden_size = self._hidden_size

    name_repr_tmp = layers.reshape(
        name_repr_gathered, shape=[-1, max_name_len, new_hidden_size])
    name_repr_mask = layers.cast(name_repr_mask, dtype=name_repr_tmp.dtype)
    name_repr = layers.elementwise_mul(name_repr_tmp, name_repr_mask, axis=0)

    return name_repr, None
def sag_pool(gw, feature, ratio, graph_id, dataset, name, activation=L.tanh):
    """Implementation of self-attention graph pooling (SAGPool)

    This is an implementation of the paper SELF-ATTENTION GRAPH POOLING
    (https://arxiv.org/pdf/1904.08082.pdf)

    Args:
        gw: Graph wrapper object.
        feature: A tensor with shape (num_nodes, feature_size).
        ratio: The pooling ratio of nodes we want to select.
        graph_id: The graphs that the nodes belong to.
        dataset: To differentiate FRANKENSTEIN dataset and other datasets.
        name: The name of SAGPool layer.
        activation: The activation function.

    Return:
        new_feature: A tensor with shape (num_nodes, feature_size), and the
                     unselected nodes' feature is masked by zero.
        ratio_length: The selected node numbers of each graph.
    """
    if dataset == "FRANKENSTEIN":
        gcn_ = gcn
    else:
        gcn_ = norm_gcn

    score = gcn_(gw=gw,
                 feature=feature,
                 hidden_size=1,
                 activation=None,
                 norm=gw.node_feat["norm"],
                 name=name)
    score = L.squeeze(score, axes=[])
    perm, ratio_length = topk_pool(gw, score, graph_id, ratio)

    mask = L.zeros_like(score)
    mask = L.cast(mask, dtype="float32")
    updates = L.ones_like(perm)
    updates = L.cast(updates, dtype="float32")
    mask = L.scatter(mask, perm, updates)
    new_feature = L.elementwise_mul(feature, mask, axis=0)
    temp_score = activation(score)
    new_feature = L.elementwise_mul(new_feature, temp_score, axis=0)
    return new_feature, ratio_length
def forward(self, x):
    """
    Forward process of LayerNorm.
    """
    mean = layers.reduce_mean(
        x, dim=list(range(self._begin_norm_axis, len(x.shape))), keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x),
        dim=list(range(self._begin_norm_axis, len(x.shape))),
        keep_dim=True)
    r_stdev = layers.rsqrt(variance + self._epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
    out = layers.elementwise_mul(x=norm_x, y=self._scale_w, axis=-1)
    out = layers.elementwise_add(x=out, y=self._bias_w, axis=-1)
    return out
def forward(self, input, pre_hidden, pre_cell):
    concat_input_hidden = layers.concat([input, pre_hidden], 1)
    gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
    gate_input = layers.elementwise_add(gate_input, self._bias)
    i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
    new_cell = layers.elementwise_add(
        layers.elementwise_mul(
            pre_cell,
            layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
        layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
    new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)

    return new_hidden, new_cell
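# Hedged example: NumPy sketch of the LSTM cell update computed above, with toy
# sizes (input_size = 3, hidden_size = 2) and random weights; illustrative only.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

hidden_size = 2
x = np.random.rand(1, 3).astype("float32")
pre_h = np.zeros((1, hidden_size), dtype="float32")
pre_c = np.zeros((1, hidden_size), dtype="float32")
W = np.random.rand(3 + hidden_size, 4 * hidden_size).astype("float32")
b = np.zeros((4 * hidden_size,), dtype="float32")

gates = np.concatenate([x, pre_h], axis=1) @ W + b
i, j, f, o = np.split(gates, 4, axis=1)
new_c = pre_c * sigmoid(f + 1.0) + sigmoid(i) * np.tanh(j)   # forget_bias = 1.0
new_h = np.tanh(new_c) * sigmoid(o)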
def _select_table(condition, inputs, table_enc, table_len,
                  table_mask_by_col, ptr_net, grammar, name=None):
    """select_table.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
                           max_len is always 1 during inference.
        table_enc (TYPE): NULL
        table_len (TYPE): NULL
        table_mask_by_col (Variable): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        name (str): NULL

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    table_mask_by_len = layers.sequence_mask(table_len, maxlen=grammar.MAX_TABLE, dtype='float32')
    table_mask_by_len = layers.reshape(table_mask_by_len, [-1, grammar.MAX_TABLE])
    table_mask_by_col = layers.reshape(table_mask_by_col, [-1, grammar.MAX_TABLE])
    table_mask = layers.elementwise_mul(table_mask_by_len, table_mask_by_col)
    predicts = ptr_net.forward(inputs, table_enc, table_mask)

    zeros_l = tensor.fill_constant_batch_size_like(
        predicts, shape=[-1, grammar.grammar_size], dtype='float32', value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.MAX_COLUMN + grammar.MAX_VALUE],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
def _select_column(condition, inputs, column_enc, column_len,
                   ptr_net, grammar, column2table_mask, name=None):
    """select_column.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
                           max_len is always 1 during inference.
        column_enc (TYPE): NULL
        column_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        column2table_mask (Variable): NULL
        name (str): NULL

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    column_mask = layers.sequence_mask(column_len, maxlen=grammar.MAX_COLUMN, dtype='float32')
    column_mask = layers.reshape(column_mask, [-1, grammar.MAX_COLUMN])
    predicts = ptr_net.forward(inputs, column_enc, column_mask)

    pred_ids = layers.argmax(predicts, axis=-1)
    valid_table_mask = nn_utils.batch_gather(column2table_mask, pred_ids)

    ## concat zeros to vocab size
    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.grammar_size + grammar.MAX_TABLE],
        dtype='float32',
        value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts, shape=[-1, grammar.MAX_VALUE], dtype='float32', value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    true_valid_table_mask = layers.elementwise_mul(valid_table_mask, condition, axis=0)
    return true_final_output, true_valid_table_mask
def _process_type_leaf(condition, decoder, grammar_stack, next_inputs, finished):
    """Process when output type is LEAF

    Args:
        condition (TYPE): NULL
        decoder (TYPE): NULL
        grammar_stack (StackData): (gmr_stack_data, gmr_stack_pos)
        next_inputs (DecoderInputsWrapper): (input_var, action, grammar_mask)
        finished (TYPE): NULL

    Returns: None

    Raises: NULL
    """
    ## pop stack
    next_output, valid_pos, gmr_stack_tmp = data_structure.Stack.pop(
        grammar_stack, mask=True, in_place=False)
    valid_pos = fluider.squeeze(valid_pos, [1])

    ## update next grammar mask
    next_actions = layers.elementwise_mul(
        decoder.grammar_action(next_output),
        layers.cast(valid_pos, dtype=next_inputs.action.dtype),
        axis=0)
    next_gmr_mask = layers.elementwise_mul(
        decoder.grammar_mask(next_output),
        layers.cast(valid_pos, dtype=next_inputs.gmr_mask.dtype),
        axis=0)

    ## save result, while condition is True
    new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask = nn_utils.ifelse(
        condition,
        [gmr_stack_tmp.data, gmr_stack_tmp.pos, next_actions, next_gmr_mask],
        [grammar_stack.data, grammar_stack.pos, next_inputs.action, next_inputs.gmr_mask])

    layers.utils.map_structure(
        layers.assign,
        [new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask],
        [grammar_stack.data, grammar_stack.pos, next_inputs.action, next_inputs.gmr_mask])
    layers.logical_or(
        finished,
        layers.logical_and(condition, layers.logical_not(valid_pos)),
        out=finished)
def attention(self, hidden, encoder_output, encoder_output_proj, encoder_padding_mask):
    # Bahdanau attention, used to compute the context vector c_i
    decoder_state_proj = layers.unsqueeze(
        layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
    # combine the decoder state with every encoder output
    mixed_state = fluid.layers.elementwise_add(
        encoder_output_proj,
        layers.expand(decoder_state_proj,
                      [1, layers.shape(encoder_output_proj)[1], 1]))
    # project the combined representation down to a single score per position
    attn_scores = layers.squeeze(
        layers.fc(input=mixed_state,
                  size=1,
                  num_flatten_dims=2,
                  bias_attr=False), [2])
    if encoder_padding_mask is not None:
        attn_scores = layers.elementwise_add(attn_scores, encoder_padding_mask)
    # softmax turns the scores into attention weights
    attn_scores = layers.softmax(attn_scores)
    # the weighted sum of encoder outputs is the context vector
    context = layers.reduce_sum(
        layers.elementwise_mul(encoder_output, attn_scores, axis=0), dim=1)
    return context
def custom_dynamic_rnn(p_vec, init_state, decoder_size):
    context = layers.fc(input=p_vec, size=decoder_size, act=None)
    drnn = layers.DynamicRNN()
    with drnn.block():
        H_s = drnn.step_input(p_vec)
        ctx = drnn.static_input(context)
        c_prev = drnn.memory(init=init_state, need_reorder=True)
        m_prev = drnn.memory(init=init_state, need_reorder=True)
        m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
        m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)

        Fk = ctx + m_prev1
        Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
        logits = layers.fc(input=Fk, size=1, act=None)

        scores = layers.sequence_softmax(input=logits)
        attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
        attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')

        hidden_t, cell_t = lstm_step(attn_ctx,
                                     hidden_t_prev=m_prev1,
                                     cell_t_prev=c_prev,
                                     size=decoder_size)
        drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
        drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)
        drnn.output(scores)

    beta = drnn()
    return beta
def compute_position_embedding(radians, speaker_position_rate):
    """Compute the sin/cos interleaved matrix from the radians.

    Args:
        radians (Variable): shape(n_vocab, embed_dim), dtype float32, the radians matrix.
        speaker_position_rate (Variable): shape(B, ), speaker positioning rate.

    Returns:
        Variable: shape(B, n_vocab, embed_dim), the sin/cos interleaved matrix.
    """
    _, embed_dim = radians.shape
    batch_size = speaker_position_rate.shape[0]
    scaled_radians = F.elementwise_mul(
        F.expand(F.unsqueeze(radians, [0]), [batch_size, 1, 1]),
        speaker_position_rate,
        axis=0)

    odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
    odd_mask = dg.to_variable(odd_mask)

    out = odd_mask * F.cos(scaled_radians) \
        + (1 - odd_mask) * F.sin(scaled_radians)
    out = F.concat(
        [F.zeros((batch_size, 1, embed_dim), radians.dtype), out[:, 1:, :]],
        axis=1)
    return out
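# Hedged example: NumPy sketch of the interleaving above -- even-indexed embedding
# channels take sin, odd-indexed channels take cos. Illustrative only.
import numpy as np

embed_dim = 4
radians = np.random.rand(3, embed_dim).astype("float32")   # (n_vocab, embed_dim)
odd_mask = (np.arange(embed_dim) % 2).astype("float32")    # [0., 1., 0., 1.]
out = odd_mask * np.cos(radians) + (1 - odd_mask) * np.sin(radians)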
def input_true(x, condition, reverse=False):
    """Keep the instances in x whose corresponding condition is true.

    Args:
        x (Variable): shape = [batch_size, ...]
        condition (Variable): shape = [batch_size, 1]
        reverse (bool): Default is False

    Returns: TODO

    Raises: NULL
    """
    x_dtype = x.dtype
    if x_dtype == PaddleVarType.bool:
        x = layers.cast(x, dtype='int32')

    if condition.dtype != x.dtype:
        condition = layers.cast(condition, dtype=x.dtype)

    if reverse:
        condition = 1.0 - condition

    output = layers.elementwise_mul(x, condition, axis=0)
    if x_dtype == PaddleVarType.bool:
        output = layers.cast(output, dtype=x_dtype)
    return output
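# Hedged example: NumPy sketch of the row selection performed above -- rows whose
# condition is 0 are zeroed out; reverse flips the selection. Illustrative only.
import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0]], dtype="float32")
condition = np.array([[1.0], [0.0]], dtype="float32")
kept = x * condition                     # [[1., 2.], [0., 0.]]
reversed_kept = x * (1.0 - condition)    # [[0., 0.], [3., 4.]]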
def compute_l2_normalized_weight(v, g, dim):
    shape = v.shape
    ndim = len(shape)

    if dim is None:
        v_normalized = v / (F.reduce_sum(F.square(v)) + 1e-12)
    elif dim == 0:
        param_matrix = F.reshape(v, (shape[0], np.prod(shape[1:])))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
    elif dim == -1 or dim == ndim - 1:
        param_matrix = F.reshape(v, (np.prod(shape[:-1]), shape[-1]))
        v_normalized = F.l2_normalize(param_matrix, axis=0)
    else:
        perm = list(range(ndim))
        perm[0] = dim
        perm[dim] = 0
        transposed_param = F.transpose(v, perm)
        param_matrix = F.reshape(
            transposed_param,
            (transposed_param.shape[0], np.prod(transposed_param.shape[1:])))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
        v_normalized = F.transpose(v_normalized, perm)
    v_normalized = F.reshape(v_normalized, shape)
    weight = F.elementwise_mul(v_normalized, g, axis=dim)
    return weight
def decoder_step(gru_unit, cue_gru_unit, step_in, hidden, input_size,
                 hidden_size, memory, memory_mask, knowledge, mask=None):
    """ decoder step """
    # get attention out
    # get hidden top layers
    top_hidden = layers.slice(hidden, axes=[0], starts=[0], ends=[1])
    top_hidden = layers.squeeze(top_hidden, axes=[0])
    top_hidden = layers.unsqueeze(top_hidden, axes=[1])

    weight_memory, attn = dot_attention(top_hidden, memory, memory_mask)

    step_in = layers.unsqueeze(step_in, axes=[1])
    rnn_input_list = [step_in, weight_memory]
    if weight_memory.shape[0] == -1:
        knowledge_1 = layers.reshape(knowledge, shape=weight_memory.shape)
    else:
        knowledge_1 = knowledge
    cue_input_list = [knowledge_1, weight_memory]
    output_list = [weight_memory]

    rnn_input = layers.concat(rnn_input_list, axis=2)
    rnn_input = layers.squeeze(rnn_input, axes=[1])
    rnn_output, rnn_last_hidden = gru_unit(rnn_input, hidden, mask)

    cue_input = layers.concat(cue_input_list, axis=2)
    cue_input = layers.squeeze(cue_input, axes=[1])
    cue_rnn_out, cue_rnn_last_hidden = cue_gru_unit(cue_input, hidden, mask)

    h_y = layers.tanh(fc(rnn_last_hidden, hidden_size, hidden_size, name="dec_fc1"))
    h_cue = layers.tanh(fc(cue_rnn_last_hidden, hidden_size, hidden_size, name="dec_fc2"))

    concate_y_cue = layers.concat([h_y, h_cue], axis=2)
    k = layers.sigmoid(fc(concate_y_cue, hidden_size * 2, 1, name='dec_fc3'))
    new_hidden = h_y * k - h_cue * (k - 1.0)

    new_hidden_tmp = layers.transpose(new_hidden, perm=[1, 0, 2])
    output_list.append(new_hidden_tmp)

    real_out = layers.concat(output_list, axis=2)

    if mask:
        mask_tmp = layers.unsqueeze(mask, axes=[0])
        new_hidden = layers.elementwise_mul((new_hidden - hidden), mask_tmp, axis=0)
        new_hidden += hidden

    return real_out, new_hidden
def _weight_norm(v, g, dim):
    shape = v.shape
    ndims = len(shape)

    if dim is None:
        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
    elif dim == 0:
        p_matrix = F.reshape(v, (shape[0], -1))
        v_normalized = F.l2_normalize(p_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, shape)
    elif dim == -1 or dim == ndims - 1:
        p_matrix = F.reshape(v, (-1, shape[-1]))
        v_normalized = F.l2_normalize(p_matrix, axis=0)
        v_normalized = F.reshape(v_normalized, shape)
    else:
        perm = list(range(ndims))
        perm[0] = dim
        perm[dim] = 0
        p_transposed = F.transpose(v, perm)
        transposed_shape = p_transposed.shape
        p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1))
        v_normalized = F.l2_normalize(p_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, transposed_shape)
        v_normalized = F.transpose(v_normalized, perm)
    weight = F.elementwise_mul(
        v_normalized, g, axis=dim if dim is not None else -1)
    return weight
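# Hedged example: NumPy sketch of the weight-normalization identity the two helpers
# above implement, w = g * v / ||v||, normalized over dim = 0. Illustrative only.
import numpy as np

v = np.random.rand(4, 3).astype("float32")
g = np.random.rand(4).astype("float32")
v_norm = v / np.linalg.norm(v.reshape(4, -1), axis=1, keepdims=True)
w = v_norm * g[:, None]    # same effect as elementwise_mul(v_normalized, g, axis=0)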
def dot_product_pooling(k, v, attn_bias, dropout_rate):
    """
    Scaled dot-product attention pooling.

    :param k: (batch_size, n_head, key_len, 1)
    :param v: (batch_size, n_head, key_len, dim_per_head)
    :param attn_bias: (batch_size, n_head, key_len, key_len)
    :param dropout_rate:
    :return:
    """
    product = layers.squeeze(k, axes=[3])  # (batch_size, n_head, key_len)
    if attn_bias:
        # (batch_size, n_head, 1, key_len)
        attn_bias_sliced = fluid.layers.slice(attn_bias, axes=[2], starts=[0], ends=[1])
        product += layers.squeeze(attn_bias_sliced, axes=[2])  # (batch_size, n_head, key_len)
    weights = layers.softmax(product)  # (batch_size, n_head, key_len)
    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 dropout_implementation="upscale_in_train",
                                 is_test=False)
    pooling_out = layers.elementwise_mul(
        x=v, y=weights, axis=0)  # (batch_size, n_head, key_len, dim_per_head)
    pooling_out = layers.reduce_sum(
        pooling_out, dim=[2])  # (batch_size, n_head, dim_per_head)
    return pooling_out
def _apply_rule(condition, inputs, gmr_mask, grammar, name=None):
    """apply_rule.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
                           max_len is always 1 during inference.
        gmr_mask (TYPE): NULL
        grammar (TYPE): NULL
        name (str): NULL

    Returns: TODO

    Raises: NULL
    """
    fc_name = None
    if name is not None:
        fc_name = name + '_apply_rule_fc'

    condition = layers.cast(condition, dtype='float32')

    gmr_output = layers.fc(inputs,
                           size=grammar.grammar_size,
                           **nn_utils.param_attr(fc_name, INIT_SCALE, need_bias=True))
    gmr_output_masked = layers.elementwise_add(gmr_output, gmr_mask)

    zeros = layers.fill_constant_batch_size_like(
        gmr_output_masked,
        shape=[-1, grammar.MAX_TABLE + grammar.MAX_COLUMN + grammar.MAX_VALUE],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([gmr_output_masked, zeros], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
def func(self, place):
    # the shape of the input variable should be clearly specified, not include -1.
    shape = [2, 3, 4, 5]
    eps = 0.005
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    y = layers.data('y', shape, False, dtype)
    x.persistable = True
    y.persistable = True
    out = layers.elementwise_mul(x, y)
    x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
    y_arr = np.random.uniform(-1, 1, shape).astype(dtype)

    gradient_checker.triple_grad_check([x, y],
                                       out,
                                       x_init=[x_arr, y_arr],
                                       place=place,
                                       eps=eps)
    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
    gradient_checker.triple_grad_check_for_dygraph(self.multiply_wrapper, [x, y],
                                                   out,
                                                   x_init=[x_arr, y_arr],
                                                   place=place)
    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention

    attn_bias is a maxLen * maxLen matrix whose entries are 0 for the first L
    valid positions and -inf for the padded positions, e.g. [0 ... 0 -inf -inf -inf].
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product)

    ############################
    # added code: zero out the attention weights on masked (padded) positions
    layers.Print(attn_bias, message="The content of input layer:")
    attn_mask = attn_bias == 0
    attn_mask = layers.cast(attn_mask, 'float64')
    layers.Print(weights)
    weights = layers.elementwise_mul(attn_mask, weights)
    layers.Print(weights)
    # weights = layers.elementwise_mul(weights, attn_mask)
    ############################

    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 dropout_implementation="upscale_in_train",
                                 is_test=False)
    out = layers.matmul(weights, v)
    return out
def __init__(self, x, y, y_aux, cfg):
    self.program = fluid.default_main_program().clone()
    with fluid.program_guard(self.program):
        model = ACGAN(cfg.latent_size, cfg.num_classes)
        self.fake, self.aux = model.network_d(x, name='d')
        self.fake_loss = layers.sigmoid_cross_entropy_with_logits(
            x=self.fake, label=y)
        self.aux_loss = layers.softmax_with_cross_entropy(
            logits=self.aux, label=y_aux)
        self.unweighted_loss = layers.reduce_sum(self.fake_loss + self.aux_loss)
        self.infer_program = self.program.clone(for_test=True)

        # We don't want the discriminator to also maximize the classification
        # accuracy of the auxiliary classifier on generated images, so we
        # don't train the discriminator to produce class labels for generated
        # images (see https://openreview.net/forum?id=rJXTf9Bxg).
        # To preserve the sum of sample weights for the auxiliary classifier,
        # we assign a sample weight of 2 to the real images.
        fake_loss_weight = layers.ones(
            shape=[cfg.batch_size * 2, 1], dtype='float32')
        aux_loss_weight_zeros = layers.zeros(
            shape=[cfg.batch_size, 1], dtype='float32')
        aux_loss_weight_twos = layers.fill_constant(
            shape=[cfg.batch_size, 1], value=2.0, dtype='float32')
        aux_loss_weight = layers.concat(
            [aux_loss_weight_twos, aux_loss_weight_zeros])
        self.fake_loss = layers.elementwise_mul(self.fake_loss, fake_loss_weight)
        self.aux_loss = layers.elementwise_mul(self.aux_loss, aux_loss_weight)
        self.loss = layers.reduce_sum(self.fake_loss) + layers.reduce_sum(self.aux_loss)

        vars = []
        for var in self.program.list_vars():
            if fluid.io.is_parameter(var) and var.name.startswith("d"):
                vars.append(var.name)
        optimizer = fluid.optimizer.Adam(learning_rate=cfg.adam_lr,
                                         beta1=cfg.adam_beta_1,
                                         name="net_d")
        optimizer.minimize(self.loss, parameter_list=vars)
def _dygraph_clip(self, params_grads):
    params_and_grads = []

    # clip by value first
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            params_and_grads.append((p, g))
            continue
        new_grad = layers.clip(x=g, min=-self.clip_value, max=self.clip_value)
        params_and_grads.append((p, new_grad))
    params_grads = params_and_grads

    # then clip by global norm
    params_and_grads = []
    sum_square_list = []
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            continue
        merge_grad = g
        if g.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = layers.merge_selected_rows(g)
            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
        square = layers.square(merge_grad)
        sum_square = layers.reduce_sum(square)
        sum_square_list.append(sum_square)

    # all parameters have been filtered out
    if len(sum_square_list) == 0:
        return params_grads

    global_norm_var = layers.concat(sum_square_list)
    global_norm_var = layers.reduce_sum(global_norm_var)
    global_norm_var = layers.sqrt(global_norm_var)
    max_global_norm = layers.fill_constant(
        shape=[1], dtype='float32', value=self.clip_norm)
    clip_var = layers.elementwise_div(
        x=max_global_norm,
        y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
    for p, g in params_grads:
        if g is None:
            continue
        if self._need_clip_func is not None and not self._need_clip_func(p):
            params_and_grads.append((p, g))
            continue
        new_grad = layers.elementwise_mul(x=g, y=clip_var)
        params_and_grads.append((p, new_grad))

    return params_and_grads
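# Hedged example: NumPy sketch of the global-norm clipping factor used above,
# scale = clip_norm / max(global_norm, clip_norm); illustrative only.
import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]
clip_norm = 5.0
global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))   # 13.0
scale = clip_norm / max(global_norm, clip_norm)             # 5 / 13
clipped = [g * scale for g in grads]                        # global norm is now 5.0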
def forward(self, input, state):
    pre_hidden, pre_cell = state

    # i, f, c, o each take the form W·x + W·h + b, i.e. W·[x, h] + b, so:
    #   the actual product is [x, h]·W + b
    #   [x, h] is the horizontal concatenation, shape [batch_size, input_size + hidden_size]
    #   W has shape [input_size + hidden_size, 4 * hidden_size]
    #   b has shape [4 * hidden_size]

    # concatenate input and previous hidden state
    # shape: [batch_size, input_size + hidden_size]
    concat_input_hidden = L.concat([input, pre_hidden], axis=1)

    # compute W·x + W·h + b
    # shape: [batch_size, 4 * hidden_size]
    gate_input = L.matmul(x=concat_input_hidden, y=self._weight)
    gate_input = L.elementwise_add(gate_input, self._bias)

    # split into i, f, c, o along the last axis, so each has hidden_size on that axis
    i, f, c, o = L.split(gate_input, num_or_sections=4, dim=-1)

    # new_c = pre_c * sigmoid(f + forget_bias) + sigmoid(i) * tanh(c)
    # shape: [batch_size, hidden_size]
    new_cell = L.elementwise_add(
        L.elementwise_mul(
            pre_cell,
            L.sigmoid(L.elementwise_add(f, self._forget_bias))),
        L.elementwise_mul(L.sigmoid(i), L.tanh(c)))

    # new_h = tanh(new_c) * sigmoid(o)
    # shape: [batch_size, hidden_size]
    new_hidden = L.tanh(new_cell) * L.sigmoid(o)

    return new_hidden, [new_hidden, new_cell]
def build_model(self, enc_input, dec_input, tgt_label, label_weights):
    """Build the model with source encoding and target decoding"""
    enc_word_output, enc_sen_output = self.encode(enc_input)
    dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

    predict_token_idx = layers.argmax(dec_output, axis=-1)
    correct_token_idx = layers.cast(
        layers.equal(tgt_label,
                     layers.reshape(predict_token_idx, shape=[-1, 1])),
        dtype='float32')
    weighted_correct = layers.elementwise_mul(
        x=correct_token_idx, y=label_weights, axis=0)
    sum_correct = layers.reduce_sum(weighted_correct)
    sum_correct.stop_gradient = True

    # Padding indices do not contribute to the total loss. The weights are used
    # to cancel padding indices when calculating the loss.
    if self._label_smooth_eps:
        # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removes
        # the enforcement that the last dimension of label must be 1.
        tgt_label = layers.label_smooth(
            label=layers.one_hot(input=tgt_label, depth=self.voc_size),
            epsilon=self._label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=dec_output,
        label=tgt_label,
        soft_label=True if self._label_smooth_eps else False)
    weighted_cost = layers.elementwise_mul(x=cost, y=label_weights, axis=0)
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(label_weights)
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num

    graph_vars = {
        "loss": avg_cost,
        "sum_correct": sum_correct,
        "token_num": token_num,
    }
    for k, v in graph_vars.items():
        v.persistable = True

    return graph_vars
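# Hedged example: NumPy sketch of the label-weighted loss averaging above; padded
# positions get weight 0 and drop out of the average. Illustrative only.
import numpy as np

cost = np.array([[0.4], [0.9], [1.3]], dtype="float32")           # per-token loss
label_weights = np.array([[1.0], [1.0], [0.0]], dtype="float32")  # last token is padding
avg_cost = (cost * label_weights).sum() / label_weights.sum()     # == (0.4 + 0.9) / 2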
def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches, trg_src_attn_bias):
    # gather cell states corresponding to the selected parent
    pre_caches = map_structure(
        lambda x: layers.gather(x, index=gather_idx), caches)
    pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=gather_idx)

    pre_pos = layers.elementwise_mul(
        x=layers.fill_constant_batch_size_like(
            input=pre_src_attn_bias,  # can't use a lod tensor here
            value=1,
            shape=[-1, 1],
            dtype=pre_ids.dtype),
        y=step_idx,
        axis=0)
    logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
                          trg_vocab_size,
                          max_in_len,
                          n_layer,
                          n_head,
                          d_key,
                          d_value,
                          d_model,
                          d_inner_hid,
                          prepostprocess_dropout,
                          attention_dropout,
                          relu_dropout,
                          preprocess_cmd,
                          postprocess_cmd,
                          weight_sharing,
                          enc_output=enc_output,
                          caches=pre_caches,
                          bos_idx=bos_idx)

    # intra-beam top-K
    topk_scores, topk_indices = layers.topk(
        input=layers.softmax(logits), k=beam_size)
    accu_scores = layers.elementwise_add(
        x=layers.log(topk_scores), y=pre_scores, axis=0)
    # beam_search op uses lod to differentiate branches.
    accu_scores = layers.lod_reset(accu_scores, pre_ids)
    # top-K reduction across beams, which also handles finished beams
    # and finished sentences (batch reduction)
    selected_ids, selected_scores, gather_idx = layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=topk_indices,
        scores=accu_scores,
        beam_size=beam_size,
        end_id=eos_idx,
        return_parent_idx=True)
    step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
    layers.array_write(selected_ids, i=step_idx, array=ids)
    layers.array_write(selected_scores, i=step_idx, array=scores)
    return (step_idx, selected_ids, selected_scores, gather_idx,
            pre_caches, pre_src_attn_bias)
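# Hedged example: NumPy sketch of the accumulated-score update above,
# accu = log(topk_prob) + previous beam score; illustrative only.
import numpy as np

pre_scores = np.array([-1.2, -0.7])                      # scores of 2 beams
topk_probs = np.array([[0.5, 0.3], [0.6, 0.2]])          # top-2 token probs per beam
accu_scores = np.log(topk_probs) + pre_scores[:, None]   # candidates for the next step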