def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
    """
    Scaled Dot-Product Attention
    """

    # FIXME(guosheng): Optimize the shapes in reshape_op or softmax_op.
    # The current implementation of softmax_op only supports 2-D tensors,
    # so it cannot be used here directly. reshape_op cannot be used either,
    # because the shape of `product` inferred at compile time is not the
    # actual run-time shape and thus cannot be used to set the shape
    # attribute of reshape_op. A local softmax is therefore defined as a
    # temporary workaround.
    def __softmax(x, eps=1e-9):
        exp_out = layers.exp(x=x)
        sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
        return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

    scaled_q = layers.scale(x=q, scale=d_model**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
    if dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    out = layers.matmul(weights, v)
    return out
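# Illustration only (not part of the model code above): a minimal NumPy sketch of
# what scaled_dot_product_attention computes, assuming 2-D [seq_len, d_model]
# inputs. Note the max-subtraction trick, which the hand-rolled __softmax above
# omits and which a production softmax would normally include.
import numpy as np

def ref_scaled_dot_product_attention(q, k, v, attn_bias, d_model):
    scores = (q / np.sqrt(d_model)) @ k.T + attn_bias        # [seq_q, seq_k]
    scores -= scores.max(axis=-1, keepdims=True)             # numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)           # row-wise softmax
    return weights @ v                                       # [seq_q, d_model]

_q = _k = _v = np.random.rand(4, 8).astype("float32")
_bias = np.zeros((4, 4), dtype="float32")
assert ref_scaled_dot_product_attention(_q, _k, _v, _bias, 8).shape == (4, 8)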
def _ernie_model_forward(self, src_ids, sent_ids=None, pos_ids=None, input_mask=None, attn_bias=None, past_cache=None, use_causal_mask=False, num_layers=12, depth=1., head_mask=None): assert len( src_ids.shape ) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % (repr( src_ids.shape)) assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None' d_batch = L.shape(src_ids)[0] d_seqlen = L.shape(src_ids)[1] if pos_ids is None: pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1]) pos_ids = L.cast(pos_ids, 'int64') if attn_bias is None: if input_mask is None: input_mask = L.cast(src_ids != 0, 'float32') assert len(input_mask.shape) == 2 input_mask = L.unsqueeze(input_mask, axes=[-1]) attn_bias = L.matmul(input_mask, input_mask, transpose_y=True) if use_causal_mask: sequence = L.reshape( L.range(0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1]) causal_mask = L.cast( (L.matmul(sequence, 1. / sequence, transpose_y=True) >= 1.), 'float32') attn_bias *= causal_mask else: assert len( attn_bias.shape ) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape attn_bias = (1. - attn_bias) * -10000.0 attn_bias = L.unsqueeze(attn_bias, [1]) attn_bias.stop_gradient = True if sent_ids is None: sent_ids = L.zeros_like(src_ids) if head_mask is not None: if len(head_mask.shape) == 1: head_mask = L.unsqueeze( L.unsqueeze(L.unsqueeze(L.unsqueeze(head_mask, 0), 0), -1), -1) head_mask = L.expand(head_mask, expand_times=[num_layers, 1, 1, 1, 1]) elif len(head_mask.shape) == 2: head_mask = L.unsqueeze(L.unsqueeze(L.unsqueeze(head_mask, 1), -1), -1) else: head_mask = [None] * num_layers src_embedded = self.word_emb(src_ids) pos_embedded = self.pos_emb(pos_ids) sent_embedded = self.sent_emb(sent_ids) embedded = src_embedded + pos_embedded + sent_embedded embedded = self.dropout(self.ln(embedded)) encoded, hidden_list, cache_list = self.encoder_stack( embedded, attn_bias, past_cache=past_cache, num_layers=num_layers, depth_mult=depth, head_mask=head_mask) if self.pooler is not None: pooled = self.pooler(encoded[:, 0, :]) else: pooled = None additional_info = { 'hiddens': hidden_list, 'caches': cache_list, } if self.return_additional_info: return pooled, encoded, additional_info else: return pooled, encoded
def scaled_dot_product_attention(q, k, v, attn_bias, biaffine_transformation, biaffine_transformation_bias, structure_mask, with_ent_structure, d_key, dropout_rate): """ Scaled Dot-Product Attention """ scaled_q = layers.scale(x=q, scale=d_key**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) if with_ent_structure: # TRANSFORMATION # 1.reshape input # q: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden] -> [bs, 5, n_head, seq, hidden] # -> [5, n_head, bs, seq, hidden] -> [5, n_head, bs * seq, hidden] # transformation: [dependencies(5), n_head, hidden, hidden] # k: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden] q_ = layers.unsqueeze(scaled_q, [1]) q_ = layers.expand(q_, [1, biaffine_transformation.shape[0], 1, 1, 1]) q_ = layers.transpose(q_, perm=[1, 2, 0, 3, 4]) q_ = layers.reshape( q_, shape=[0, 0, -1, biaffine_transformation.shape[3]], inplace=True) k_ = layers.unsqueeze(k, [1]) k_ = layers.expand(k_, [1, biaffine_transformation.shape[0], 1, 1, 1]) # 2.implement matmul # q * transformation: [5, n_head, bs * seq, hidden] # q * transformation: [5, n_head, bs * seq, hidden] -> [5, n_head, bs, seq, hidden] # -> [bs, dependencies(5), n_head, seq, hidden] # q * transformation * k: [bs, dependencies(5), n_head, seq, seq] structured_bias = layers.matmul(x=q_, y=biaffine_transformation) structured_bias = layers.reshape( structured_bias, shape=[0, 0, -1, k_.shape[3], k_.shape[4]], inplace=True) structured_bias = layers.transpose(structured_bias, perm=[2, 0, 1, 3, 4]) structured_bias = layers.matmul(x=structured_bias, y=k_, transpose_y=True) structured_bias = layers.elementwise_add( structured_bias, biaffine_transformation_bias, axis=1) # mask & apply structured_bias = structured_bias * structure_mask structured_bias = layers.reduce_sum(structured_bias, dim=1) product += structured_bias if attn_bias: product += attn_bias weights = layers.softmax(product) if dropout_rate: weights = layers.dropout(weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) out = layers.matmul(weights, v) return out
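# Illustration only: a NumPy sketch of the structured (biaffine) bias built above,
# with the batch dimension dropped and toy sizes. Shapes assumed here:
# q, k: [n_head, seq, hidden]; transformation W: [n_dep, n_head, hidden, hidden];
# transformation bias b: [n_dep]; structure_mask: [n_dep, n_head, seq, seq].
import numpy as np

def ref_structured_bias(q, k, W, b, structure_mask):
    n_dep, n_head, _, _ = W.shape
    seq = q.shape[1]
    bias = np.zeros((n_dep, n_head, seq, seq), dtype=q.dtype)
    for d in range(n_dep):
        for h in range(n_head):
            # biaffine score q_i W_d k_j^T + b_d for every (query, key) pair
            bias[d, h] = q[h] @ W[d, h] @ k[h].T + b[d]
    # mask out inactive dependency types, then sum over them
    return (bias * structure_mask).sum(axis=0)

_rng = np.random.default_rng(0)
_q = _rng.standard_normal((2, 5, 4))
_k = _rng.standard_normal((2, 5, 4))
_W = _rng.standard_normal((3, 2, 4, 4))
_b = _rng.standard_normal(3)
_mask = _rng.integers(0, 2, (3, 2, 5, 5)).astype(float)
print(ref_structured_bias(_q, _k, _W, _b, _mask).shape)  # (2, 5, 5)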
def forward(self): src, dsts = L.read_file(self.pyreader) if self.is_sparse: # sparse mode use 2 dims input. src = L.reshape(src, [-1, 1]) dsts = L.reshape(dsts, [-1, 1]) if self.num_part is not None and self.num_part != 1 and not self.is_distributed: src_embed = split_embedding(src, self.num_nodes, self.hidden_size, self.embed_init, "weight", self.num_part, self.is_sparse, learning_rate=self.embedding_lr) dsts_embed = split_embedding(dsts, self.num_nodes, self.hidden_size, self.embed_init, "weight", self.num_part, self.is_sparse, learning_rate=self.embedding_lr) else: print("Leo: L.embedding:", self.num_nodes) src_embed = L.embedding(src, (self.num_nodes, self.hidden_size), self.is_sparse, self.is_distributed, param_attr=F.ParamAttr( name="weight", learning_rate=self.embedding_lr, initializer=self.embed_init)) print("Leo: L.embedding:", src_embed) dsts_embed = L.embedding(dsts, (self.num_nodes, self.hidden_size), self.is_sparse, self.is_distributed, param_attr=F.ParamAttr( name="weight", learning_rate=self.embedding_lr, initializer=self.embed_init)) if self.is_sparse: # reshape back src_embed = L.reshape(src_embed, [-1, 1, self.hidden_size]) dsts_embed = L.reshape(dsts_embed, [-1, self.neg_num + 1, self.hidden_size]) logits = L.matmul(src_embed, dsts_embed, transpose_y=True) # [batch_size, 1, neg_num+1] pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1], "float32", 1) neg_label = L.fill_constant_batch_size_like(logits, [-1, 1, self.neg_num], "float32", 0) label = L.concat([pos_label, neg_label], -1) pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1], "float32", self.neg_num) neg_weight = L.fill_constant_batch_size_like(logits, [-1, 1, self.neg_num], "float32", 1) weight = L.concat([pos_weight, neg_weight], -1) weight.stop_gradient = True label.stop_gradient = True loss = L.sigmoid_cross_entropy_with_logits(logits, label) loss = loss * weight loss = L.reduce_mean(loss) loss = loss * ((self.neg_num + 1) / 2 / self.neg_num) loss.persistable = True self.loss = loss return loss
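# Illustration only: the label/weight construction used in the loss above, written
# in NumPy for one source node with neg_num negative samples. The positive pair
# gets label 1 and weight neg_num, each negative gets label 0 and weight 1, and
# the final scaling keeps the loss magnitude comparable across neg_num settings.
import numpy as np

def ref_skipgram_loss(logits, neg_num):
    # logits: [1, neg_num + 1]; the first column is the positive sample
    label = np.concatenate([np.ones((1, 1)), np.zeros((1, neg_num))], axis=-1)
    weight = np.concatenate([np.full((1, 1), float(neg_num)),
                             np.ones((1, neg_num))], axis=-1)
    # sigmoid cross-entropy with logits: max(x, 0) - x*z + log(1 + exp(-|x|))
    loss = np.maximum(logits, 0) - logits * label + np.log1p(np.exp(-np.abs(logits)))
    loss = (loss * weight).mean()
    return loss * ((neg_num + 1) / 2 / neg_num)

print(ref_skipgram_loss(np.array([[2.0, -1.0, 0.5, -0.3]]), neg_num=3))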
def forward(self, src_ids, sent_ids=None, pos_ids=None, input_mask=None, attn_bias=None, past_cache=None, use_causal_mask=False): """ Args: src_ids (`Variable` of shape `[batch_size, seq_len]`): Indices of input sequence tokens in the vocabulary. sent_ids (optional, `Variable` of shape `[batch_size, seq_len]`): aka token_type_ids, Segment token indices to indicate first and second portions of the inputs. if None, assume all tokens come from `segment_a` pos_ids(optional, `Variable` of shape `[batch_size, seq_len]`): Indices of positions of each input sequence tokens in the position embeddings. input_mask(optional `Variable` of shape `[batch_size, seq_len]`): Mask to avoid performing attention on the padding token indices of the encoder input. attn_bias(optional, `Variable` of shape `[batch_size, seq_len, seq_len] or False`): 3D version of `input_mask`, if set, overrides `input_mask`; if set not False, will not apply attention mask past_cache(optional, tuple of two lists: cached key and cached value, each is a list of `Variable`s of shape `[batch_size, seq_len, hidden_size]`): cached key/value tensor that will be concated to generated key/value when performing self attention. if set, `attn_bias` should not be None. Returns: pooled (`Variable` of shape `[batch_size, hidden_size]`): output logits of pooler classifier encoded(`Variable` of shape `[batch_size, seq_len, hidden_size]`): output logits of transformer stack info (Dictionary): addtional middle level info, inclues: all hidden stats, k/v caches. """ # d_batch, d_seqlen = src_ids.shape assert len(src_ids.shape) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % (repr(src_ids.shape)) assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None' d_batch = L.shape(src_ids)[0] d_seqlen = L.shape(src_ids)[1] if pos_ids is None: pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1]) pos_ids = L.cast(pos_ids, 'int64') if attn_bias is None: if input_mask is None: input_mask = L.cast(src_ids != 0, 'float32') assert len(input_mask.shape) == 2 input_mask = L.unsqueeze(input_mask, axes=[-1]) attn_bias = L.matmul(input_mask, input_mask, transpose_y=True) if use_causal_mask: sequence = L.reshape(L.range(0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1]) causal_mask = L.cast((L.matmul(sequence, 1. / sequence, transpose_y=True) >= 1.), 'float32') attn_bias *= causal_mask else: assert len(attn_bias.shape) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape attn_bias = (1. - attn_bias) * -10000.0 attn_bias = L.unsqueeze(attn_bias, [1]) attn_bias = L.expand(attn_bias, [1, self.n_head, 1, 1]) # avoid broadcast =_= attn_bias.stop_gradient = True if sent_ids is None: sent_ids = L.zeros_like(src_ids) src_embedded = self.word_emb(src_ids) pos_embedded = self.pos_emb(pos_ids) sent_embedded = self.sent_emb(sent_ids) embedded = src_embedded + pos_embedded + sent_embedded embedded = self.dropout(self.ln(embedded)) encoded, hidden_list, cache_list = self.encoder_stack(embedded, attn_bias, past_cache=past_cache) if self.pooler is not None: pooled = self.pooler(encoded[:, 0, :]) else: pooled = None additional_info = { 'hiddens': hidden_list, 'caches': cache_list, } if self.return_additional_info: return pooled, encoded, additional_info else: return pooled, encoded
def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train', beam_size=10): softmax_weight = layers.create_parameter([self.hidden_size, self.tar_vocab_size], dtype="float32", name="softmax_weight", \ default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale)) if mode == 'train': dec_output, dec_last_hidden, dec_last_cell = basic_lstm( self.tar_emb, enc_last_hidden, enc_last_cell, \ self.hidden_size, num_layers=self.num_layers, \ batch_first=self.batch_first, \ dropout_prob=self.dropout, \ param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale) ), \ bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) )) dec_output = layers.matmul(dec_output, softmax_weight) return dec_output elif mode == 'beam_search' or mode == 'greedy_search': dec_unit_list = [] name = 'basic_lstm' for i in range(self.num_layers): new_name = name + "_layers_" + str(i) dec_unit_list.append( BasicLSTMUnit(new_name, self.hidden_size, dtype='float32')) def decoder_step(current_in, pre_hidden_array, pre_cell_array): new_hidden_array = [] new_cell_array = [] step_in = current_in for i in range(self.num_layers): pre_hidden = pre_hidden_array[i] pre_cell = pre_cell_array[i] new_hidden, new_cell = dec_unit_list[i](step_in, pre_hidden, pre_cell) new_hidden_array.append(new_hidden) new_cell_array.append(new_cell) step_in = new_hidden return step_in, new_hidden_array, new_cell_array if mode == 'beam_search': max_src_seq_len = layers.shape(self.src)[1] max_length = max_src_seq_len * 2 #max_length = layers.fill_constant( [1], dtype='int32', value = 10) pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1) full_ids = layers.fill_constant([1, 1], dtype='int64', value=1) score = layers.fill_constant([1], dtype='float32', value=0.0) #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2) pre_hidden_array = [] pre_cell_array = [] pre_feed = layers.fill_constant([beam_size, self.hidden_size], dtype='float32', value=0) for i in range(self.num_layers): pre_hidden_array.append( layers.expand(enc_last_hidden[i], [beam_size, 1])) pre_cell_array.append( layers.expand(enc_last_cell[i], [beam_size, 1])) eos_ids = layers.fill_constant([beam_size], dtype='int64', value=2) init_score = np.zeros((beam_size)).astype('float32') init_score[1:] = -INF pre_score = layers.assign(init_score) #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0) tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1) enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1]) pre_tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1) finished_seq = layers.fill_constant([beam_size, 1], dtype='int64', value=0) finished_scores = layers.fill_constant([beam_size], dtype='float32', value=-INF) finished_flag = layers.fill_constant([beam_size], dtype='float32', value=0.0) step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0) cond = layers.less_than(x=step_idx, y=max_length) # default force_cpu=True parent_idx = layers.fill_constant([1], dtype='int32', value=0) while_op = layers.While(cond) def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags, beam_size, select_beam=None, generate_id=None): scores = layers.reshape(scores, shape=[1, -1]) _, topk_indexs = layers.topk(scores, k=beam_size) topk_indexs = layers.reshape(topk_indexs, shape=[-1]) # gather result top_seq = layers.gather(sequences, topk_indexs) topk_flags = layers.gather(flags, topk_indexs) 
topk_gather_scores = layers.gather(scores_to_gather, topk_indexs) if select_beam: topk_beam = layers.gather(select_beam, topk_indexs) else: topk_beam = select_beam if generate_id: topk_id = layers.gather(generate_id, topk_indexs) else: topk_id = generate_id return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, select_beam, generate_id): curr_scores += curr_finished * -INF return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs, curr_finished, beam_size, select_beam, generate_id=generate_id) def grow_finished(finished_seq, finished_scores, finished_flag, curr_seq, curr_scores, curr_finished): finished_seq = layers.concat([ finished_seq, layers.fill_constant( [beam_size, 1], dtype='int64', value=1) ], axis=1) curr_scores += (1.0 - curr_finished) * -INF #layers.Print( curr_scores, message="curr scores") curr_finished_seq = layers.concat([finished_seq, curr_seq], axis=0) curr_finished_scores = layers.concat( [finished_scores, curr_scores], axis=0) curr_finished_flags = layers.concat( [finished_flag, curr_finished], axis=0) return compute_topk_scores_and_seq(curr_finished_seq, curr_finished_scores, curr_finished_scores, curr_finished_flags, beam_size) def is_finished(alive_log_prob, finished_scores, finished_in_finished): max_out_len = 200 max_length_penalty = layers.pow( layers.fill_constant([1], dtype='float32', value=((5.0 + max_out_len) / 6.0)), alpha) lower_bound_alive_score = layers.slice( alive_log_prob, starts=[0], ends=[1], axes=[0]) / max_length_penalty lowest_score_of_fininshed_in_finished = finished_scores * finished_in_finished lowest_score_of_fininshed_in_finished += ( 1.0 - finished_in_finished) * -INF lowest_score_of_fininshed_in_finished = layers.reduce_min( lowest_score_of_fininshed_in_finished) met = layers.less_than( lower_bound_alive_score, lowest_score_of_fininshed_in_finished) met = layers.cast(met, 'float32') bound_is_met = layers.reduce_sum(met) finished_eos_num = layers.reduce_sum(finished_in_finished) finish_cond = layers.less_than( finished_eos_num, layers.fill_constant([1], dtype='float32', value=beam_size)) return finish_cond def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx): pre_ids = alive_seq dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) current_log = layers.elementwise_add(x=layers.log(logits), y=alive_log_prob, axis=0) base_1 = layers.cast(step_idx, 'float32') + 6.0 base_1 /= 6.0 length_penalty = layers.pow(base_1, alpha) len_pen = layers.pow( ((5. 
+ layers.cast(step_idx + 1, 'float32')) / 6.), alpha) current_log = layers.reshape(current_log, shape=[1, -1]) current_log = current_log / length_penalty topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size) topk_scores = layers.reshape(topk_scores, shape=[-1]) topk_log_probs = topk_scores * length_penalty generate_id = layers.reshape( topk_indices, shape=[-1]) % self.tar_vocab_size selected_beam = layers.reshape( topk_indices, shape=[-1]) // self.tar_vocab_size topk_finished = layers.equal(generate_id, eos_ids) topk_finished = layers.cast(topk_finished, 'float32') generate_id = layers.reshape(generate_id, shape=[-1, 1]) pre_tokens_list = layers.gather(tokens, selected_beam) full_tokens_list = layers.concat( [pre_tokens_list, generate_id], axis=1) return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \ dec_att_out, new_hidden_array, new_cell_array with while_op.block(): topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \ grow_top_k( step_idx, pre_tokens, pre_score, parent_idx) alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive( topk_seq, topk_scores, topk_log_probs, topk_finished, topk_beam, topk_generate_id) finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished( finished_seq, finished_scores, finished_flag, topk_seq, topk_scores, topk_finished) finished_cond = is_finished(alive_log_prob, finished_scores_2, finished_flags_2) layers.increment(x=step_idx, value=1.0, in_place=True) layers.assign(alive_beam, parent_idx) layers.assign(alive_id, pre_tokens) layers.assign(alive_log_prob, pre_score) layers.assign(alive_seq, tokens) layers.assign(finished_seq_2, finished_seq) layers.assign(finished_scores_2, finished_scores) layers.assign(finished_flags_2, finished_flag) # update init_hidden, init_cell, input_feed new_feed = layers.gather(attention_out, parent_idx) layers.assign(new_feed, pre_feed) for i in range(self.num_layers): new_hidden_var = layers.gather(new_hidden_array[i], parent_idx) layers.assign(new_hidden_var, pre_hidden_array[i]) new_cell_var = layers.gather(new_cell_array[i], parent_idx) layers.assign(new_cell_var, pre_cell_array[i]) length_cond = layers.less_than(x=step_idx, y=max_length) layers.logical_and(x=length_cond, y=finished_cond, out=cond) tokens_with_eos = tokens all_seq = layers.concat([tokens_with_eos, finished_seq], axis=0) all_score = layers.concat([pre_score, finished_scores], axis=0) _, topk_index = layers.topk(all_score, k=beam_size) topk_index = layers.reshape(topk_index, shape=[-1]) final_seq = layers.gather(all_seq, topk_index) final_score = layers.gather(all_score, topk_index) return final_seq elif mode == 'greedy_search': max_src_seq_len = layers.shape(self.src)[1] max_length = max_src_seq_len * 2 #max_length = layers.fill_constant( [1], dtype='int32', value = 10) pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1) full_ids = layers.fill_constant([1, 1], dtype='int64', value=1) score = layers.fill_constant([1], dtype='float32', value=0.0) eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2) pre_hidden_array = [] pre_cell_array = [] pre_feed = layers.fill_constant([1, self.hidden_size], dtype='float32', value=0) for i in range(self.num_layers): pre_hidden_array.append(enc_last_hidden[i]) pre_cell_array.append(enc_last_cell[i]) #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) ) #pre_cell_array.append( layers.fill_constant( [1, 
hidden_size], dtype='float32', value=0) ) step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0) cond = layers.less_than(x=step_idx, y=max_length) # default force_cpu=True while_op = layers.While(cond) with while_op.block(): dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) logits = layers.log(logits) current_log = layers.elementwise_add(logits, score, axis=0) topk_score, topk_indices = layers.topk(input=current_log, k=1) new_ids = layers.concat([full_ids, topk_indices]) layers.assign(new_ids, full_ids) #layers.Print( full_ids, message="ful ids") layers.assign(topk_score, score) layers.assign(topk_indices, pre_ids) layers.assign(dec_att_out, pre_feed) for i in range(self.num_layers): layers.assign(new_hidden_array[i], pre_hidden_array[i]) layers.assign(new_cell_array[i], pre_cell_array[i]) layers.increment(x=step_idx, value=1.0, in_place=True) eos_met = layers.not_equal(topk_indices, eos_ids) length_cond = layers.less_than(x=step_idx, y=max_length) layers.logical_and(x=length_cond, y=eos_met, out=cond) return full_ids raise Exception("error") else: print("mode not supprt", mode)
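# Illustration only: the GNMT-style length penalty used by grow_top_k above.
# Candidate log-probabilities are divided by it before top-k selection and
# multiplied back afterwards to recover the un-normalised log-probs.
import numpy as np

def length_penalty(step_idx, alpha):
    # ((5 + length) / 6) ** alpha with length = step_idx + 1, as in GNMT
    return ((6.0 + step_idx) / 6.0) ** alpha

_log_probs = np.array([-1.2, -3.4, -0.7])
_alpha = 0.6  # hypothetical value; `alpha` is defined outside the snippet above
print(_log_probs / length_penalty(step_idx=4, alpha=_alpha))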
def net(self, inputs, is_infer=False): if is_infer: bs = self.evaluate_batch_size else: bs = self.train_batch_size stdv = 1.0 / math.sqrt(self.hidden_size) def embedding_layer(input, table_name, emb_dim, initializer_instance=None): emb = fluid.embedding(input=input, size=[self.dict_size, emb_dim], param_attr=fluid.ParamAttr( name=table_name, initializer=initializer_instance)) return emb sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) items_emb = embedding_layer(inputs[0], "emb", self.hidden_size, sparse_initializer) pre_state = items_emb for i in range(self.step): pre_state = layers.reshape(x=pre_state, shape=[bs, -1, self.hidden_size]) state_in = layers.fc( input=pre_state, name="state_in", size=self.hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer. Uniform(low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", size=self.hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer. Uniform(low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_adj_in = layers.matmul(inputs[3], state_in) # [batch_size, uniq_max, h] state_adj_out = layers.matmul( inputs[4], state_out) # [batch_size, uniq_max, h] gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) gru_input = layers.reshape(x=gru_input, shape=[-1, self.hidden_size * 2]) gru_fc = layers.fc(input=gru_input, name="gru_fc", size=3 * self.hidden_size, bias_attr=False) pre_state, _, _ = fluid.layers.gru_unit( input=gru_fc, hidden=layers.reshape(x=pre_state, shape=[-1, self.hidden_size]), size=3 * self.hidden_size) final_state = layers.reshape(pre_state, shape=[bs, -1, self.hidden_size]) seq = layers.gather_nd(final_state, inputs[1]) last = layers.gather_nd(final_state, inputs[2]) seq_fc = layers.fc( input=seq, name="seq_fc", size=self.hidden_size, bias_attr=False, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, h] last_fc = layers.fc( input=last, name="last_fc", size=self.hidden_size, bias_attr=False, act=None, num_flatten_dims=1, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [bathc_size, h] seq_fc_t = layers.transpose(seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] add = layers.elementwise_add(seq_fc_t, last_fc) # [seq_max, batch_size, h] b = layers.create_parameter( shape=[self.hidden_size], dtype='float32', default_initializer=fluid.initializer.Constant(value=0.0)) # [h] add = layers.elementwise_add(add, b) # [seq_max, batch_size, h] add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h] add_sigmoid = layers.transpose(add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h] weight = layers.fc( input=add_sigmoid, name="weight_fc", size=1, act=None, num_flatten_dims=2, bias_attr=False, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, 1] weight *= inputs[5] weight_mask = layers.elementwise_mul( seq, weight, axis=0) # [batch_size, seq_max, h] global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h] final_attention = layers.concat([global_attention, last], axis=1) # [batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention, name="final_attention_fc", 
size=self.hidden_size, bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, h] # all_vocab = layers.create_global_var( # shape=[items_num - 1], # value=0, # dtype="int64", # persistable=True, # name="all_vocab") all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32') all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64') all_emb = fluid.embedding( input=all_vocab, param_attr=fluid.ParamAttr(name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[self.dict_size, self.hidden_size]) # [all_vocab, h] logits = layers.matmul(x=final_attention_fc, y=all_emb, transpose_y=True) # [batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy( logits=logits, label=inputs[6]) # [batch_size, 1] self.loss = layers.reduce_mean(softmax) # [1] acc = RecallK(input=logits, label=inputs[6], k=20) self._cost = self.loss if is_infer: self._infer_results['P@20'] = acc self._infer_results['LOSS'] = self.loss return self._metrics["LOSS"] = self.loss self._metrics["Train_P@20"] = acc
def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): input = rnn.step_input(input_embedding) for k in range(num_layers): pre_hidden = rnn.memory(init=hidden_array[k]) pre_cell = rnn.memory(init=cell_array[k]) weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) #i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) i = layers.slice(gate_input, axes=[1], starts=[0], ends=[hidden_size]) j = layers.slice(gate_input, axes=[1], starts=[hidden_size], ends=[hidden_size * 2]) f = layers.slice(gate_input, axes=[1], starts=[hidden_size * 2], ends=[hidden_size * 3]) o = layers.slice(gate_input, axes=[1], starts=[hidden_size * 3], ends=[hidden_size * 4]) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) rnn.step_output(m) rnn.step_output(c) input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') rnn.step_output(input) #real_res = layers.concat(res, 0) rnnout = rnn() last_hidden_array = [] last_cell_array = [] real_res = rnnout[-1] for i in range(num_layers): m = rnnout[i * 2] c = rnnout[i * 2 + 1] m.stop_gradient = True c.stop_gradient = True last_h = layers.slice(m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_hidden_array.append(last_h) last_c = layers.slice(c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_cell_array.append(last_c) ''' else: real_res = rnnout[-1] for i in range( num_layers ): m1, c1, m2, c2 = rnnout real_res = m2 m1.stop_gradient = True c1.stop_gradient = True c2.stop_gradient = True ''' #layers.Print( first_hidden, message="22", summarize=10) #layers.Print( rnnout[1], message="11", summarize=10) #real_res = ( rnnout[1] + rnnout[2] + rnnout[3] + rnnout[4]) / 4.0 real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) ''' last_hidden = layers.concat( hidden_array, 1 ) last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size]) last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2]) last_cell = layers.concat( cell_array, 1) last_cell = layers.reshape( last_cell, shape=[ -1, num_layers, hidden_size]) last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2]) ''' return real_res, last_hidden, last_cell
def forward(self): """ forward """ src, dst = L.read_file(self.pyreader) src_id = L.slice(src, [0, 1, 2, 3], [0, 0, 0, 0], [int(math.pow(2, 30)) - 1, 1, 1, 1]) dst_id = L.slice(dst, [0, 1, 2, 3], [0, 0, 0, 0], [int(math.pow(2, 30)) - 1, self.neg_num + 1, 1, 1]) if self.is_sparse: # sparse mode use 2 dims input. src = L.reshape(src, [-1, 1]) dst = L.reshape(dst, [-1, 1]) # [b, 1, f, h] src_embed = split_embedding(src, self.num_nodes, self.hidden_size, self.embed_init, "weight", self.num_part, self.is_sparse) # [b, n+1, f, h] dst_embed = split_embedding(dst, self.num_nodes, self.hidden_size, self.embed_init, "weight", self.num_part, self.is_sparse) if self.is_sparse: src_embed = L.reshape(src_embed, [-1, 1, self.num_featuers, self.hidden_size]) dst_embed = L.reshape( dst_embed, [-1, self.neg_num + 1, self.num_featuers, self.hidden_size]) # [b, 1, 1, f] src_weight = L.softmax( L.embedding(src_id, [self.num_nodes, self.num_featuers], param_attr=F.ParamAttr(name="alpha"))) # [b, n+1, 1, f] dst_weight = L.softmax( L.embedding(dst_id, [self.num_nodes, self.num_featuers], param_attr=F.ParamAttr(name="alpha"))) # [b, 1, h] src_sum = L.squeeze(L.matmul(src_weight, src_embed), axes=[2]) # [b, n+1, h] dst_sum = L.squeeze(L.matmul(dst_weight, dst_embed), axes=[2]) logits = L.matmul(src_sum, dst_sum, transpose_y=True) # [batch_size, 1, neg_num+1] pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1], "float32", 1) neg_label = L.fill_constant_batch_size_like(logits, [-1, 1, self.neg_num], "float32", 0) label = L.concat([pos_label, neg_label], -1) pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1], "float32", self.neg_num) neg_weight = L.fill_constant_batch_size_like(logits, [-1, 1, self.neg_num], "float32", 1) weight = L.concat([pos_weight, neg_weight], -1) weight.stop_gradient = True label.stop_gradient = True loss = L.sigmoid_cross_entropy_with_logits(logits, label) loss = loss * weight loss = L.reduce_mean(loss) loss = loss * ((self.neg_num + 1) / 2 / self.neg_num) loss.persistable = True self.loss = loss return loss
def forward(self, q, k, v, lengths, speaker_embed, start_index, force_monotonic=False, prev_coeffs=None, window=None): # add position encoding as an inductive bias if self.has_bias: # multi-speaker model omega_q = 2 * F.sigmoid( F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1])) omega_k = 2 * self.omega_initial * F.sigmoid( F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1])) else: # single-speaker case batch_size = q.shape[0] omega_q = F.ones((batch_size, ), dtype="float32") omega_k = F.ones( (batch_size, ), dtype="float32") * self.omega_default q += self.position_encoding_weight * positional_encoding( q, start_index, omega_q) k += self.position_encoding_weight * positional_encoding(k, 0, omega_k) q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v) activations = F.matmul(q, k, transpose_y=True) activations /= np.sqrt(self.attention_dim) if self.training: # mask the <pad> parts from the encoder mask = F.sequence_mask(lengths, dtype="float32") attn_bias = F.scale(1. - mask, -1000) activations += F.unsqueeze(attn_bias, [1]) elif force_monotonic: assert window is not None backward_step, forward_step = window T_enc = k.shape[1] batch_size, T_dec, _ = q.shape # actually T_dec = 1 here alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \ if prev_coeffs is None \ else F.argmax(prev_coeffs, axis=-1) backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool") forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool") mask = F.cast(F.logical_xor(backward, forward), "float32") # print("mask's shape:", mask.shape) attn_bias = F.scale(1. - mask, -1000) activations += attn_bias # softmax coefficients = F.softmax(activations, axis=-1) # context vector coefficients = F.dropout(coefficients, 1. - self.keep_prob, dropout_implementation='upscale_in_train') contexts = F.matmul(coefficients, v) # context normalization enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32") contexts *= F.sqrt(enc_lengths) # out affine contexts = self.out_affine(contexts) return contexts, coefficients
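# Illustration only: how the forced-monotonic window mask above is built from two
# sequence masks and a logical XOR, shown in NumPy for a single decoder step.
import numpy as np

T_enc, backward_step, forward_step = 8, 1, 3
alpha = 4                                    # argmax of the previous attention
idx = np.arange(T_enc)
backward = idx < (alpha - backward_step)     # F.sequence_mask(alpha - backward_step)
forward = idx < (alpha + forward_step)       # F.sequence_mask(alpha + forward_step)
window = np.logical_xor(backward, forward)   # attendable positions around alpha
print(window.astype(int))                    # [0 0 0 1 1 1 1 0]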
def edge_aware_self_attention(q, k, v, edges_k, edges_v, attn_bias, d_key, dropout_rate): """ Edge-aware Self-Attention. Scalar dimensions referenced here: B = batch_size M = max_sequence_length N = num_attention_heads H = hidden_size_per_head Args: q: reshaped queries [B, N, M, H] k: reshaped keys [B, N, M, H] v: reshaped values [B, N, M, H] edges_k: edge representations between input tokens (keys) [M, M, H] edges_v: edge representations between input tokens (values) [M, M, H] attn_bias: attention mask [B, N, M, M] """ if not (len(q.shape) == len(k.shape) == len(v.shape) == 4): raise ValueError("Input q, k, v should be 4-D Tensors.") if not (len(edges_k.shape) == len(edges_v.shape) == 3): raise ValueError( "Input edges_k and edges_v should be 3-D Tensors.") # regular self-attention scaled_q = layers.scale(x=q, scale=d_key**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) # edge-aware self-attention if edges_k and edges_v: # 1. transpose scaled_q from [B, N, M, H] to [M, B, N, H] scaled_q = layers.transpose(x=scaled_q, perm=[2, 0, 1, 3]) # 2. reshape scaled_q from [M, B, N, H] to [M, B*N, H] scaled_q = layers.reshape(x=scaled_q, shape=[0, -1, scaled_q.shape[3]], inplace=True) # 3. multiply scaled_q with transpose(edges_k) # scaled_q: [M, B*N, H] # edges_k: [M, M, H] # edge_bias: [M, B*N, M] edge_bias = layers.matmul(x=scaled_q, y=edges_k, transpose_y=True) # 4. reshape edge_bias from [M, B*N, M] to [M, B, N, M] edge_bias = layers.reshape(x=edge_bias, shape=[0, -1, q.shape[1], q.shape[2]], inplace=True) # 5. transpose edge_bias from [M, B, N, M] to [B, N, M, M] edge_bias = layers.transpose(x=edge_bias, perm=[1, 2, 0, 3]) # 6. add edge_bias to product product += edge_bias # add attention bias if attn_bias: product += attn_bias # softmax attention weights weights = layers.softmax(product) if dropout_rate: weights = layers.dropout(weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) # edge-aware self-attention out = layers.matmul(weights, v) if edges_k and edges_v: # 1. transpose weights from [B, N, M, M] to [M, B, N, M] reshaped_weights = layers.transpose(x=weights, perm=[2, 0, 1, 3]) # 2. reshape weights from [M, B, N, M] to [M, B*N, M] reshaped_weights = layers.reshape( x=reshaped_weights, shape=[0, -1, reshaped_weights.shape[3]], inplace=True) # 3. multiply reshaped_weights with edges_v # reshaped_weights: [M, B*N, M] # edges_v: [M, M, H] # edge_bias: [M, B*N, H] edge_bias = layers.matmul(x=reshaped_weights, y=edges_v) # 4. reshape edge_bias from [M, B*N, H] to [M, B, N, H] edge_bias = layers.reshape(x=edge_bias, shape=[0, -1, q.shape[1], q.shape[3]], inplace=True) # 5. transpose edge_bias from [M, B, N, H] to [B, N, M, H] edge_bias = layers.transpose(x=edge_bias, perm=[1, 2, 0, 3]) out += edge_bias return out
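# Illustration only: the two edge-aware terms added above, written as einsums over
# the documented shapes (B, N, M, H) instead of the transpose/reshape sequence.
import numpy as np

B, N, M, H = 2, 3, 5, 4
_rng = np.random.default_rng(0)
_q = _rng.standard_normal((B, N, M, H)) / np.sqrt(H)   # already scaled queries
_k = _rng.standard_normal((B, N, M, H))
_v = _rng.standard_normal((B, N, M, H))
_edges_k = _rng.standard_normal((M, M, H))
_edges_v = _rng.standard_normal((M, M, H))

_scores = np.einsum("bnqh,bnkh->bnqk", _q, _k)
_scores += np.einsum("bnqh,qkh->bnqk", _q, _edges_k)    # edge bias on the scores
_weights = np.exp(_scores - _scores.max(-1, keepdims=True))
_weights /= _weights.sum(-1, keepdims=True)
_out = np.einsum("bnqk,bnkh->bnqh", _weights, _v)
_out += np.einsum("bnqk,qkh->bnqh", _weights, _edges_v)  # edge bias on the values
print(_out.shape)                                        # (2, 3, 5, 4)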
def dot_attention(query, memory):
    attn = layers.matmul(query, memory, transpose_y=True)
    weight = layers.softmax(attn)
    weight_memory = layers.matmul(weight, memory)
    return weight_memory, weight
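# Illustration only: the same dot attention in NumPy, assuming 2-D
# [len_q, hidden] and [len_m, hidden] inputs.
import numpy as np

def ref_dot_attention(query, memory):
    attn = query @ memory.T
    weight = np.exp(attn - attn.max(-1, keepdims=True))
    weight /= weight.sum(-1, keepdims=True)              # softmax over memory
    return weight @ memory, weight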
def build_model(self, model_configs): self.update_params(model_configs) features = fluid.layers.data(name="features", shape=[None, self.seq_len_], dtype='int64') labels = fluid.layers.data(name="labels", shape=[None, self.seq_len_], dtype='int64') sequence_length_ph = fluid.layers.data(name="seq_len_ph", shape=[None], dtype='int64') sequence_mask_ph = fluid.layers.data(name="seq_mask_ph", shape=[None], dtype='float32') init_hidden = fluid.layers.data( name="init_hidden", shape=[None, self.num_layers_, self.n_hidden_], dtype='float32') init_cell = fluid.layers.data( name="init_cell", shape=[None, self.num_layers_, self.n_hidden_], dtype='float32') init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2]) init_cell = layers.transpose(init_cell, perm=[1, 0, 2]) init_hidden_reshape = layers.reshape( init_hidden, shape=[self.num_layers_, -1, self.n_hidden_]) init_cell_reshape = layers.reshape( init_cell, shape=[self.num_layers_, -1, self.n_hidden_]) features = layers.reshape(features, shape=[-1, self.seq_len_, 1]) # word embedding inputs = layers.embedding( input=features, size=[self.vocab_size_, self.n_hidden_], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale_, high=self.init_scale_))) # LSTM output, last_hidden, last_cell = self._build_rnn_graph( inputs, init_hidden, init_cell, sequence_length_ph) output = layers.reshape(output, shape=[-1, self.seq_len_, self.n_hidden_], inplace=True) self.last_hidden_ = layers.reshape( last_hidden, [-1, self.num_layers_, self.n_hidden_]) self.last_cell_ = layers.reshape( last_cell, [-1, self.num_layers_, self.n_hidden_]) # softmax softmax_w = layers.create_parameter( [self.n_hidden_, self.vocab_size_], dtype="float32", name="softmax_w", default_initializer=fluid.initializer.UniformInitializer( low=-self.init_scale_, high=self.init_scale_)) softmax_b = layers.create_parameter( [self.vocab_size_], dtype="float32", name='softmax_b', default_initializer=fluid.initializer.UniformInitializer( low=-self.init_scale_, high=self.init_scale_)) logits = layers.matmul(output, softmax_w) logits = layers.elementwise_add(logits, softmax_b) logits = layers.reshape(logits, shape=[-1, self.vocab_size_], inplace=True) # correct predictions labels_reshaped = layers.reshape(labels, [-1]) pred = layers.cast(layers.argmax(logits, 1), dtype="int64") correct_pred = layers.cast(layers.equal(pred, labels_reshaped), dtype="int64") self.pred_ = pred # predicting unknown is always considered wrong # only in paddle 1.8 unk_tensor = layers.fill_constant(layers.shape(labels_reshaped), value=self.unk_symbol_, dtype='int64') pred_unk = layers.cast(layers.equal(pred, unk_tensor), dtype="int64") correct_unk = layers.elementwise_mul(pred_unk, correct_pred) # predicting padding is always considered wrong pad_tensor = layers.fill_constant(layers.shape(labels_reshaped), value=self.pad_symbol_, dtype='int64') pred_pad = layers.cast(layers.equal(pred, pad_tensor), dtype="int64") correct_pad = layers.elementwise_mul(pred_pad, correct_pred) # Reshape logits to be a 3-D tensor for sequence loss logits = layers.reshape(logits, [-1, self.seq_len_, self.vocab_size_]) labels = layers.reshape(labels, [-1, self.seq_len_, 1]) loss = layers.softmax_with_cross_entropy(logits=logits, label=labels, soft_label=False, return_softmax=False) sequence_mask = layers.reshape(sequence_mask_ph, [-1, self.seq_len_, 1]) loss = layers.reduce_mean(layers.elementwise_mul(loss, sequence_mask)) eval_metric_ops = 
fluid.layers.reduce_sum(correct_pred) \ - fluid.layers.reduce_sum(correct_unk) \ - fluid.layers.reduce_sum(correct_pad) self.loss_ = loss self.correct_ = eval_metric_ops self.input_name_list_ = [ 'features', 'labels', 'seq_len_ph', 'seq_mask_ph', 'init_hidden', 'init_cell' ] self.target_var_names_ = [ self.loss_, self.last_hidden_, self.last_cell_, self.correct_ ] self.program_ = fluid.default_main_program() self.startup_program_ = fluid.default_startup_program()
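# Illustration only: the accuracy bookkeeping above in NumPy. Correct predictions
# of the <unk> and <pad> symbols are subtracted so they never count as hits.
import numpy as np

def ref_correct_count(pred, labels, unk_id, pad_id):
    correct = (pred == labels)
    correct_unk = correct & (pred == unk_id)
    correct_pad = correct & (pred == pad_id)
    return correct.sum() - correct_unk.sum() - correct_pad.sum()

_pred = np.array([3, 0, 1, 2, 2])
_labels = np.array([3, 0, 5, 2, 1])
print(ref_correct_count(_pred, _labels, unk_id=1, pad_id=0))  # 2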
def forward(self, input_ids, position_ids): if _global_parallel_strategy == "dp": auto.shard_tensor(input_ids, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(input_ids, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) input_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) if _global_parallel_strategy == "mp": auto.shard_tensor(self.word_embeddings.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.word_embeddings.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) embeddings = input_embeddings + position_embeddings embeddings = self.dropout1(embeddings) # Pre-norm target = self.norm1(embeddings) # The following is the attention part q = self.q_proj(target) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) k = self.k_proj(target) v = self.v_proj(target) if _global_parallel_strategy == "mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask weights = F.softmax(product) if self.dropout_ratio: weights = F.dropout(weights, self.dropout_ratio, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) # Add residual residual = embeddings + self.dropout2(out) # Pre-norm out0 = self.norm2(residual) # The following is the MLP part out1 = self.linear0(out0) out2 = F.gelu(out1, approximate=True) out3 = self.linear1(out2) if _global_parallel_strategy == "mp": auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": 
auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) # Add residual final = residual + self.dropout3(out3) return final
def forward(self, input): if _global_parallel_strategy == "dp": auto.shard_tensor(input, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(input, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1, -1] }) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) k = self.k_proj(input) v = self.v_proj(input) if _global_parallel_strategy == "mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask weights = F.softmax(product) if self.dropout_ratio: weights = F.dropout(weights, self.dropout_ratio, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) return out
def lm_model(hidden_size, vocab_size, batch_size, num_layers=2, num_steps=20, init_scale=0.1, dropout=None, rnn_model='static'): def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): input = rnn.step_input(input_embedding) for k in range(num_layers): pre_hidden = rnn.memory(init=hidden_array[k]) pre_cell = rnn.memory(init=cell_array[k]) weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) #i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) i = layers.slice(gate_input, axes=[1], starts=[0], ends=[hidden_size]) j = layers.slice(gate_input, axes=[1], starts=[hidden_size], ends=[hidden_size * 2]) f = layers.slice(gate_input, axes=[1], starts=[hidden_size * 2], ends=[hidden_size * 3]) o = layers.slice(gate_input, axes=[1], starts=[hidden_size * 3], ends=[hidden_size * 4]) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) rnn.step_output(m) rnn.step_output(c) input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') rnn.step_output(input) #real_res = layers.concat(res, 0) rnnout = rnn() last_hidden_array = [] last_cell_array = [] real_res = rnnout[-1] for i in range(num_layers): m = rnnout[i * 2] c = rnnout[i * 2 + 1] m.stop_gradient = True c.stop_gradient = True last_h = layers.slice(m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_hidden_array.append(last_h) last_c = layers.slice(c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_cell_array.append(last_c) ''' else: real_res = rnnout[-1] for i in range( num_layers ): m1, c1, m2, c2 = rnnout real_res = m2 m1.stop_gradient = True c1.stop_gradient = True c2.stop_gradient = True ''' #layers.Print( first_hidden, message="22", summarize=10) #layers.Print( rnnout[1], message="11", summarize=10) #real_res = ( rnnout[1] + rnnout[2] + rnnout[3] + rnnout[4]) / 4.0 real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) ''' last_hidden = layers.concat( hidden_array, 1 ) last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size]) last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2]) last_cell = layers.concat( cell_array, 1) last_cell = layers.reshape( last_cell, shape=[ -1, 
num_layers, hidden_size]) last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2]) ''' return real_res, last_hidden, last_cell def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape(last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape(real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell x = layers.data(name="x", shape=[batch_size, num_steps, 1], dtype='int64', append_batch_size=False) y = layers.data(name="y", shape=[batch_size * num_steps, 1], dtype='int64', append_batch_size=False) init_hidden = layers.data(name="init_hidden", shape=[num_layers, batch_size, hidden_size], dtype='float32', append_batch_size=False) init_cell = layers.data(name="init_cell", shape=[num_layers, batch_size, hidden_size], dtype='float32', append_batch_size=False) init_hidden = layers.reshape(init_hidden, shape=[num_layers, -1, hidden_size]) init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size]) x_emb = layers.embedding( input=x, size=[vocab_size, hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))) x_emb = layers.reshape(x_emb, shape=[-1, num_steps, hidden_size], inplace=True) if dropout != None and dropout > 0.0: x_emb = layers.dropout(x_emb, 
dropout_prob=dropout, dropout_implementation='upscale_in_train') if rnn_model == "padding": rnn_out, last_hidden, last_cell = padding_rnn(x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell) elif rnn_model == "static": rnn_out, last_hidden, last_cell = encoder_static( x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell) elif rnn_model == "cudnn": x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden, init_cell, num_steps, hidden_size, num_layers, \ is_bidirec=False, \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ) rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2]) else: print("type not support") return rnn_out = layers.reshape(rnn_out, shape=[-1, num_steps, hidden_size], inplace=True) softmax_weight = layers.create_parameter([hidden_size, vocab_size], dtype="float32", name="softmax_weight", \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias', \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) projection = layers.matmul(rnn_out, softmax_weight) projection = layers.elementwise_add(projection, softmax_bias) projection = layers.reshape(projection, shape=[-1, vocab_size], inplace=True) loss = layers.softmax_with_cross_entropy(logits=projection, label=y, soft_label=False) loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True) loss = layers.reduce_mean(loss, dim=[0]) loss = layers.reduce_sum(loss) loss.persistable = True last_cell.persistable = True last_hidden.persistable = True feeding_list = ['x', 'y', 'init_hidden', 'init_cell'] return loss, last_hidden, last_cell, feeding_list
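# Illustration only: the loss reduction at the end of lm_model above. The
# per-token cross-entropy is averaged over the batch dimension and then summed
# over the num_steps time steps, giving the average total NLL per sequence.
import numpy as np

_per_token_loss = np.random.rand(4, 20)        # [batch_size, num_steps]
_loss = _per_token_loss.mean(axis=0).sum()     # reduce_mean(dim=[0]) then reduce_sum
print(_loss)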
def forward(self, q, v, mask=None): """forward Args: q (Variable): shape = [batch_size, seq_len1, hidden_size] or [batch_size, hidden_size]. dtype = float32 v (Variable): shape = [batch_size, seq_len2, hidden_size]. dtype = float32 mask (Variable): shape = [batch_size, seq_len2]. dtype = v.dtype. Default is None Returns: Variable shape = [batch_size, seq_len2], dtype = float32. Raises: RuntimeError: while giving unsupported score_type. """ input_dim = len(q.shape) if input_dim == 2: q = layers.unsqueeze(q, [1]) if self._score_type == 'dot_prod': ptr_score = layers.matmul(q, v, transpose_y=True) elif self._score_type == 'affine': q_tmp = layers.fc(q, size=v.shape[2], num_flatten_dims=2, **nn_utils.param_attr(self._name, self._init_scale, need_bias=True)) ptr_score = layers.matmul(q_tmp, v, transpose_y=True) elif self._score_type == 'std': if self._hidden_size <= 0: raise ValueError("hidden_size should greater than 0") q_tmp = layers.fc(q, size=self._hidden_size, num_flatten_dims=2, **nn_utils.param_attr(self._name + '_q', self._init_scale, need_bias=True)) v_tmp = layers.fc(v, size=self._hidden_size, num_flatten_dims=2, **nn_utils.param_attr(self._name + '_k', self._init_scale, need_bias=True)) # shape = [batch_size, seq_len1, seq_len2, hidden_size] q_tmp_expand = layers.expand(layers.unsqueeze(q_tmp, [2]), [1, 1, v_tmp.shape[1], 1]) # shape = [batch_size, 1, seq_len2, hidden_size] v_tmp_expand = layers.unsqueeze(v_tmp, [1]) ptr_score = layers.fc(layers.elementwise_add(q_tmp_expand, v_tmp_expand, act='tanh'), size=1, num_flatten_dims=3, **nn_utils.param_attr(self._name + '_w', self._init_scale, need_bias=True)) ptr_score = layers.squeeze(ptr_score, [3]) else: raise RuntimeError( 'Supported score types: dot_prod/affine/std. but got %s' % (self._score_type)) if mask is not None: score_for_mask = layers.transpose(ptr_score, [1, 0, 2]) ptr_score_masked = layers.elementwise_add(score_for_mask, (mask - 1.0) * INF, axis=-1) ptr_score = layers.transpose(ptr_score_masked, [1, 0, 2]) if input_dim == 2: ptr_score = layers.squeeze(ptr_score, [1]) return ptr_score
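# Illustration only: the three score types handled above ('dot_prod', 'affine',
# 'std'), written in NumPy with hypothetical weight matrices (biases omitted)
# for a single batch element. q: [len_q, hidden], v: [len_v, hidden],
# scores: [len_q, len_v].
import numpy as np

_rng = np.random.default_rng(0)
hidden, att_hidden, len_q, len_v = 4, 6, 3, 5
_q = _rng.standard_normal((len_q, hidden))
_v = _rng.standard_normal((len_v, hidden))

score_dot = _q @ _v.T                                    # dot_prod: q . v
_W = _rng.standard_normal((hidden, hidden))
score_affine = (_q @ _W) @ _v.T                          # affine: (W q) . v
_Wq = _rng.standard_normal((hidden, att_hidden))
_Wk = _rng.standard_normal((hidden, att_hidden))
_w = _rng.standard_normal(att_hidden)
score_std = np.tanh((_q @ _Wq)[:, None, :] + (_v @ _Wk)[None, :, :]) @ _w  # additive
print(score_dot.shape, score_affine.shape, score_std.shape)  # all (3, 5)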
def matmul(a, b):
    """Dispatch matmul on input type: use layers.matmul when either operand is a
    Paddle PTensor, otherwise fall back to NumPy."""
    if isinstance(a, PTensor) or isinstance(b, PTensor):
        return layers.matmul(a, b)
    else:
        return np.matmul(a, b)
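# Usage sketch for the dispatch helper above (assuming its imports, including PTensor,
# are already in scope): plain ndarrays take the NumPy branch, so it can be exercised
# without building a Paddle graph.
a = np.arange(6.0).reshape(2, 3)
b = np.arange(12.0).reshape(3, 4)
print(matmul(a, b).shape)   # (2, 4) -- NumPy branch; PTensor inputs would go through layers.matmul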
x = paddle.randn((N, in_C, H, W))
w = paddle.randn((C, in_C, 1, 1))
y = F.conv2d(x, w)

x_in = L.reshape(x, (N, 1, in_C, H, W))
w_r = L.reshape(w, (1, C, in_C, 1, 1))
y2 = x_in * w_r                        # [N, C, in_C, H, W]
y2 = L.reduce_sum(y2, dim=[2, ])

x_in2 = L.transpose(x, [0, 2, 3, 1])   # [N, H, W, in_C]
w_r2 = L.reshape(w, (C, in_C))
w_r2 = L.transpose(w_r2, [1, 0])       # [in_C, C]
y3 = L.matmul(x_in2, w_r2)             # [N, H, W, C]
y3 = L.transpose(y3, [0, 3, 1, 2])     # [N, C, H, W]

y = y.numpy()
y2 = y2.numpy()
y3 = y3.numpy()
d = np.sum((y - y2)**2)
print(d)
d = np.sum((y - y3)**2)
print(d)
'''
Therefore, two tensors of shape
(N, 1, in_C, H, W)
(1, C, in_C, 1, 1)
or, more generally, of shape
(A, 1, in_C, B, C)
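# The same equivalence checked in pure NumPy (no Paddle needed): a 1x1 convolution over
# [N, in_C, H, W] with weights [C, in_C, 1, 1] is just a matmul over the channel axis.
# The sizes below are small illustrative values, not the ones used above.
import numpy as np
N, in_C, C, H, W = 2, 3, 4, 5, 5
x = np.random.randn(N, in_C, H, W)
w = np.random.randn(C, in_C, 1, 1)

# "conv" path: broadcast multiply, then sum over in_C
y2 = (x[:, None, :, :, :] * w[None, :, :, :, :]).sum(axis=2)                       # [N, C, H, W]

# matmul path: move channels last, multiply by [in_C, C], move channels back
y3 = np.transpose(np.transpose(x, (0, 2, 3, 1)) @ w.reshape(C, in_C).T, (0, 3, 1, 2))
print(np.abs(y2 - y3).max())   # ~0: the two paths are numerically identical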
def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx): pre_ids = alive_seq dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) current_log = layers.elementwise_add(x=layers.log(logits), y=alive_log_prob, axis=0) base_1 = layers.cast(step_idx, 'float32') + 6.0 base_1 /= 6.0 length_penalty = layers.pow(base_1, alpha) len_pen = layers.pow( ((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha) current_log = layers.reshape(current_log, shape=[1, -1]) current_log = current_log / length_penalty topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size) topk_scores = layers.reshape(topk_scores, shape=[-1]) topk_log_probs = topk_scores * length_penalty generate_id = layers.reshape( topk_indices, shape=[-1]) % self.tar_vocab_size selected_beam = layers.reshape( topk_indices, shape=[-1]) // self.tar_vocab_size topk_finished = layers.equal(generate_id, eos_ids) topk_finished = layers.cast(topk_finished, 'float32') generate_id = layers.reshape(generate_id, shape=[-1, 1]) pre_tokens_list = layers.gather(tokens, selected_beam) full_tokens_list = layers.concat( [pre_tokens_list, generate_id], axis=1) return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \ dec_att_out, new_hidden_array, new_cell_array
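# A hedged NumPy sketch of the top-k growth step above: add the accumulated log
# probabilities, divide by the GNMT length penalty ((5 + len) / 6) ** alpha, take the
# global top-k over (beam * vocab), then recover the parent beam and the generated
# token with // and %. Sizes and scores are illustrative only.
import numpy as np
beam_size, vocab, alpha, step = 2, 5, 0.6, 3
log_probs = np.log(np.full((beam_size, vocab), 1.0 / vocab))     # fake per-step log probs
alive_log_prob = np.array([-1.0, -2.0])
current = log_probs + alive_log_prob[:, None]
length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha
scores = (current / length_penalty).reshape(-1)
topk_idx = np.argsort(-scores)[:beam_size]
selected_beam = topk_idx // vocab                                # which beam the candidate grew from
generate_id = topk_idx % vocab                                   # which token was generated
topk_log_probs = scores[topk_idx] * length_penalty               # undo the penalty for accumulation
print(selected_beam, generate_id, topk_log_probs)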
def network(batch_size, items_num, hidden_size, step, rate): stdv = 1.0 / math.sqrt(hidden_size) items = layers.data( name="items", shape=[batch_size, -1, 1], dtype="int64", append_batch_size=False) #[bs, uniq_max, 1] seq_index = layers.data( name="seq_index", shape=[batch_size, -1], dtype="int64", append_batch_size=False) #[-1(seq_max)*batch_size, 1] last_index = layers.data( name="last_index", shape=[batch_size], dtype="int64", append_batch_size=False) #[batch_size, 1] adj_in = layers.data( name="adj_in", shape=[batch_size, -1, -1], dtype="float32", append_batch_size=False) adj_out = layers.data( name="adj_out", shape=[batch_size, -1, -1], dtype="float32", append_batch_size=False) mask = layers.data( name="mask", shape=[batch_size, -1, 1], dtype="float32", append_batch_size=False) label = layers.data( name="label", shape=[batch_size, 1], dtype="int64", append_batch_size=False) items_emb = layers.embedding( input=items, is_sparse=True, param_attr=fluid.ParamAttr( name="emb", learning_rate=rate, initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[items_num, hidden_size]) #[batch_size, uniq_max, h] data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label] pre_state = items_emb for i in range(step): pre_state = layers.reshape( x=pre_state, shape=[batch_size, -1, hidden_size]) state_in = layers.fc( input=pre_state, name="state_in", size=hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", size=hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, uniq_max, h] state_adj_in = layers.matmul(adj_in, state_in) #[batch_size, uniq_max, h] state_adj_out = layers.matmul(adj_out, state_out) #[batch_size, uniq_max, h] gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) gru_fc = layers.fc(input=gru_input, name="gru_fc", size=3 * hidden_size, bias_attr=False) pre_state, _, _ = fluid.layers.gru_unit( input=gru_fc, hidden=layers.reshape( x=pre_state, shape=[-1, hidden_size]), size=3 * hidden_size) final_state = pre_state seq_index = layers.reshape(seq_index, shape=[-1]) seq = layers.gather(final_state, seq_index) #[batch_size*-1(seq_max), h] last = layers.gather(final_state, last_index) #[batch_size, h] seq = layers.reshape( seq, shape=[batch_size, -1, hidden_size]) #[batch_size, -1(seq_max), h] last = layers.reshape( last, shape=[batch_size, hidden_size]) #[batch_size, h] seq_fc = layers.fc( input=seq, name="seq_fc", size=hidden_size, bias_attr=False, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, -1(seq_max), h] last_fc = layers.fc(input=last, name="last_fc", size=hidden_size, bias_attr=False, act=None, num_flatten_dims=1, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[bathc_size, h] seq_fc_t = layers.transpose( seq_fc, perm=[1, 0, 2]) #[-1(seq_max), batch_size, h] add = layers.elementwise_add(seq_fc_t, last_fc) #[-1(seq_max), batch_size, h] b = layers.create_parameter( shape=[hidden_size], dtype='float32', 
default_initializer=fluid.initializer.Constant(value=0.0)) #[h] add = layers.elementwise_add(add, b) #[-1(seq_max), batch_size, h] add_sigmoid = layers.sigmoid(add) #[-1(seq_max), batch_size, h] add_sigmoid = layers.transpose( add_sigmoid, perm=[1, 0, 2]) #[batch_size, -1(seq_max), h] weight = layers.fc(input=add_sigmoid, name="weight_fc", size=1, act=None, num_flatten_dims=2, bias_attr=False, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, -1, 1] weight *= mask weight_mask = layers.elementwise_mul(seq, weight, axis=0) global_attention = layers.reduce_sum(weight_mask, dim=1) final_attention = layers.concat( [global_attention, last], axis=1) #[batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention, name="fina_attention_fc", size=hidden_size, bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, h] all_vocab = layers.create_global_var( shape=[items_num - 1, 1], value=0, dtype="int64", persistable=True, name="all_vocab") all_emb = layers.embedding( input=all_vocab, is_sparse=True, param_attr=fluid.ParamAttr( name="emb", learning_rate=rate, initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[items_num, hidden_size]) #[all_vocab, h] logits = layers.matmul( x=final_attention_fc, y=all_emb, transpose_y=True) #[batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy( logits=logits, label=label) #[batch_size, 1] loss = layers.reduce_mean(softmax) # [1] #fluid.layers.Print(loss) acc = layers.accuracy(input=logits, label=label, k=20) return loss, acc, data_feed, [items_emb, all_emb]
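# NumPy sketch of the graph propagation used in network() above: at each step the node
# states are mixed through the incoming and outgoing (normalised) adjacency matrices with
# a batched matmul, and the two results are concatenated as the GRU input. Shapes are
# illustrative placeholders.
import numpy as np
bs, uniq_max, h = 2, 4, 8
state_in = np.random.randn(bs, uniq_max, h)
state_out = np.random.randn(bs, uniq_max, h)
adj_in = np.random.rand(bs, uniq_max, uniq_max)
adj_out = np.random.rand(bs, uniq_max, uniq_max)
state_adj_in = adj_in @ state_in        # [bs, uniq_max, h]
state_adj_out = adj_out @ state_out     # [bs, uniq_max, h]
gru_input = np.concatenate([state_adj_in, state_adj_out], axis=2)   # [bs, uniq_max, 2h]
print(gru_input.shape)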
def decode(self, dec_input, enc_words_output, enc_sents_output, caches=None, gather_idx=None): """Decoding to generate output text""" trg_word, trg_pos, trg_slf_attn_bias, trg_src_words_attn_bias, \ trg_src_sents_attn_bias, graph_attn_bias = dec_input dec_res = self._gen_dec_input(trg_word, trg_pos, trg_slf_attn_bias, trg_src_words_attn_bias, trg_src_sents_attn_bias, graph_attn_bias) emb_out, trg_slf_attn_bias, trg_src_words_attn_bias, trg_src_sents_attn_bias, graph_attn_bias = \ dec_res.emb_out, dec_res.trg_slf_attn_bias, dec_res.trg_src_words_attn_bias, \ dec_res.trg_src_sents_attn_bias, dec_res.graph_attn_bias # (batch_size, tgt_len, emb_dim) dec_output = graph_decoder( dec_input=emb_out, # (batch_size, tgt_len, emb_dim) enc_words_output= enc_words_output, # (batch_size, n_blocks, n_tokens, emb_dim) enc_sents_output=enc_sents_output, # (batch_size, n_blocks, emb_dim) dec_slf_attn_bias= trg_slf_attn_bias, # (batch_size, n_head, tgt_len, tgt_len) dec_enc_words_attn_bias= trg_src_words_attn_bias, # (batch_size, n_blocks, n_head, tgt_len, n_tokens) dec_enc_sents_attn_bias= trg_src_sents_attn_bias, # (batch_size, n_head, tgt_len, n_blocks) graph_attn_bias= graph_attn_bias, # (batch_size, n_head, n_blocks, n_blocks) pos_win=self.pos_win, n_layer=self._dec_n_layer, n_head=self._n_head, d_key=self._emb_size // self._n_head, d_value=self._emb_size // self._n_head, d_model=self._emb_size, d_inner_hid=self._emb_size * 4, prepostprocess_dropout=self._prepostprocess_dropout, attention_dropout=self._attention_dropout, relu_dropout=self._prepostprocess_dropout, hidden_act=self._hidden_act, preprocess_cmd=self._preprocess_command, postprocess_cmd=self._postprocess_command, param_initializer=self._param_initializer, caches=caches, gather_idx=gather_idx, name='graph_decoder') # Reshape to 2D tensor to use GEMM instead of BatchedGEMM # (batch_size*tgt_len, emb_dim) dec_output = layers.reshape(dec_output, shape=[-1, self._emb_size], inplace=True) if self._dtype is "float16": dec_output = fluid.layers.cast(x=dec_output, dtype=self._emb_dtype) if self._weight_sharing: out = layers.matmul( x=dec_output, y=fluid.default_main_program().global_block().var( self._word_emb_name), transpose_y=True) bias = layers.create_parameter( shape=[self.voc_size], dtype=self._emb_dtype, attr=fluid.ParamAttr( name='generator.bias', initializer=fluid.initializer.Constant(value=0.0)), is_bias=True) predict = layers.elementwise_add(x=out, y=bias, axis=-1) else: predict = layers.fc( input=dec_output, size=self.voc_size, param_attr=fluid.ParamAttr( name="generator.w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=fluid.ParamAttr( name='generator.bias', initializer=fluid.initializer.Constant(value=0.0))) return predict
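# Hedged NumPy sketch of the weight-sharing branch above: when the output projection
# reuses the word-embedding table, the logits are simply dec_output @ emb_table^T plus a
# vocabulary-sized bias. Sizes and weights below are illustrative, not the model's.
import numpy as np
voc_size, emb_size, n_tokens = 100, 16, 6
word_emb = np.random.randn(voc_size, emb_size)      # shared embedding table
dec_output = np.random.randn(n_tokens, emb_size)    # decoder output already reshaped to 2-D
bias = np.zeros(voc_size)
predict = dec_output @ word_emb.T + bias             # [n_tokens, voc_size]
print(predict.shape)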
def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) try: from paddle.fluid.contrib.layers import fused_elemwise_activation # fluid.contrib.layers.fused_elemwise_activation can do a fused # operation, like: # 1) x + sigmoid(y); x + tanh(y) # 2) tanh(x + y) # Now the unary operation supported in this fused op is limit, and # we will extent this operation to support more unary operations and # do this kind of fusion automitically in future version of paddle.fluid. # layers.sigmoid(i) * layers.tanh(j) tmp0 = fused_elemwise_activation( x=layers.tanh(j), y=i, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) # pre_cell * layers.sigmoid(f) tmp1 = fused_elemwise_activation( x=pre_cell, y=f, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) c = tmp0 + tmp1 # layers.tanh(c) * layers.sigmoid(o) m = fused_elemwise_activation( x=layers.tanh(c), y=o, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) except ImportError: c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape(last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape(real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell
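# The LSTM cell arithmetic that both the fused and the fallback branches above compute,
# written out in NumPy for one layer and one step: a single matmul produces all four
# gates, which are then split into i, j, f, o. Weights and sizes are illustrative only.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

hidden_size = 4
x = np.random.randn(3, hidden_size)            # batch of step inputs
pre_h = np.random.randn(3, hidden_size)
pre_c = np.random.randn(3, hidden_size)
W = np.random.randn(2 * hidden_size, 4 * hidden_size)
b = np.zeros(4 * hidden_size)

gate_input = np.concatenate([x, pre_h], axis=1) @ W + b
i, j, f, o = np.split(gate_input, 4, axis=1)
c = pre_c * sigmoid(f) + sigmoid(i) * np.tanh(j)
h = np.tanh(c) * sigmoid(o)
print(h.shape, c.shape)                        # (3, 4) (3, 4)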
def encoder(x, y, vocab_size, emb_size, init_hidden=None, init_cell=None, para_name='', custom_samples=None, custom_probabilities=None, test_mode=False, args=None): x_emb = layers.embedding(input=x, size=[vocab_size, emb_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr(name='embedding_para')) rnn_input = x_emb rnn_outs = [] rnn_outs_ori = [] cells = [] projs = [] for i in range(args.num_layers): rnn_input = dropout(rnn_input, test_mode, args) if init_hidden and init_cell: h0 = layers.squeeze(layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]), axes=[0]) c0 = layers.squeeze(layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]), axes=[0]) else: h0 = c0 = None rnn_out, cell, input_proj = lstmp_encoder( rnn_input, args.hidden_size, h0, c0, para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args) rnn_out_ori = rnn_out if i > 0: rnn_out = rnn_out + rnn_input rnn_out = dropout(rnn_out, test_mode, args) cell = dropout(cell, test_mode, args) rnn_outs.append(rnn_out) rnn_outs_ori.append(rnn_out_ori) rnn_input = rnn_out cells.append(cell) projs.append(input_proj) softmax_weight = layers.create_parameter([vocab_size, emb_size], dtype="float32", name="softmax_weight") softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias') projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True) projection = layers.elementwise_add(projection, softmax_bias) projection = layers.reshape(projection, shape=[-1, vocab_size]) if args.sample_softmax and (not test_mode): loss = layers.sampled_softmax_with_cross_entropy( logits=projection, label=y, num_samples=args.n_negative_samples_batch, seed=args.random_seed) else: label = layers.one_hot(input=y, depth=vocab_size) loss = layers.softmax_with_cross_entropy(logits=projection, label=label, soft_label=True) return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
def lm_model(hidden_size, vocab_size, batch_size, num_layers=2, num_steps=20, init_scale=0.1, dropout=None, rnn_model='static', use_dataloader=False): def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): input = rnn.step_input(input_embedding) for k in range(num_layers): pre_hidden = rnn.memory(init=hidden_array[k]) pre_cell = rnn.memory(init=cell_array[k]) weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i = layers.slice(gate_input, axes=[1], starts=[0], ends=[hidden_size]) j = layers.slice(gate_input, axes=[1], starts=[hidden_size], ends=[hidden_size * 2]) f = layers.slice(gate_input, axes=[1], starts=[hidden_size * 2], ends=[hidden_size * 3]) o = layers.slice(gate_input, axes=[1], starts=[hidden_size * 3], ends=[hidden_size * 4]) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) rnn.step_output(m) rnn.step_output(c) input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') rnn.step_output(input) rnnout = rnn() last_hidden_array = [] last_cell_array = [] real_res = rnnout[-1] for i in range(num_layers): m = rnnout[i * 2] c = rnnout[i * 2 + 1] m.stop_gradient = True c.stop_gradient = True last_h = layers.slice(m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_hidden_array.append(last_h) last_c = layers.slice(c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_cell_array.append(last_c) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) return real_res, last_hidden, last_cell def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], 
ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) try: from paddle.fluid.contrib.layers import fused_elemwise_activation # fluid.contrib.layers.fused_elemwise_activation can do a fused # operation, like: # 1) x + sigmoid(y); x + tanh(y) # 2) tanh(x + y) # Now the unary operation supported in this fused op is limit, and # we will extent this operation to support more unary operations and # do this kind of fusion automitically in future version of paddle.fluid. # layers.sigmoid(i) * layers.tanh(j) tmp0 = fused_elemwise_activation( x=layers.tanh(j), y=i, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) # pre_cell * layers.sigmoid(f) tmp1 = fused_elemwise_activation( x=pre_cell, y=f, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) c = tmp0 + tmp1 # layers.tanh(c) * layers.sigmoid(o) m = fused_elemwise_activation( x=layers.tanh(c), y=o, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) except ImportError: c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape(last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape(real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell batch_size_each = batch_size // fluid.core.get_cuda_device_count() x = fluid.data(name="x", shape=[batch_size_each, num_steps, 1], dtype='int64') y = fluid.data(name="y", shape=[batch_size_each * num_steps, 1], dtype='int64') if use_dataloader: dataloader = fluid.io.DataLoader.from_generator(feed_list=[x, y], capacity=16, iterable=False, use_double_buffer=True) init_hidden = fluid.data(name="init_hidden", shape=[num_layers, batch_size_each, hidden_size], dtype='float32') init_cell = fluid.data(name="init_cell", shape=[num_layers, batch_size_each, hidden_size], dtype='float32') init_cell.persistable = True init_hidden.persistable = True init_hidden_reshape = layers.reshape(init_hidden, shape=[num_layers, -1, hidden_size]) init_cell_reshape = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size]) x_emb = layers.embedding( input=x, size=[vocab_size, hidden_size], 
dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))) x_emb = layers.reshape(x_emb, shape=[-1, num_steps, hidden_size], inplace=True) if dropout != None and dropout > 0.0: x_emb = layers.dropout(x_emb, dropout_prob=dropout, dropout_implementation='upscale_in_train') if rnn_model == "padding": rnn_out, last_hidden, last_cell = padding_rnn( x_emb, len=num_steps, init_hidden=init_hidden_reshape, init_cell=init_cell_reshape) elif rnn_model == "static": rnn_out, last_hidden, last_cell = encoder_static( x_emb, len=num_steps, init_hidden=init_hidden_reshape, init_cell=init_cell_reshape) elif rnn_model == "cudnn": x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden_reshape, init_cell_reshape, num_steps, hidden_size, num_layers, is_bidirec=False, default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2]) elif rnn_model == "basic_lstm": rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \ num_layers=num_layers, batch_first=True, dropout_prob=dropout, \ param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \ bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \ forget_bias = 0.0) else: print("type not support") return rnn_out = layers.reshape(rnn_out, shape=[-1, num_steps, hidden_size], inplace=True) softmax_weight = layers.create_parameter( [hidden_size, vocab_size], dtype="float32", name="softmax_weight", default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) softmax_bias = layers.create_parameter( [vocab_size], dtype="float32", name='softmax_bias', default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) projection = layers.matmul(rnn_out, softmax_weight) projection = layers.elementwise_add(projection, softmax_bias) projection = layers.reshape(projection, shape=[-1, vocab_size], inplace=True) loss = layers.softmax_with_cross_entropy(logits=projection, label=y, soft_label=False) loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True) loss = layers.reduce_mean(loss, dim=[0]) loss = layers.reduce_sum(loss) loss.persistable = True last_cell.persistable = True last_hidden.persistable = True # This will feed last_hidden, last_cell to init_hidden, init_cell, which # can be used directly in next batch. This can avoid the fetching of # last_hidden and last_cell and feeding of init_hidden and init_cell in # each training step. layers.assign(input=last_cell, output=init_cell) layers.assign(input=last_hidden, output=init_hidden) feeding_list = ['x', 'y', 'init_hidden', 'init_cell'] if use_dataloader: return loss, last_hidden, last_cell, feeding_list, dataloader else: return loss, last_hidden, last_cell, feeding_list
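# A small NumPy sketch of the loss reduction at the end of lm_model above: the per-token
# cross entropy is reshaped to [batch, num_steps], averaged over the batch dimension, then
# summed over time, i.e. the sum over steps of the per-step batch mean. Values are random
# placeholders.
import numpy as np
batch, num_steps = 4, 5
per_token_loss = np.random.rand(batch * num_steps, 1)
loss = per_token_loss.reshape(batch, num_steps)   # matches reshape(loss, [-1, num_steps])
loss = loss.mean(axis=0)                          # reduce_mean over dim 0 (the batch)
loss = loss.sum()                                 # reduce_sum over the steps
print(loss)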
def __call__(self,
             predictions,
             labels_pos_mask,           # Shape: [batch_size, 19248, 1]
             labels_neg_mask,           # Shape: [batch_size, 19248, 1]
             labels_allboxes_vector,    # Shape: [batch_size, 19248, 8]
             segment_t,                 # list, Shape: [batch_size, 19248, 1]
             label_masks,
             labels_best_truth_idx,
             labels_pos_index,
             labels_pos_cid,            # Shape: [batch_size, 19248]
             labels_pos_cid2,           # Shape: [batch_size, 19248]
             priors,
             class_vectors,
             batch_size,
             use_maskiou=True,
             use_ce_loss=True,
             use_ghm_c_loss=False,
             use_focal_loss=False,
             use_ohem_loss=False):
    pred_allboxes_encode_x0y0x1y1 = predictions['loc']   # Shape: [batch_size, 19248, 4]
    pred_allboxes_conf = predictions['conf']             # Shape: [batch_size, 19248, 1+80]
    pred_allboxes_mask_coef = predictions['mask']        # Shape: [batch_size, 19248, num_prototypes=32]
    pred_proto = predictions['proto']                    # Shape: [batch_size, s4=138, s4=138, num_prototypes=32]
    pred_segm = predictions['segm']                      # Shape: [batch_size, num_classes=80, s8=69, s8=69]
    labels_allboxes_x0y0x1y1 = labels_allboxes_vector[:, :, 0:4]          # Shape: [batch_size, 19248, 4]
    labels_allboxes_decode_x0y0x1y1 = labels_allboxes_vector[:, :, 4:8]   # Shape: [batch_size, 19248, 4]
    losses = {}

    # 1. bbox_loss, computed only for positive samples.
    # bbox_alpha = 1.5
    # bbox_loss = P.smooth_l1(P.reshape(pred_allboxes_encode_x0y0x1y1, (-1, 4)), P.reshape(labels_allboxes_x0y0x1y1, (-1, 4)))
    # bbox_loss = P.reshape(labels_pos_mask, (-1, 1)) * bbox_loss
    # bbox_loss = P.reduce_sum(bbox_loss) * bbox_alpha
    # losses['B'] = bbox_loss

    # 1. bbox_loss as ciou_loss
    pred_x0y0x1y1 = []
    for idx in range(batch_size):
        temp = decode(pred_allboxes_encode_x0y0x1y1[idx], priors)
        pred_x0y0x1y1.append(temp)
    pred_x0y0x1y1 = P.concat(pred_x0y0x1y1, axis=0)                  # Shape: [batch_size*num_priors, 4]
    pred_x0y0x1y1 = P.reshape(pred_x0y0x1y1, (batch_size, -1, 4))    # Shape: [batch_size, num_priors, 4]
    ciou = P.reshape(
        self.bbox_ciou(pred_x0y0x1y1, labels_allboxes_decode_x0y0x1y1),
        (batch_size, -1, 1))                                         # (batch_size, num_priors, 1)

    # weight of each predicted box's ciou_loss = 2 - (ground-truth area / image area)
    gt_area = (labels_allboxes_decode_x0y0x1y1[:, :, 2:3] - labels_allboxes_decode_x0y0x1y1[:, :, 0:1]) * \
              (labels_allboxes_decode_x0y0x1y1[:, :, 3:4] - labels_allboxes_decode_x0y0x1y1[:, :, 1:2])
    bbox_loss_scale = 2.0 - gt_area
    ciou_loss = labels_pos_mask * bbox_loss_scale * (1 - ciou)
    bbox_alpha = 1.5
    ciou_loss = P.reduce_sum(ciou_loss) * bbox_alpha
    losses['B'] = ciou_loss

    # 2. mask_loss, computed only for positive samples
    mask_h = P.shape(pred_proto)[1]
    mask_w = P.shape(pred_proto)[2]
    loss_m = 0
    maskiou_t_list = []
    maskiou_net_input_list = []
    label_t_list = []
    for idx in range(batch_size):
        # e.g. [[0], [0], [0], [0], [0], [0], [0], [0]]: select, for the 8 positive samples, the index
        # (into label_x0y0x1y1cid[idx]) of their best-matching gt. With only one gt, all indices are 0.
        labels_pos_index[idx].stop_gradient = True
        cur_gt = P.gather(labels_best_truth_idx[idx], labels_pos_index[idx])   # (?, 1)
        cur_gt.stop_gradient = True
        cur_x0y0x1y1 = P.gather(labels_allboxes_decode_x0y0x1y1[idx],
                                labels_pos_index[idx])   # (?, 4)
        proto_masks = pred_proto[idx]   # (138, 138, 32)
        # pred_mask_coef (batch_size, 19248, 32): select the mask coefficients predicted by the 8 positives.
        proto_coef = P.gather(pred_allboxes_mask_coef[idx], labels_pos_index[idx])   # (?, 32)
        # (?, 138, 138): extract the ground-truth masks of the gt matched by the 8 positives.
        # They all match the same gt, so it is the same mask repeated 8 times.
        mask_t = P.gather(label_masks[idx], cur_gt)   # (?, 138, 138)
        # (?, ): extract the ground-truth cid of the gt matched by the 8 positives.
        # They all match the same gt, so it is the same cid repeated 8 times.
        label_t = P.gather(labels_pos_cid[idx], labels_pos_index[idx])   # (?, )

        # Size: (138, 138, ?) = prototypes * transposed coefficients
        pred_masks = P.matmul(proto_masks, proto_coef, transpose_y=True)
        pred_masks = P.sigmoid(pred_masks)   # sigmoid activation
        pred_masks = crop(pred_masks, cur_x0y0x1y1)
        pred_masks = P.transpose(pred_masks, perm=[2, 0, 1])
        masks_pos_loss = mask_t * (0 - P.log(pred_masks + 1e-9))            # binary cross entropy; a tiny constant avoids nan
        masks_neg_loss = (1 - mask_t) * (0 - P.log(1 - pred_masks + 1e-9))  # binary cross entropy; a tiny constant avoids nan
        pre_loss = (masks_pos_loss + masks_neg_loss)
        pre_loss = P.reduce_sum(pre_loss, dim=[1, 2])

        # the smaller the gt area, the larger the weight of its mask loss
        cur_cxcywh = center_size(cur_x0y0x1y1)
        gt_box_width = cur_cxcywh[:, 2]
        gt_box_height = cur_cxcywh[:, 3]
        pre_loss = pre_loss / (gt_box_width * gt_box_height)
        loss_m += P.reduce_sum(pre_loss)

        if use_maskiou:
            # masks in mask_t with area <= 5*5 would normally be discarded
            # discard_mask_area = 5*5
            '''
            The GPU build of PaddlePaddle 1.6.2 has a problem: if `select` is [None] and is then
            used inside gather(), the error
                cudaGetLastError invalid configuration argument errno: 9
            is raised, while the CPU build runs fine.
            To avoid this, `select` must never be [None], so no area filtering is done here and
            all of mask_t is kept.
            '''
            discard_mask_area = -1
            gt_mask_area = P.reduce_sum(mask_t, dim=[1, 2])
            gt_mask_area.stop_gradient = True
            select = P.where(gt_mask_area > discard_mask_area)
            select.stop_gradient = True
            pred_masks = P.gather(pred_masks, select)
            mask_t = P.gather(mask_t, select)
            label_t = P.gather(label_t, select)
            label_t.stop_gradient = True
            maskiou_net_input = P.reshape(
                pred_masks, (P.shape(pred_masks)[0], 1, mask_h, mask_w))
            pred_masks = P.cast(pred_masks > 0.5, 'float32')   # round to 0/1
            maskiou_t = self._mask_iou(pred_masks, mask_t)     # (8, )
            maskiou_net_input_list.append(maskiou_net_input)   # (8, 1, 138, 138)
            maskiou_t_list.append(maskiou_t)                   # (8, )
            label_t_list.append(label_t)                       # (8, )
    mask_alpha = 6.125
    losses['M'] = loss_m * mask_alpha / mask_h / mask_w

    # remaining part
    if use_maskiou:
        maskiou_net_input = P.concat(maskiou_net_input_list,
                                     axis=0)                    # (21, 1, 138, 138) masks predicted by the 21 positives
        maskiou_t = P.concat(maskiou_t_list, axis=0)            # (21, ) IoU between predicted and ground-truth masks of the 21 positives
        label_t = P.concat(label_t_list, axis=0)                # (21, ) cid of the 21 positives
        label_t.stop_gradient = True                            # because it is an integer?
        maskiou_targets = [maskiou_net_input, maskiou_t, label_t]

    # 3. conf_loss.
    conf_alpha = 1.0
    if use_ce_loss:
        conf_loss = self.ce_conf_loss(pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                                      class_vectors, labels_pos_cid2, gt_area)
    elif use_ghm_c_loss:
        conf_loss = self.ghm_c_loss(pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                                    class_vectors, labels_pos_cid2)
    elif use_focal_loss:
        conf_loss = self.focal_conf_loss(pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                                         class_vectors, labels_pos_cid2)
    elif use_ohem_loss:
        conf_loss = self.ohem_conf_loss(pred_allboxes_conf, batch_size, labels_neg_mask,
                                        labels_pos_mask, labels_pos_index, class_vectors,
                                        labels_pos_cid)
    losses['C'] = conf_loss * conf_alpha

    # 4. mask_iou_loss, computed only for positive samples.
    if use_maskiou:
        # maskiou_net_input (21, 1, 138, 138)  masks predicted by the 21 positives
        # maskiou_t         (21, )             IoU between predicted and ground-truth masks of the 21 positives
        # label_t           (21, )             cid of the 21 positives
        maskiou_net_input, maskiou_t, label_t = maskiou_targets
        maskiou_p = maskiou_net(maskiou_net_input, self.num_classes - 1)
        maskiou_p = P.reduce_max(maskiou_p, dim=[2, 3])             # max pooling, (21, 80)
        temp_mask = P.gather(class_vectors, label_t)                # one-hot mask (21, 81)
        temp_mask = temp_mask[:, 1:]                                # one-hot mask (21, 80)
        maskiou_p = temp_mask * maskiou_p                           # keep only the channel of the true class (21, 80)
        maskiou_p = P.reduce_sum(maskiou_p, dim=1, keep_dim=True)   # (21, 1)
        loss_i = P.smooth_l1(
            maskiou_p, P.reshape(maskiou_t, (P.shape(maskiou_t)[0], 1)))
        maskiou_alpha = 25.0
        losses['I'] = maskiou_alpha * P.reduce_sum(loss_i)

    # 5. semantic_segmentation_loss, computed only for positive samples
    mask_h = P.shape(pred_segm)[2]
    mask_w = P.shape(pred_segm)[3]
    loss_s = 0.0
    for idx in range(batch_size):
        cur_segment = pred_segm[idx]   # (80, 69, 69)
        l = P.sigmoid_cross_entropy_with_logits(cur_segment, segment_t[idx])
        loss_s += P.reduce_sum(l)
    semantic_segmentation_alpha = 1.0
    losses['S'] = loss_s / mask_h / mask_w * semantic_segmentation_alpha

    total_num_pos = P.cast(P.reduce_sum(labels_pos_mask), 'float32')
    for k in losses:
        if k not in ('S', ):
            losses[k] /= total_num_pos
        else:
            losses[k] /= batch_size
    total_loss = 0.0
    for k in losses:
        total_loss += losses[k]

    # Loss Key:
    #  - B: Box Localization Loss
    #  - M: Mask Loss
    #  - C: Class Confidence Loss
    #  - I: MaskIou Loss
    #  - S: Semantic Segmentation Loss
    # return losses['M'], losses['C']
    return losses, total_loss
def _forward(self, inputs, is_training): """ Real forward process of model in different mode(train/test). """ outputs = {} src_token = inputs["src_token"] src_mask = inputs["src_mask"] src_pos = inputs["src_pos"] src_type = inputs["src_type"] src_turn = inputs["src_turn"] tgt_token = inputs["tgt_token"][:, :-1] tgt_mask = inputs["tgt_mask"][:, :-1] tgt_pos = inputs["tgt_pos"][:, :-1] tgt_type = inputs["tgt_type"][:, :-1] tgt_turn = inputs["tgt_turn"][:, :-1] input_mask = layers.concat([src_mask, tgt_mask], axis=1) input_mask.stop_gradient = True src_embed = self.embedder(src_token, src_pos, src_type, src_turn) tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn) embed = layers.concat([src_embed, tgt_embed], axis=1) embed = self.embed_layer_norm(embed) batch_size = src_token.shape[0] src_len = src_token.shape[1] tgt_len = tgt_token.shape[1] if self.num_latent > 0: post_embed, post_probs, post_logits = self._posteriori_network( input_mask, embed, batch_size, src_len, tgt_len) outputs["post_logits"] = post_logits if self.use_discriminator: pos_probs, neg_probs = self._discriminator_network( input_mask, embed, batch_size, src_len, tgt_len, post_embed) outputs["pos_probs"] = pos_probs outputs["neg_probs"] = neg_probs if is_training: z = F.gumbel_softmax(post_logits, self.tau) else: indices = layers.argmax(post_logits, axis=1) z = layers.one_hot(F.unsqueeze(indices, [1]), self.num_latent) latent_embeddings = self.latent_embeddings latent_embed = layers.matmul(z, latent_embeddings) outputs["latent_embed"] = latent_embed else: latent_embed = None latent_embed, dec_probs = self._generation_network( input_mask, embed, batch_size, src_len, tgt_len, latent_embed) outputs["dec_probs"] = dec_probs if self.num_latent > 0 and self.with_bow: if self.two_layer_predictor: latent_embed = self.pre_bow_predictor(latent_embed) bow_logits = self.bow_predictor(latent_embed) bow_probs = layers.softmax(bow_logits) outputs["bow_probs"] = bow_probs return outputs
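# Hedged NumPy rendering of the latent selection in _forward() above: at training time a
# Gumbel-Softmax sample of the posterior logits gives a soft latent code, at inference time
# the argmax is turned into a one-hot vector; either way the latent embedding is
# z @ latent_embeddings. Shapes and weights below are illustrative placeholders.
import numpy as np
num_latent, hidden, tau = 5, 8, 0.67
post_logits = np.random.randn(2, num_latent)
latent_embeddings = np.random.randn(num_latent, hidden)

def gumbel_softmax(logits, tau, rng=np.random.default_rng(0)):
    g = -np.log(-np.log(rng.uniform(size=logits.shape)))    # Gumbel(0, 1) noise
    y = (logits + g) / tau
    e = np.exp(y - y.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

z_train = gumbel_softmax(post_logits, tau)                       # soft sample (training)
z_infer = np.eye(num_latent)[post_logits.argmax(axis=1)]         # hard one-hot (inference)
print((z_train @ latent_embeddings).shape, (z_infer @ latent_embeddings).shape)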
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 dec_inputs=None,
                 enc_output=None,
                 caches=None,
                 gather_idx=None,
                 bos_idx=0):
    """
    The wrapper assembles together all needed layers for the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \
            make_all_inputs(decoder_data_input_fields)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
    dec_input = prepare_decoder(trg_word,
                                trg_pos,
                                trg_vocab_size,
                                d_model,
                                max_length,
                                prepostprocess_dropout,
                                bos_idx=bos_idx,
                                word_emb_param_name="src_word_emb_table"
                                if weight_sharing else "trg_word_emb_table")
    dec_output = decoder(dec_input,
                         enc_output,
                         trg_slf_attn_bias,
                         trg_src_attn_bias,
                         n_layer,
                         n_head,
                         d_key,
                         d_value,
                         d_model,
                         d_inner_hid,
                         prepostprocess_dropout,
                         attention_dropout,
                         relu_dropout,
                         preprocess_cmd,
                         postprocess_cmd,
                         caches=caches,
                         gather_idx=gather_idx)
    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
    dec_output = layers.reshape(dec_output,
                                shape=[-1, dec_output.shape[-1]],
                                inplace=True)
    if weight_sharing:
        predict = layers.matmul(
            x=dec_output,
            y=fluid.default_main_program().global_block().var(
                "trg_word_emb_table"),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
                            size=trg_vocab_size,
                            bias_attr=False)
    if dec_inputs is None:
        # Return probs for independent decoder program.
        predict = layers.softmax(predict)
    return predict
def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \ default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape(last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape(real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell
def forward(self): """Build the GATNE net. """ param_attr_init = fluid.initializer.Uniform( low=-1.0, high=1.0, seed=np.random.randint(100)) embed_param_attrs = fluid.ParamAttr(name='Base_node_embed', initializer=param_attr_init) # node_embeddings base_node_embed = fl.embedding( input=fl.reshape(self.train_inputs, shape=[-1, 1]), size=[self.num_nodes, self.embedding_size], param_attr=embed_param_attrs) node_features = [] for edge_type in self.edge_types: param_attr_init = fluid.initializer.Uniform( low=-1.0, high=1.0, seed=np.random.randint(100)) embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' % edge_type, initializer=param_attr_init) features = fl.embedding( input=self.gw[edge_type].node_feat['index'], size=[self.num_nodes, self.embedding_u_size], param_attr=embed_param_attrs) node_features.append(features) # mp_output: list of embedding(self.num_nodes, dim) mp_output = self.message_passing(self.gw, self.edge_types, node_features) # U : (num_type[m], num_nodes, dim[s]) node_type_embed = fl.stack(mp_output, axis=0) # U : (num_nodes, num_type[m], dim[s]) node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2]) #gather node_type_embed from train_inputs node_type_embed = fl.gather(node_type_embed, self.train_inputs) # M_r trans_weights = fl.create_parameter( shape=[ self.edge_type_count, self.embedding_u_size, self.embedding_size // self.att_head ], attr=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)), dtype='float32', name='trans_w') # W_r trans_weights_s1 = fl.create_parameter( shape=[self.edge_type_count, self.embedding_u_size, self.dim_a], attr=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)), dtype='float32', name='trans_w_s1') # w_r trans_weights_s2 = fl.create_parameter( shape=[self.edge_type_count, self.dim_a, self.att_head], attr=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)), dtype='float32', name='trans_w_s2') trans_w = fl.gather(trans_weights, self.train_types) trans_w_s1 = fl.gather(trans_weights_s1, self.train_types) trans_w_s2 = fl.gather(trans_weights_s2, self.train_types) attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2) node_type_embed = fl.matmul(attention, node_type_embed) node_embed = base_node_embed + fl.reshape( fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size]) self.last_node_embed = fl.l2_normalize(node_embed, axis=1) nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)) nce_weight_attrs = fluid.ParamAttr(name='nce_weight', initializer=nce_weight_initializer) weight_pos = fl.embedding(input=self.train_labels, size=[self.num_nodes, self.embedding_size], param_attr=nce_weight_attrs) weight_neg = fl.embedding(input=self.train_negs, size=[self.num_nodes, self.embedding_size], param_attr=nce_weight_attrs) tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1]) pos_logits = fl.matmul(tmp_node_embed, weight_pos, transpose_y=True) # [B, 1, 1] neg_logits = fl.matmul(tmp_node_embed, weight_neg, transpose_y=True) # [B, 1, neg_num] pos_score = fl.squeeze(pos_logits, axes=[1]) pos_score = fl.clip(pos_score, min=-10, max=10) pos_score = -1.0 * fl.logsigmoid(pos_score) neg_score = fl.squeeze(neg_logits, axes=[1]) neg_score = fl.clip(neg_score, min=-10, max=10) neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score) neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True) self.loss = fl.reduce_mean(pos_score + 
neg_score)
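# NumPy sketch of the negative-sampling objective computed above: positive pairs are pushed
# through -log(sigmoid(score)) and negatives through -log(sigmoid(-score)), with scores
# clipped to [-10, 10] for numerical stability. Embeddings below are random placeholders.
import numpy as np

def logsigmoid(x):
    return -np.logaddexp(0.0, -x)   # log(sigmoid(x)) computed stably

emb = np.random.randn(3, 16)              # batch of node embeddings
pos = np.random.randn(3, 16)              # positive target embeddings
neg = np.random.randn(3, 5, 16)           # 5 negative samples per node

pos_score = np.clip((emb * pos).sum(-1), -10, 10)
neg_score = np.clip(np.einsum('bd,bnd->bn', emb, neg), -10, 10)
loss = (-logsigmoid(pos_score) + (-logsigmoid(-neg_score)).sum(axis=1)).mean()
print(loss)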
def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): """ Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if use_cache is False: if self.fuse: q, k, v = self._fuse_prepare_qkv(query) else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) if self.dropout: weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": MPPP_MESH_LIST[self.mesh_idx], "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx], "dims_mapping": [1, -1] }) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs)
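# NumPy sketch of the "combine heads" step near the end of forward() above: the per-head
# context [batch, n_head, seq, head_dim] is transposed to [batch, seq, n_head, head_dim]
# and the last two axes are merged back into the model dimension before the output
# projection. Sizes are illustrative only.
import numpy as np
batch, n_head, seq, head_dim = 2, 4, 6, 8
out = np.random.randn(batch, n_head, seq, head_dim)
out = out.transpose(0, 2, 1, 3).reshape(batch, seq, n_head * head_dim)
print(out.shape)   # (2, 6, 32)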