def concat_coord(x):
    ins_feat = x  # [N, c, h, w]

    batch_size = L.shape(x)[0]
    h = L.shape(x)[2]
    w = L.shape(x)[3]
    float_h = L.cast(h, 'float32')
    float_w = L.cast(w, 'float32')

    y_range = L.range(0., float_h, 1., dtype='float32')     # [h, ]
    y_range = 2.0 * y_range / (float_h - 1.0) - 1.0
    x_range = L.range(0., float_w, 1., dtype='float32')     # [w, ]
    x_range = 2.0 * x_range / (float_w - 1.0) - 1.0
    x_range = L.reshape(x_range, (1, -1))   # [1, w]
    y_range = L.reshape(y_range, (-1, 1))   # [h, 1]
    x = L.expand(x_range, [h, 1])           # [h, w]
    y = L.expand(y_range, [1, w])           # [h, w]

    x = L.reshape(x, (1, 1, h, w))          # [1, 1, h, w]
    y = L.reshape(y, (1, 1, h, w))          # [1, 1, h, w]
    x = L.expand(x, [batch_size, 1, 1, 1])  # [N, 1, h, w]
    y = L.expand(y, [batch_size, 1, 1, 1])  # [N, 1, h, w]

    ins_kernel_feat = L.concat([ins_feat, x, y], axis=1)    # [N, c+2, h, w]

    return ins_kernel_feat
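# Usage sketch (added for illustration, not part of the original source): feeding a feature
# map through concat_coord in a static-graph program, assuming `L` is paddle.fluid.layers
# on a Paddle 1.x install. The feed name 'feat' and the 8-channel input are arbitrary.
def _demo_concat_coord():
    import numpy as np
    import paddle.fluid as fluid

    prog, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(prog, startup):
        feat = fluid.data(name='feat', shape=[-1, 8, -1, -1], dtype='float32')
        out = concat_coord(feat)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    res, = exe.run(prog,
                   feed={'feat': np.random.rand(2, 8, 5, 7).astype('float32')},
                   fetch_list=[out])
    print(res.shape)  # expected (2, 10, 5, 7): two coordinate channels appended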
def is_finished(self, step_idx, source_length, alive_log_probs, finished_scores,
                finished_in_finished):
    """
    is_finished
    """
    base_1 = layers.cast(source_length, 'float32') + 55.0
    base_1 /= 6.0
    max_length_penalty = layers.pow(base_1, self.alpha)

    flat_alive_log_probs = layers.reshape(alive_log_probs, [-1])
    lower_bound_alive_scores_1 = layers.gather(flat_alive_log_probs, [self.get_alive_index])

    lower_bound_alive_scores = lower_bound_alive_scores_1 / max_length_penalty

    lowest_score_of_finished_in_finish = layers.reduce_min(
        finished_scores * finished_in_finished, dim=1)

    finished_in_finished = layers.cast(finished_in_finished, 'bool')
    lowest_score_of_finished_in_finish += \
        ((1.0 - layers.cast(layers.reduce_any(finished_in_finished, 1), 'float32')) * -INF)
    #print lowest_score_of_finished_in_finish

    bound_is_met = layers.reduce_all(
        layers.greater_than(lowest_score_of_finished_in_finish, lower_bound_alive_scores))

    decode_length = source_length + 50
    length_cond = layers.less_than(x=step_idx, y=decode_length)

    return layers.logical_and(x=layers.logical_not(bound_is_met), y=length_cond)
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''):
    """
    Add residual connection, layer normalization and dropout to the out tensor
    optionally according to the value of process_cmd.
    This will be used before or after multi-head attention and position-wise
    feed-forward networks.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # add layer normalization
            out_dtype = out.dtype
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name='', is_test=False):
    for cmd in process_cmd:
        if cmd == "a":  # add the two inputs (residual connection)
            out = out + prev_out if prev_out else out
        elif cmd == "n":  # apply layer normalization
            out_type = out.dtype
            if out_type == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_type == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # apply dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=is_test)
    return out
def input_true(x, condition, reverse=False):
    """Keep instances in x whose corresponding condition is true.

    Args:
        x (Variable): shape = [batch_size, ...]
        condition (Variable): shape = [batch_size, 1]
        reverse (Variable): Default is False

    Returns: TODO

    Raises: NULL
    """
    x_dtype = x.dtype
    if x_dtype == PaddleVarType.bool:
        x = layers.cast(x, dtype='int32')

    if condition.dtype != x.dtype:
        condition = layers.cast(condition, dtype=x.dtype)

    if reverse:
        condition = 1.0 - condition

    output = layers.elementwise_mul(x, condition, axis=0)
    if x_dtype == PaddleVarType.bool:
        output = layers.cast(output, dtype=x_dtype)
    return output
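# Usage sketch (added for illustration, not part of the original source): a quick dygraph
# check of input_true, assuming `layers` is paddle.fluid.layers and that PaddleVarType is
# the module's own dtype helper, on a Paddle 1.x install.
def _demo_input_true():
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(np.array([[1., 2.], [3., 4.]], dtype='float32'))
        cond = fluid.dygraph.to_variable(np.array([[1.], [0.]], dtype='float32'))
        # rows whose condition is 0 are zeroed out; expected [[1., 2.], [0., 0.]]
        print(input_true(x, cond).numpy())
        # reverse=True flips the condition; expected [[0., 0.], [3., 4.]]
        print(input_true(x, cond, reverse=True).numpy())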
def __call__(self, msg):
    alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
    if attn_drop:
        old_h = alpha
        dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
        u = L.uniform_random(
            shape=L.cast(L.shape(alpha)[:1], 'int64'), min=0., max=1.)
        keeped = L.cast(u > dropout, dtype="float32")
        self_attn_mask = L.scale(
            x=keeped, scale=10000.0, bias=-1.0, bias_after_scale=False)
        n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads, axis=1)
        n_head_self_attn_mask.stop_gradient = True
        alpha = n_head_self_attn_mask + alpha
        alpha = L.lod_reset(alpha, old_h)

    h = msg["v"]
    alpha = paddle_helper.sequence_softmax(alpha)
    self.alpha = alpha
    old_h = h
    h = h * alpha
    h = L.lod_reset(h, old_h)
    h = L.sequence_pool(h, "sum")
    if concat:
        h = L.reshape(h, [-1, num_heads * hidden_size])
    else:
        h = L.reduce_mean(h, dim=1)
    return h
def elementwise_op_wrapper(cls, op, x, y, *args, force=False, axis=-1, act=None, name=None):
    """wrapper of elementwise op

    Args:
        op (TYPE): NULL
        x (TYPE): NULL
        y (TYPE): NULL
        *args (TYPE): NULL
        force (TYPE): Default is False
        axis (TYPE): Default is -1
        act (TYPE): Default is None
        name (TYPE): Default is None

    Returns: TODO

    Raises: NULL
    """
    x_dtype = x.dtype
    if x_dtype == PaddleVarType.bool:
        x = layers.cast(x, dtype='int32')

    tmp = x
    extras = [y] + list(args)
    for var in extras:
        if var.dtype != tmp.dtype and force:
            var = layers.cast(var, dtype=x.dtype)
        elif var.dtype == PaddleVarType.bool and x_dtype == PaddleVarType.bool:
            var = layers.cast(var, dtype=x.dtype)
        tmp = op(x=tmp, y=var, axis=axis, act=act, name=name)

    if x_dtype == PaddleVarType.bool:
        tmp = layers.cast(tmp, dtype=x_dtype)
    return tmp
def mask_fill(input, mask, value):
    """Fill `value` into `input` at positions where `mask` is True.

    Args:
        input: input matrix
        mask: mask matrix
        value: fill value

    Returns:
        output

    >>> input
    [
        [1, 2, 3],
        [4, 5, 6]
    ]
    >>> mask
    [
        [True, True, False],
        [True, False, False]
    ]
    >>> mask_fill(input, mask, 0)
    [
        [0, 0, 3],
        [0, 5, 6]
    ]
    """
    return input * layers.cast(layers.logical_not(mask), input.dtype) \
        + layers.cast(mask, input.dtype) * value
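# Usage sketch (added for illustration, not part of the original source): a minimal dygraph
# check of mask_fill, assuming `layers` is paddle.fluid.layers on a Paddle 1.x install.
def _demo_mask_fill():
    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(
            np.array([[1., 2., 3.], [4., 5., 6.]], dtype='float32'))
        mask = layers.cast(
            fluid.dygraph.to_variable(np.array([[1, 1, 0], [1, 0, 0]], dtype='int32')),
            'bool')
        # positions where mask is True are replaced by 0; expected [[0. 0. 3.] [0. 5. 6.]]
        print(mask_fill(x, mask, 0.).numpy())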
def _build_position_ids(self, src_ids):
    src_shape = L.shape(src_ids)
    src_seqlen = src_shape[1]
    src_batch = src_shape[0]

    slot_seqlen = self.slot_seqlen

    num_b = (src_seqlen / slot_seqlen) - 1
    a_position_ids = L.reshape(
        L.range(0, slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen, 1]

    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32")  # assume pad id == 0, [B, slot_seqlen, 1]
    a_pad_len = L.reduce_sum(input_mask, 1)  # [B, 1, 1]

    b_position_ids = L.reshape(
        L.range(slot_seqlen, 2 * slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1])  # [B, slot_seqlen * num_b, 1]
    b_position_ids = b_position_ids - a_pad_len  # [B, slot_seqlen * num_b, 1]

    position_ids = L.concat([a_position_ids, b_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def build_program(self, dtype):
    with fluid.program_guard(self.main_program, self.startup_program):
        self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 2)
        self.feed_vars.append(
            fluid.data(
                name="data2", shape=[128, 128], dtype=dtype))

        # subgraph with 2 op nodes
        tmp_0 = self.feed_vars[0] * self.feed_vars[1]
        tmp_1 = layers.cast(tmp_0, dtype="float16")
        zero = layers.fill_constant(shape=[128], dtype="float16", value=0)
        # TODO(xreki): fix precision problem when using softmax of float16.
        # tmp_2 = layers.softmax(tmp_1)
        tmp_2 = layers.elementwise_add(tmp_1, zero)
        tmp_3 = layers.mul(tmp_0, self.feed_vars[2])
        # subgraph with 4 op nodes
        tmp_3 = layers.cast(tmp_2, dtype="float16")
        tmp_4 = layers.relu(tmp_1 + tmp_3)
        tmp_5 = layers.cast(tmp_4, dtype=dtype)
        tmp_3 = layers.cast(tmp_2, dtype=dtype)

        self.append_gradients(tmp_5)

        self.num_fused_ops = 4
        self.fetch_list = [tmp_5, self.grad(tmp_0)]
def build_position_ids(src_ids, dst_ids):
    src_shape = L.shape(src_ids)
    src_batch = src_shape[0]
    src_seqlen = src_shape[1]
    dst_seqlen = src_seqlen - 1  # without cls

    src_position_ids = L.reshape(
        L.range(0, src_seqlen, 1, dtype='int32'),
        [1, src_seqlen, 1],
        inplace=True)  # [1, src_seqlen, 1]
    src_position_ids = L.expand(src_position_ids, [src_batch, 1, 1])  # [B, src_seqlen, 1]
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.equal(src_ids, zero), "int32")  # assume pad id == 0, [B, src_seqlen, 1]
    src_pad_len = L.reduce_sum(input_mask, 1, keep_dim=True)  # [B, 1, 1]

    dst_position_ids = L.reshape(
        L.range(src_seqlen, src_seqlen + dst_seqlen, 1, dtype='int32'),
        [1, dst_seqlen, 1],
        inplace=True)  # [1, dst_seqlen, 1]
    dst_position_ids = L.expand(dst_position_ids, [src_batch, 1, 1])  # [B, dst_seqlen, 1]
    dst_position_ids = dst_position_ids - src_pad_len  # [B, dst_seqlen, 1]

    position_ids = L.concat([src_position_ids, dst_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx):
    pre_ids = alive_seq

    dec_step_emb = layers.embedding(
        input=pre_ids,
        size=[self.tar_vocab_size, self.hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='target_embedding',
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale)))

    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
        dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array, enc_memory)

    projection = layers.matmul(dec_att_out, softmax_weight)

    logits = layers.softmax(projection)
    current_log = layers.elementwise_add(x=layers.log(logits), y=alive_log_prob, axis=0)
    base_1 = layers.cast(step_idx, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, alpha)

    len_pen = layers.pow(
        ((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha)

    current_log = layers.reshape(current_log, shape=[1, -1])
    current_log = current_log / length_penalty
    topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size)

    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_log_probs = topk_scores * length_penalty

    generate_id = layers.reshape(topk_indices, shape=[-1]) % self.tar_vocab_size
    selected_beam = layers.reshape(topk_indices, shape=[-1]) // self.tar_vocab_size

    topk_finished = layers.equal(generate_id, eos_ids)
    topk_finished = layers.cast(topk_finished, 'float32')

    generate_id = layers.reshape(generate_id, shape=[-1, 1])
    pre_tokens_list = layers.gather(tokens, selected_beam)
    full_tokens_list = layers.concat([pre_tokens_list, generate_id], axis=1)

    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
        dec_att_out, new_hidden_array, new_cell_array
def batch_scatter(ref, indices, updates, in_place=False, overwrite=False):
    """Scatter updates to ref, according to the corresponding index in indices
    in each batch. Currently, it only supports 2d Tensors.

    Args:
        ref (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, 1]
        updates (Variable): with shape [batch_size]
        in_place (bool): if True, the scatter result will be assigned to ref.
                         Otherwise, a new Tensor will be returned. Default is False.
        overwrite (bool): if True, scatter will overwrite the corresponding elements.
                          Default is False.

    Returns: TODO

    Raises: NULL

    Examples:
        ref
            [[1, 1, 1],
             [1, 1, 1]]
        indices
            [[2], [1]]
        updates
            [2, 3]

        return
            [[1, 1, 2],
             [1, 3, 1]]
    """
    ref_dtype = ref.dtype
    if ref_dtype not in PaddleVarType.floats:
        ref_in = layers.cast(ref, dtype='float32')
    else:
        ref_in = ref

    if updates.dtype != ref_in.dtype:
        updates = layers.cast(updates, dtype=ref_in.dtype)

    batch_size = layers.cast(layers.shape(ref_in)[0], dtype=indices.dtype)
    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    batch_indices = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=indices.dtype), [1])
    coord = layers.concat([batch_indices, indices], axis=1)
    if overwrite:
        mask = layers.gather_nd(ref_in, coord)
        mask = layers.elementwise_sub(layers.zeros_like(mask), mask)
        ref_in = layers.scatter_nd_add(ref_in, coord, mask)

    output = layers.scatter_nd_add(ref_in, coord, updates)

    if ref_dtype not in PaddleVarType.floats:
        output = layers.cast(output, dtype=ref_dtype)
    if in_place:
        layers.assign(output, ref)
        return ref
    else:
        return output
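# Usage sketch (added for illustration, not part of the original source): reproducing the
# docstring example of batch_scatter in dygraph mode, assuming `layers` is
# paddle.fluid.layers and PaddleVarType is the module's dtype helper, on a Paddle 1.x install.
def _demo_batch_scatter():
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        ref = fluid.dygraph.to_variable(np.ones((2, 3), dtype='float32'))
        indices = fluid.dygraph.to_variable(np.array([[2], [1]], dtype='int64'))
        updates = fluid.dygraph.to_variable(np.array([2., 3.], dtype='float32'))
        # overwrite=True replaces the addressed elements; expected [[1., 1., 2.], [1., 3., 1.]]
        print(batch_scatter(ref, indices, updates, overwrite=True).numpy())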
def _debug_summary(self, input_mask):
    # histogram
    seqlen_before_pad = L.cast(L.reduce_sum(input_mask, dim=1), dtype='float32')
    seqlen_after_pad = L.reduce_sum(
        L.cast(L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1)
    pad_num = seqlen_after_pad - seqlen_before_pad
    pad_rate = pad_num / seqlen_after_pad
def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore, [bsz, beam_width, -1])[:, 0, :]  # the first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx),
        state.finished.shape)  # gather new beam state according to new beam id
    #log.debug(gather_idx.numpy())
    #log.debug(state.finished.numpy())
    #log.debug(next_finished.numpy())

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')
    #log.debug(next_word_id.numpy())
    #log.debug(next_beam_id.numpy())

    next_state = BeamSearchState(
        log_probs=next_probs, lengths=next_len, finished=next_finished)
    output = BeamSearchOutput(
        scores=scores, predicted_ids=next_word_id, beam_parent_ids=next_beam_id)

    return output, next_state
def unsqueeze(input, axes):
    """Increase the number of axes of input"""
    input_dtype = input.dtype
    if input_dtype == VarDesc.VarType.BOOL:
        input = layers.cast(input, 'int32')
    output = layers.unsqueeze(input, axes=axes)
    if input_dtype == VarDesc.VarType.BOOL:
        output = layers.cast(output, 'bool')
    return output
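# Usage sketch (added for illustration, not part of the original source): unsqueezing a bool
# tensor through the wrapper above, assuming `layers` and `VarDesc` are the module's imports
# (paddle.fluid.layers and paddle.fluid.core.VarDesc) on a Paddle 1.x install.
def _demo_unsqueeze_bool():
    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    with fluid.dygraph.guard():
        flag = layers.cast(
            fluid.dygraph.to_variable(np.array([1, 0, 1], dtype='int32')), 'bool')
        out = unsqueeze(flag, axes=[1])
        print(out.shape)  # expected [3, 1], with the bool dtype preserved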
def get_enc_bias(source_inputs):
    """
    get_enc_bias
    """
    source_inputs = layers.cast(source_inputs, 'float32')
    emb_sum = layers.reduce_sum(layers.abs(source_inputs), dim=-1)
    zero = layers.fill_constant([1], 'float32', value=0)
    bias = layers.cast(layers.equal(emb_sum, zero), 'float32') * -1e9
    return layers.unsqueeze(layers.unsqueeze(bias, axes=[1]), axes=[1])
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), binary (0/1) masks
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str): 'linear' or 'gaussian'
        sigma (float): std in gaussian method
        sum_masks (Tensor): shape (n, ), areas of the n objects

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = L.shape(cate_labels)[0]  # number of objects

    seg_masks = L.reshape(seg_masks, (n_samples, -1))  # [n, h*w]
    # inter.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)  # [n, n] masks times their own transpose: pairwise intersection areas
    # union.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])  # [n, n] sum_masks repeated over n rows
    # iou.
    iou_matrix = inter_matrix / (sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix  # [n, n] keep only the upper triangle
    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # [n, n] cate_labels repeated over n rows
    label_matrix = L.cast(L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix  # [n, n] keep only the upper triangle

    # IoU compensation
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])  # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])  # [n, n]

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min(decay_matrix / compensate_matrix, dim=0)
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape
    # As batch_size == 1 in this hub module, the first dim (bsz * beam_size) equals beam_size.
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # set the [UNK] prob to 0
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore, [bsz, beam_width, -1])[:, 0, :]  # the first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx),
        state.finished.shape)  # gather new beam state according to new beam id

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(
        log_probs=next_probs, lengths=next_len, finished=next_finished)
    output = BeamSearchOutput(
        scores=scores, predicted_ids=next_word_id, beam_parent_ids=next_beam_id)

    return output, next_state
def norm(param, dim, power):
    powered = F.pow(param, power)
    in_dtype = powered.dtype
    if in_dtype == fluid.core.VarDesc.VarType.FP16:
        powered = F.cast(powered, "float32")
    powered_norm = F.reduce_sum(powered, dim=dim, keep_dim=False)
    norm_ = F.pow(powered_norm, 1. / power)
    if in_dtype == fluid.core.VarDesc.VarType.FP16:
        norm_ = F.cast(norm_, "float16")
    return norm_
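# Usage sketch (added for illustration, not part of the original source): a per-row L2 norm
# computed with norm(), assuming `F` aliases paddle.fluid.layers as in the snippet above,
# on a Paddle 1.x install in dygraph mode.
def _demo_norm():
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        w = fluid.dygraph.to_variable(np.array([[3., 4.], [6., 8.]], dtype='float32'))
        # sum of squares along dim=1, then the 1/power root; expected [5., 10.]
        print(norm(w, dim=1, power=2.0).numpy())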
def build_model(self):
    node_features = self.graph_wrapper.node_feat["feat"]

    output = self.gcn(gw=self.graph_wrapper,
                      feature=node_features,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_1")
    output1 = output
    output = self.gcn(gw=self.graph_wrapper,
                      feature=output,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_2")
    output2 = output
    output = self.gcn(gw=self.graph_wrapper,
                      feature=output,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_3")

    output = L.concat(input=[output1, output2, output], axis=-1)

    output, ratio_length = sag_pool(gw=self.graph_wrapper,
                                    feature=output,
                                    ratio=self.pooling_ratio,
                                    graph_id=self.graph_id,
                                    dataset=self.args.dataset_name,
                                    name="sag_pool_1")
    output = L.lod_reset(output, self.graph_wrapper.graph_lod)
    cat1 = L.sequence_pool(output, "sum")
    ratio_length = L.cast(ratio_length, dtype="float32")
    cat1 = L.elementwise_div(cat1, ratio_length, axis=-1)
    cat2 = L.sequence_pool(output, "max")
    output = L.concat(input=[cat2, cat1], axis=-1)

    output = L.fc(output, size=self.hidden_size, act="relu")
    output = L.dropout(output, dropout_prob=self.dropout_ratio)
    output = L.fc(output, size=self.hidden_size // 2, act="relu")
    output = L.fc(output, size=self.num_classes, act=None,
                  param_attr=fluid.ParamAttr(name="final_fc"))

    self.labels = L.cast(self.labels, dtype="float32")
    loss = L.sigmoid_cross_entropy_with_logits(x=output, label=self.labels)
    self.loss = L.mean(loss)
    pred = L.sigmoid(output)
    self.pred = L.argmax(x=pred, axis=-1)
    correct = L.equal(self.pred, self.labels_1dim)
    correct = L.cast(correct, dtype="int32")
    self.correct = L.reduce_sum(correct)
def fluid_get_offset(seq_len):
    """
    args:
        seq_len: (-1)
    return:
        offset: the same shape as seq_len, cumsum(seq_len) - seq_len
    """
    assert len(seq_len.shape) == 1
    csum = layers.cumsum(layers.cast(seq_len, 'float32'), exclusive=True)
    return layers.cast(csum, 'int64')
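# Usage sketch (added for illustration, not part of the original source): offsets recovered
# from sequence lengths via the exclusive cumsum above, assuming `layers` is
# paddle.fluid.layers on a Paddle 1.x install.
def _demo_fluid_get_offset():
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        seq_len = fluid.dygraph.to_variable(np.array([3, 2, 4], dtype='int64'))
        # exclusive cumsum of [3, 2, 4]; expected [0, 3, 5]
        print(fluid_get_offset(seq_len).numpy())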
def uniq_edges(src, dst, num_nodes):
    sorted_dst = L.cast(dst, dtype="int64")
    sorted_src = L.cast(src, dtype="int64")
    num_nodes = L.cast(num_nodes, dtype="int64")
    edge_hash = sorted_dst * num_nodes + sorted_src
    edge_hash, _ = L.argsort(edge_hash)
    edge_hash, _ = L.unique(edge_hash, dtype="int64")
    sorted_src = L.elementwise_mod(edge_hash, num_nodes)
    sorted_dst = L.elementwise_div(edge_hash, num_nodes)
    sorted_src = L.cast(sorted_src, dtype="int32")
    sorted_dst = L.cast(sorted_dst, dtype="int32")
    return sorted_src, sorted_dst
def forward(self, features):
    src_ids, sent_ids = features
    dtype = 'float16' if self.hparam['fp16'] else 'float32'
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype)  # assume pad id == 0
    #input_mask = L.unsqueeze(input_mask, axes=[2])
    d_shape = L.shape(src_ids)
    seqlen = d_shape[1]
    batch_size = d_shape[0]
    pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id  # task ids are not used at the moment
    task_ids.stop_gradient = True

    bert = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=self.hparam,
        use_fp16=self.hparam['fp16'])

    cls_feats = bert.get_pooled_output()
    cls_feats = L.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")

    logits = L.fc(
        input=cls_feats,
        size=self.hparam['num_label'],
        param_attr=F.ParamAttr(
            name="cls_out_w",
            initializer=F.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=F.ParamAttr(
            name="cls_out_b",
            initializer=F.initializer.Constant(0.)))

    propeller.summary.histogram('pred', logits)

    if self.mode is propeller.RunMode.PREDICT:
        probs = L.softmax(logits)
        return probs
    else:
        return logits
def build_program(self, dtype):
    with fluid.program_guard(self.main_program, self.startup_program):
        self.feed_vars = self._prepare_feed_vars([2, 2], dtype, 2)

        tmp_0 = layers.elementwise_add(self.feed_vars[0], self.feed_vars[1])
        tmp_1 = layers.cast(tmp_0, dtype="float64")
        tmp_2 = layers.cast(tmp_1, dtype="float32")

        self.append_gradients(tmp_2)

        self.num_fused_ops = 2
        self.fetch_list = [tmp_2, self.grad(tmp_0)]
def forward(self, q, k, v, lengths, speaker_embed, start_index,
            force_monotonic=False, prev_coeffs=None, window=None):
    # add position encoding as an inductive bias
    if self.has_bias:  # multi-speaker model
        omega_q = 2 * F.sigmoid(
            F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
        omega_k = 2 * self.omega_initial * F.sigmoid(
            F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
    else:  # single-speaker case
        batch_size = q.shape[0]
        omega_q = F.ones((batch_size, ), dtype="float32")
        omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
    q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
    k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

    q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
    activations = F.matmul(q, k, transpose_y=True)
    activations /= np.sqrt(self.attention_dim)

    if self.training:
        # mask the <pad> parts from the encoder
        mask = F.sequence_mask(lengths, dtype="float32")
        attn_bias = F.scale(1. - mask, -1000)
        activations += F.unsqueeze(attn_bias, [1])
    elif force_monotonic:
        assert window is not None
        backward_step, forward_step = window
        T_enc = k.shape[1]
        batch_size, T_dec, _ = q.shape  # actually T_dec = 1 here

        alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
            if prev_coeffs is None \
            else F.argmax(prev_coeffs, axis=-1)
        backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
        forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
        mask = F.cast(F.logical_xor(backward, forward), "float32")
        # print("mask's shape:", mask.shape)
        attn_bias = F.scale(1. - mask, -1000)
        activations += attn_bias

    # softmax
    coefficients = F.softmax(activations, axis=-1)
    # context vector
    coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                             dropout_implementation='upscale_in_train')
    contexts = F.matmul(coefficients, v)
    # context normalization
    enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
    contexts *= F.sqrt(enc_lengths)
    # out affine
    contexts = self.out_affine(contexts)
    return contexts, coefficients
def forward(self, features):
    def FC(inputs, name, i, act):
        return L.fc(inputs,
                    self.hidden_size,
                    act=act,
                    param_attr=F.ParamAttr(
                        name='%s.fc.w_%d' % (name, i),
                        initializer=F.initializer.XavierInitializer(
                            fan_in=self.hidden_size,
                            fan_out=self.hidden_size)),
                    bias_attr=F.ParamAttr(
                        name='%s.fc.b_%d' % (name, i),
                        initializer=F.initializer.Constant(0.)))

    title_ids, comment_ids = features
    embedding_attr = F.ParamAttr(
        name='emb',
        initializer=F.initializer.XavierInitializer(
            fan_in=self.vocab_size, fan_out=self.embedding_size))

    title_encoded = L.embedding(title_ids, [self.vocab_size, self.embedding_size],
                                param_attr=embedding_attr)
    comment_encoded = L.embedding(comment_ids, [self.vocab_size, self.embedding_size],
                                  param_attr=embedding_attr)

    # Vsum
    zero = L.fill_constant(shape=[1], dtype='int64', value=0)
    title_pad = L.cast(L.logical_not(L.equal(title_ids, zero)), 'float32')
    comment_pad = L.cast(L.logical_not(L.equal(comment_ids, zero)), 'float32')

    title_encoded = L.reduce_sum(title_encoded * title_pad, dim=1)
    title_encoded = L.softsign(title_encoded)
    comment_encoded = L.reduce_sum(comment_encoded * comment_pad, dim=1)
    comment_encoded = L.softsign(comment_encoded)

    for i in range(self.num_layers):
        title_encoded = FC(title_encoded, 'title', i, 'tanh')

    for i in range(self.num_layers):
        comment_encoded = FC(comment_encoded, 'comment', i, 'tanh')

    score = L.reduce_sum(title_encoded * comment_encoded, dim=1,
                         keep_dim=True) / np.sqrt(self.hidden_size)
    if self.mode is propeller.RunMode.PREDICT:
        probs = L.sigmoid(score)
        return probs
    else:
        return score
def ffffffffffffffffffff(self, pred, target):
    '''
    Boxes are given in (cx, cy, w, h) format.
    '''
    assert pred.shape[0] == target.shape[0]

    pred = L.reshape(pred, [-1, 4])
    target = L.reshape(target, [-1, 4])
    pred = L.cast(pred, 'float32')
    target = L.cast(target, 'float32')

    # top-left corner of the intersection box
    tl = L.elementwise_max((pred[:, :2] - pred[:, 2:] / 2),
                           (target[:, :2] - target[:, 2:] / 2))
    # bottom-right corner of the intersection box
    br = L.elementwise_min((pred[:, :2] + pred[:, 2:] / 2),
                           (target[:, :2] + target[:, 2:] / 2))

    area_p = paddle.prod(pred[:, 2:], 1)    # area of the predicted box
    area_g = paddle.prod(target[:, 2:], 1)  # area of the ground-truth box

    # does the intersection box exist?
    # en = (tl < br).type(tl.type()).prod(dim=1)
    en = L.cast(tl < br, 'float32')
    en = paddle.prod(en, 1)  # 1 if the intersection box exists, else 0
    area_i = paddle.prod(br - tl, 1) * en
    area_u = area_p + area_g - area_i
    iou = (area_i) / (area_u + 1e-16)

    if self.loss_type == "iou":
        loss = 1 - iou ** 2
    elif self.loss_type == "giou":
        c_tl = L.elementwise_min((pred[:, :2] - pred[:, 2:] / 2),
                                 (target[:, :2] - target[:, 2:] / 2))
        c_br = L.elementwise_max((pred[:, :2] + pred[:, 2:] / 2),
                                 (target[:, :2] + target[:, 2:] / 2))
        area_c = paddle.prod(c_br - c_tl, 1)
        # clip area_c to the range [1e-16, np.inf]
        area_c = L.clip(area_c, 1e-16, np.inf)
        giou = iou - (area_c - area_u) / area_c
        # clip giou to the range [-1.0, 1.0]
        giou = L.clip(giou, -1.0, 1.0)
        loss = 1 - giou

    if self.reduction == "mean":
        loss = loss.mean()
    elif self.reduction == "sum":
        loss = loss.sum()

    return loss
def gen_bias(encoder_inputs, decoder_inputs, step):
    decoder_bsz, decoder_seqlen = decoder_inputs.shape[:2]
    attn_bias = L.reshape(
        L.range(0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
    decoder_bias = L.cast(
        (L.matmul(attn_bias, 1. / attn_bias, transpose_y=True) >= 1.),
        'float32')  # [1, decoderlen, decoderlen]
    encoder_bias = L.unsqueeze(
        L.cast(L.ones_like(encoder_inputs), 'float32'), [1])  # [bsz, 1, encoderlen]
    encoder_bias = L.expand(encoder_bias, [1, decoder_seqlen, 1])  # [bsz, decoderlen, encoderlen]
    decoder_bias = L.expand(decoder_bias, [decoder_bsz, 1, 1])  # [bsz, decoderlen, decoderlen]
    if step > 0:
        bias = L.concat([
            encoder_bias,
            L.ones([decoder_bsz, decoder_seqlen, step], 'float32'),
            decoder_bias
        ], -1)
    else:
        bias = L.concat([encoder_bias, decoder_bias], -1)
    return bias
def sag_pool(gw, feature, ratio, graph_id, dataset, name, activation=L.tanh):
    """Implementation of self-attention graph pooling (SAGPool)

    This is an implementation of the paper SELF-ATTENTION GRAPH POOLING
    (https://arxiv.org/pdf/1904.08082.pdf)

    Args:
        gw: Graph wrapper object.
        feature: A tensor with shape (num_nodes, feature_size).
        ratio: The pooling ratio of nodes we want to select.
        graph_id: The graphs that the nodes belong to.
        dataset: To differentiate FRANKENSTEIN dataset and other datasets.
        name: The name of SAGPool layer.
        activation: The activation function.

    Return:
        new_feature: A tensor with shape (num_nodes, feature_size), and the
                     unselected nodes' feature is masked by zero.
        ratio_length: The selected node numbers of each graph.
    """
    if dataset == "FRANKENSTEIN":
        gcn_ = gcn
    else:
        gcn_ = norm_gcn

    score = gcn_(gw=gw,
                 feature=feature,
                 hidden_size=1,
                 activation=None,
                 norm=gw.node_feat["norm"],
                 name=name)
    score = L.squeeze(score, axes=[])
    perm, ratio_length = topk_pool(gw, score, graph_id, ratio)

    mask = L.zeros_like(score)
    mask = L.cast(mask, dtype="float32")
    updates = L.ones_like(perm)
    updates = L.cast(updates, dtype="float32")
    mask = L.scatter(mask, perm, updates)
    new_feature = L.elementwise_mul(feature, mask, axis=0)
    temp_score = activation(score)
    new_feature = L.elementwise_mul(new_feature, temp_score, axis=0)
    return new_feature, ratio_length