def forward_grad(self, x):
    # horizontal gradient: pad one column on the right, then zero the last column
    x1 = fluid.layers.pad2d(x, paddings=[0, 0, 0, 1])
    grad_x = self.conv2df_grad(x1)
    temp = unstack(grad_x, axis=3)
    temp[-1] = temp[-1] * 0  # equivalent to grad_x[:, :, :, -1] = 0
    grad_x = stack(temp, axis=3)

    # vertical gradient: pad one row at the bottom, then zero the last row
    x2 = fluid.layers.pad2d(x, paddings=[0, 1, 0, 0])
    grad_y = self.conv2df_grad2(x2)
    temp = unstack(grad_y, axis=2)
    temp[-1] = temp[-1] * 0  # equivalent to grad_y[:, :, -1, :] = 0
    grad_y = stack(temp, axis=2)

    bt, c, h, w = grad_x.shape
    grad_x = fluid.layers.reshape(grad_x, [-1, c, h, w])
    grad_y = fluid.layers.reshape(grad_y, [-1, c, h, w])
    return grad_x, grad_y
def greedy_search_infilling(model,
                            q_ids,
                            q_sids,
                            sos_id,
                            eos_id,
                            attn_id,
                            max_encode_len=640,
                            max_decode_len=100,
                            tgt_type_id=3):
    model.eval()
    _, logits, info = model(q_ids, q_sids)
    gen_ids = L.argmax(logits, -1)
    d_batch, d_seqlen = q_ids.shape
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)
    has_stopped = np.zeros([d_batch], dtype=np.bool_)
    gen_seq_len = np.zeros([d_batch], dtype=np.int64)
    output_ids = []

    past_cache = info['caches']

    cls_ids = L.ones([d_batch], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch], dtype='int64') * attn_id
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)
        gen_ids = L.argmax(logits, -1)

        # concat newly generated key/value slices onto the cached ones
        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            L.concat([pk, k[:, :1, :]], 1)
            for pk, k in zip(past_cached_k, cached_k)
        ]
        cached_v = [
            L.concat([pv, v[:, :1, :]], 1)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        gen_ids = gen_ids[:, 1]
        ids = L.stack([gen_ids, attn_ids], 1)

        gen_ids = gen_ids.numpy()
        has_stopped |= (gen_ids == eos_id).astype(np.bool_)
        gen_seq_len += (1 - has_stopped.astype(np.int64))
        output_ids.append(gen_ids.tolist())
        if has_stopped.all():
            break
    output_ids = np.array(output_ids).transpose([1, 0])
    return output_ids
def seq2seq_api_rnn(input_embedding, len=3, init_hiddens=None, init_cells=None): class EncoderCell(layers.RNNCell): def __init__(self, num_layers, hidden_size, dropout_prob=0., forget_bias=0.): self.num_layers = num_layers self.hidden_size = hidden_size self.dropout_prob = dropout_prob self.lstm_cells = [] for i in range(num_layers): self.lstm_cells.append( layers.LSTMCell( hidden_size, forget_bias=forget_bias, param_attr=fluid.ParamAttr( initializer=fluid.initializer. UniformInitializer(low=-init_scale, high=init_scale)))) def call(self, step_input, states): new_states = [] for i in range(self.num_layers): out, new_state = self.lstm_cells[i](step_input, states[i]) step_input = layers.dropout( out, self.dropout_prob, dropout_implementation='upscale_in_train' ) if self.dropout_prob > 0 else out new_states.append(new_state) return step_input, new_states cell = EncoderCell(num_layers, hidden_size, dropout) output, new_states = layers.rnn( cell, inputs=input_embedding, initial_states=[[hidden, cell] for hidden, cell in zip([ layers.reshape(init_hidden, shape=[-1, hidden_size]) for init_hidden in layers.split( init_hiddens, num_or_sections=num_layers, dim=0) ], [ layers.reshape(init_cell, shape=[-1, hidden_size]) for init_cell in layers.split( init_cells, num_or_sections=num_layers, dim=0) ])], time_major=False) last_hidden = layers.stack([hidden for hidden, _ in new_states], 0) last_cell = layers.stack([cell for _, cell in new_states], 0) return output, last_hidden, last_cell
def forward_grad(self, x):
    grad_x = self.conv4u(layers.pad(x, (0, 0, 0, 0, 0, 0, 0, 1)))
    tmp = layers.unstack(grad_x, axis=2)
    tmp[-1] = tmp[-1] - tmp[-1]  # tmp[-1] = 0
    grad_x = layers.stack(tmp, axis=2)

    grad_y = self.conv4v(layers.pad(x, (0, 0, 0, 0, 0, 1, 0, 0)))
    tmp = layers.unstack(grad_y, axis=2)
    tmp[-1] = tmp[-1] - tmp[-1]  # tmp[-1] = 0
    grad_y = layers.stack(tmp, axis=2)
    return grad_x, grad_y
def loop_body(i,
              mel_input,
              outputs,
              hiddens,
              attentions,
              state=None,
              coeffs=None):
    # state is None and coeffs is None for the first step
    decoded, hidden, new_coeffs, new_state = self.decoder(
        mel_input, keys, values, text_lengths, i, speaker_embed, state,
        force_monotonic_attention, coeffs, window)
    new_coeffs = F.stack(new_coeffs)  # (N, B, T_dec=1, T_enc)
    attentions.append(new_coeffs)  # (N, B, T_dec=1, T_enc)
    outputs.append(decoded)  # (B, T_dec=1, r * C_mel)
    hiddens.append(hidden)  # (B, T_dec=1, C_dec)

    # slice the last frame out of the r generated frames to be used as the
    # input for the next step
    batch_size = mel_input.shape[0]
    frames = F.reshape(decoded, [
        batch_size, -1, self.decoder.reduction_factor,
        self.decoder.in_channels
    ])
    input_frame = frames[:, :, -1, :]
    return (i + 1, input_frame, outputs, hiddens, attentions, new_state,
            new_coeffs)
def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks.

    The masks should be in format [N, H, W] where N is the number of masks
    and (H, W) are the spatial dimensions.

    Returns a [N, 4] tensor, with the boxes in xyxy format.
    """
    if np.sum(masks.shape) == 0:
        return dg.to_variable(np.zeros((0, 4)))

    h, w = masks.shape[-2:]
    y = dg.to_variable(np.arange(0, h, 1, dtype="float32"))
    x = dg.to_variable(np.arange(0, w, 1, dtype="float32"))
    y, x = T.meshgrid([y, x])  # [h, w]

    x_mask = (masks * L.unsqueeze(x, [0]))  # [N, H, W]
    x_max = L.reduce_max(L.flatten(x_mask, axis=1), dim=-1)
    non_mask = dg.to_variable(~masks.numpy())
    x_mask[non_mask] = 1e8
    x_min = L.reduce_min(L.flatten(x_mask, axis=1), dim=-1)

    y_mask = (masks * L.unsqueeze(y, [0]))  # [N, H, W]
    y_max = L.reduce_max(L.flatten(y_mask, axis=1), dim=-1)
    y_mask[non_mask] = 1e8
    y_min = L.reduce_min(L.flatten(y_mask, axis=1), dim=-1)

    return L.stack([x_min, y_min, x_max, y_max], 1)
def __call__(self, msg):
    alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
    if attn_drop:
        old_h = alpha
        dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
        u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'),
                             min=0.,
                             max=1.)
        keeped = L.cast(u > dropout, dtype="float32")
        self_attn_mask = L.scale(x=keeped,
                                 scale=10000.0,
                                 bias=-1.0,
                                 bias_after_scale=False)
        n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads,
                                        axis=1)
        n_head_self_attn_mask.stop_gradient = True
        alpha = n_head_self_attn_mask + alpha
        alpha = L.lod_reset(alpha, old_h)

    h = msg["v"]
    alpha = paddle_helper.sequence_softmax(alpha)
    self.alpha = alpha
    old_h = h
    h = h * alpha
    h = L.lod_reset(h, old_h)
    h = L.sequence_pool(h, "sum")
    if concat:
        h = L.reshape(h, [-1, num_heads * hidden_size])
    else:
        h = L.reduce_mean(h, dim=1)
    return h
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):
    input_shape = L.shape(input_mask)
    input_batch = input_shape[0]
    input_seqlen = input_shape[1]
    num_slot = input_seqlen / slot_seqlen
    num_b = num_slot - 1
    ones = L.ones([num_b], dtype="float32")  # [num_b]
    diag_ones = L.diag(ones)  # [num_b, num_b]
    diag_ones = L.unsqueeze(diag_ones, [1, -1])  # [num_b, 1, num_b, 1]
    diag_ones = L.expand(
        diag_ones,
        [1, slot_seqlen, 1, slot_seqlen])  # [num_b, seqlen, num_b, seqlen]
    diag_ones = L.reshape(
        diag_ones,
        [1, num_b * slot_seqlen,
         num_b * slot_seqlen])  # [1, num_b*seqlen, num_b*seqlen]

    graph_attn_bias = L.concat([
        L.ones([1, num_b * slot_seqlen, slot_seqlen], dtype="float32"),
        diag_ones
    ], 2)
    graph_attn_bias = L.concat([
        L.ones([1, slot_seqlen, num_slot * slot_seqlen], dtype="float32"),
        graph_attn_bias
    ], 1)  # [1, seq, seq]

    pad_attn_bias = L.matmul(input_mask, input_mask,
                             transpose_y=True)  # [batch, seq, seq]
    attn_bias = graph_attn_bias * pad_attn_bias
    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
def build_attn_bias(input_mask, n_head, dtype):
    attn_bias = L.matmul(input_mask, input_mask,
                         transpose_y=True)  # [batch, seq, seq]
    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
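# A minimal usage sketch for build_attn_bias (hypothetical shapes; assumes L is an
# alias for paddle.fluid.layers, as in the snippet above). The input mask is
# [batch, seq, 1] with 1.0 at real tokens, so an all-ones mask yields an
# all-zero bias of shape [batch, n_head, seq, seq].
input_mask = L.ones([2, 5, 1], dtype="float32")
attn_bias = build_attn_bias(input_mask, n_head=4, dtype="float32")  # [2, 4, 5, 5]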
def forward(self, tensor_list: NestedTensor):
    x = tensor_list.tensors
    mask = tensor_list.mask
    assert mask is not None
    bs, h, w = mask.shape
    mask = mask.numpy()
    not_mask = ~mask
    not_mask = dg.to_variable(not_mask).astype('float32')
    y_embed = L.cumsum(not_mask, axis=1)  # [batch_size, h, w]
    x_embed = L.cumsum(not_mask, axis=2)  # [batch_size, h, w]
    if self.normalize:
        eps = 1e-6
        y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
        x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

    dim_t = (np.arange(0, self.num_pos_feats, 1,
                       dtype="float32"))  # [num_pos_feats]
    dim_t = self.temperature**(2 * (dim_t // 2) /
                               self.num_pos_feats)  # [num_pos_feats]
    dim_t = dg.to_variable(dim_t)

    x_embed = L.unsqueeze(x_embed, 3)  # [batch_size, h, w, 1]
    y_embed = L.unsqueeze(y_embed, 3)  # [batch_size, h, w, 1]
    pos_x = x_embed / dim_t  # [batch_size, h, w, num_pos_feats]
    pos_y = y_embed / dim_t  # [batch_size, h, w, num_pos_feats]
    pos_x_1 = L.sin(pos_x[:, :, :, 0::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_x_2 = L.cos(pos_x[:, :, :, 1::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_y_1 = L.sin(pos_y[:, :, :, 0::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_y_2 = L.cos(pos_y[:, :, :, 1::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_x = L.reshape(L.stack([pos_x_1, pos_x_2], axis=4),
                      (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]
    pos_y = L.reshape(L.stack([pos_y_1, pos_y_2], axis=4),
                      (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]

    pos = L.concat((pos_y, pos_x), axis=3)  # [batch_size, h, w, num_pos_feats * 2]
    pos = L.transpose(pos, perm=(0, 3, 1, 2))  # [batch_size, num_pos_feats * 2, h, w]
    return pos
def update_loss_scale(grads):
    state = mixed_precision_global_state()
    if state is None or not state.dynamic_scaling:
        return
    per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads])
    grad_valid = layers.isfinite(per_grad_check)
    layers.cond(grad_valid, lambda: state.increment(),
                lambda: state.decrement())
    return grad_valid
def _gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None): token_emb_out = layers.embedding( input=token_ids, size=[self.vocab_size, self.emb_size], dtype=self.dtype, param_attr=fluid.ParamAttr(name=self.token_emb_name, initializer=self.param_initializer)) type_emb_out = layers.embedding( input=type_ids, size=[self.type_size, self.emb_size], dtype=self.dtype, param_attr=fluid.ParamAttr(name=self.type_emb_name, initializer=self.param_initializer)) pos_emb_out = layers.embedding( input=pos_ids, size=[self.max_position_seq_len, self.emb_size], dtype=self.dtype, param_attr=fluid.ParamAttr(name=self.pos_emb_name, initializer=self.param_initializer)) emb_out = token_emb_out + type_emb_out + pos_emb_out # auxiliary memory embeddings if aux_emb is not None: emb_out = layers.concat([aux_emb, emb_out], axis=1) # post process of embedding emb_out = pre_process_layer(emb_out, self.pre_encoder_cmd, self.prepostprocess_dropout, name="pre_encoder", epsilon=self.epsilon) if self.emb_mapping_in: emb_out = layers.fc(input=emb_out, num_flatten_dims=2, size=self.hidden_size, param_attr=fluid.ParamAttr( name="emb_hidden_mapping", initializer=self.param_initializer), bias_attr="emb_hidden_mapping_bias") # generate n-head self-attention mask self_attn_mask = input_mask self_attn_mask = layers.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False) n_head_self_attn_mask = layers.stack(x=[self_attn_mask] * self.n_head, axis=1) n_head_self_attn_mask.stop_gradient = True return emb_out, n_head_self_attn_mask
def pad(self, input_ele):
    max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
    out_list = []
    for i in range(len(input_ele)):
        pad_len = max_len - input_ele[i].shape[0]
        one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
                                      pad_value=0.0)
        out_list.append(one_batch_padded)
    out_padded = layers.stack(out_list)
    return out_padded
def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos=None, query_pos=None): output = tgt intermediate = [] assert tgt_mask is None, "Not implement compute tgt_mask's attn_mask." if memory_mask is not None: bs, tgt_length = tgt.shape[:2] memory_length = memory.shape[1] attn_mask = L.zeros([bs, tgt_length, memory_length], dtype="float32") memory_mask = L.expand( L.unsqueeze(memory_mask, [1]), (1, tgt_length, 1)) # [bs, tgt_length, memory_length] attn_mask = attn_mask.numpy() memory_mask = memory_mask.numpy() attn_mask[memory_mask] = -1e8 attn_mask = dg.to_variable(attn_mask) attn_mask = L.expand(L.unsqueeze(attn_mask, [1]), (1, self.nhead, 1, 1)) # [bs, nhead, tgt_length, memory_length] memory_mask = attn_mask attention_weight = [] for layer in self.layers: output, self_attn_weights, multihead_attn_weights = layer( output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, pos=pos, query_pos=query_pos) attention_weight.append( (self_attn_weights, multihead_attn_weights)) if self.return_intermediate: intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) if self.return_intermediate: intermediate.pop() intermediate.append(output) if self.return_intermediate: return L.stack(intermediate), attention_weight return L.unsqueeze(output, [0]), attention_weight
def forward(self, x, seq_mask, pad_index, hx=None): """Forward network""" x, batch_sizes, sorted_indices = self.pack_padded_sequence( x, seq_mask, pad_index) _, unsorted_indices = layers.argsort(sorted_indices) batch_size = batch_sizes[0] h_n, c_n = [], [] if hx is None: ih = layers.zeros(shape=(self.num_layers * 2, batch_size, self.hidden_size), dtype=x[0].dtype) h, c = ih, ih else: h, c = self.permute_hidden(hx, sorted_indices) h = layers.reshape(h, shape=(self.num_layers, 2, -1, self.hidden_size)) c = layers.reshape(c, shape=(self.num_layers, 2, -1, self.hidden_size)) for i in range(self.num_layers): x = layers.split(x, batch_sizes, dim=0) if self.training and self.dropout > 0: mask = SharedDropout.get_mask(x[0], self.dropout) x = [j * mask[:len(j)] for j in x] x_f, (h_f, c_f) = self.layer_forward(x=x, hx=(h[i, 0], c[i, 0]), cell=self.f_cells[i], batch_sizes=batch_sizes) x_b, (h_b, c_b) = self.layer_forward(x=x, hx=(h[i, 1], c[i, 1]), cell=self.b_cells[i], batch_sizes=batch_sizes, reverse=True) x = layers.concat((x_f, x_b), axis=-1) h_n.append(layers.stack((h_f, h_b))) c_n.append(layers.stack((c_f, c_b))) x = self.pad_packed_sequence(x, batch_sizes, unsorted_indices) hx = layers.concat(h_n, axis=0), layers.concat(c_n, axis=0) hx = self.permute_hidden(hx, unsorted_indices) return x, hx
def update_loss_scale(grads):
    state = mixed_precision_global_state()
    if state is None or not state.dynamic_scaling:
        return
    per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads])
    grad_valid = layers.isfinite(per_grad_check)
    with layers.Switch() as switch:
        with switch.case(grad_valid):
            state.increment()
        with switch.default():
            state.decrement()
    return grad_valid
def _ranking(self, inputs, predictions): """ Reranking generated responses. """ src_token = inputs["src_token"] src_mask = inputs["src_mask"] src_pos = inputs["src_pos"] src_type = inputs["src_type"] src_turn = inputs["src_turn"] src_embed = self.embedder(src_token, src_pos, src_type, src_turn) batch_size, num_latent, tgt_seq_len = predictions.shape # shape: [batch_size, num_latent, seq_len, 1] preds_token = F.unsqueeze(predictions, [3]) preds_mask = F.not_equal(preds_token, self.padding_idx, "int64") preds_pos = layers.range(0, tgt_seq_len, 1, dtype="float32") preds_pos = F.unsqueeze(preds_pos, [0, 0, 1]) preds_pos = layers.expand(preds_pos, [batch_size, num_latent, 1, 1]) preds_pos = layers.cast(preds_pos, "int64") preds_type = layers.zeros_like(preds_token) preds_turn = layers.zeros_like(preds_token) scores = [] for i in range(num_latent): pred_token = preds_token[:, i] pred_mask = preds_mask[:, i] pred_pos = preds_pos[:, i] pred_type = preds_type[:, i] pred_turn = preds_turn[:, i] input_mask = layers.concat([src_mask, pred_mask], axis=1) input_mask.stop_gradient = True pred_embed = self.embedder(pred_token, pred_pos, pred_type, pred_turn) embed = layers.concat([src_embed, pred_embed], axis=1) embed = self.embed_layer_norm(embed) mask_embed = self.mask_embed mask_embed = layers.expand(mask_embed, [batch_size, 1, 1]) mask_embed = self.embed_layer_norm(mask_embed) out = layers.concat([mask_embed, embed], axis=1) mask = self._create_mask(input_mask, append_head=True) for layer in self.layers: out = layer(out, mask, None) mask_embed = out[:, 0] score = self.discriminator(mask_embed) scores.append(score[:, 0]) scores = layers.stack(scores, axis=1) return scores
def pad_packed_sequence(self, x, batch_sizes, unsorted_indices):
    """Pads a packed sequence."""
    h_size = x.shape[1]
    split_x = layers.split(x, batch_sizes, dim=0)
    max_bs = batch_sizes[0]
    step_embs = []
    for step, cur_bs in enumerate(batch_sizes):
        pad_emb = layers.zeros(shape=(max_bs - cur_bs, h_size),
                               dtype=x.dtype)
        step_emb = layers.concat(input=(split_x[step], pad_emb))
        step_embs.append(step_emb)
    new_x = layers.stack(step_embs, axis=1)
    new_x = layers.index_select(new_x, unsorted_indices)
    return new_x
def forward(self, x, adj):
    """Forward network"""
    x = layers.dropout(x, self.dropout)
    if self.layer == 1:
        x = layers.stack([att.forward(x, adj) for att in self.attentions],
                         dim=2)
        x = layers.reduce_sum(x, 2)
        x = layers.dropout(x, self.dropout)
        return layers.log_softmax(x, axis=2)
    else:
        x = layers.concat([att.forward(x, adj) for att in self.attentions],
                          axis=2)
        x = layers.dropout(x, self.dropout)
        return self.out_att.forward(x, adj)
def pad_sequence_paddle(sequences, padding_value=0):
    """Fill sequences (variable) into a fixed-length matrix."""
    max_size = sequences[0].shape
    trailing_dims = max_size[1:]
    max_len = max([s.shape[0] for s in sequences])
    out_tensor = []
    for tensor in sequences:
        length = tensor.shape[0]
        pad_tensor = layers.concat(
            (tensor,
             layers.fill_constant((max_len - length, *trailing_dims),
                                  dtype=tensor.dtype,
                                  value=padding_value)))
        out_tensor.append(pad_tensor)
    out_tensor = layers.stack(out_tensor)
    return out_tensor
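# A minimal usage sketch for pad_sequence_paddle (assumes dygraph mode and the
# np / dg (paddle.fluid.dygraph) / layers aliases used above; shapes are
# illustrative only). Shorter sequences are padded with the padding value.
seqs = [
    dg.to_variable(np.ones((3, 4), dtype="float32")),
    dg.to_variable(np.ones((5, 4), dtype="float32")),
]
padded = pad_sequence_paddle(seqs)  # shape: [2, 5, 4]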
def teacher_forced_train(self, keys, values, text_lengths, speaker_embed,
                         mel):
    # build decoder inputs by shifting over by one frame and adding an
    # all-zero <start> frame; the mel input is downsampled by a reduction factor
    batch_size = mel.shape[0]
    mel_input = F.reshape(mel, (batch_size, -1,
                                self.decoder.reduction_factor,
                                self.decoder.in_channels))
    zero_frame = F.zeros((batch_size, 1, self.decoder.in_channels),
                         dtype="float32")
    # downsample mel input as a regularization
    mel_input = F.concat([zero_frame, mel_input[:, :-1, -1, :]], axis=1)

    # decoder
    decoded, hidden, attentions, final_state = self.decoder(
        mel_input, keys, values, text_lengths, 0, speaker_embed)
    attentions = F.stack(attentions)  # (N, B, T_dec, T_enc)
    # unfold frames
    decoded = F.reshape(decoded,
                        (batch_size, -1, self.decoder.in_channels))
    # postnet
    refined = self.postnet(hidden, speaker_embed)
    return decoded, refined, attentions, final_state
def pick_image(images, idx):
    """Pick the image among images according to idx.

    Args:
        images: (B x N x C x H x W), N images.
        idx: (B, ) indices to select.
    """
    if type(images) == list:
        return [pick_image(r, idx) for r in images]
    if idx is None:
        return images[:, 0]
    elif type(idx) == int:
        return images[:, idx]

    idx = idx.astype('long').numpy()
    images = L.stack(
        [images[i][int(idx[i])] for i in range(images.shape[0])])
    return images
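# A minimal usage sketch for pick_image (assumes the np / dg aliases as above;
# here a fixed image index is picked for every item in the batch).
images = dg.to_variable(np.random.rand(2, 3, 3, 8, 8).astype("float32"))  # B=2, N=3
first = pick_image(images, None)  # [2, 3, 8, 8], image 0 of every batch item
second = pick_image(images, 1)    # [2, 3, 8, 8], image 1 of every batch item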
def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 np.random.seed(2) x = layers.assign( np.random.rand(batch_size, beam_size, 32).astype("float32")) indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices") step_idx = layers.fill_constant( shape=[1], dtype="int64", value=0, force_cpu=True) max_len = layers.fill_constant( shape=[1], dtype="int64", value=10, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) scores = layers.array_write(x, step_idx) with while_op.block(): bs = layers.cast(layers.shape(x)[0], "int64") for _ in range(20): bs = layers.cast(bs, 'int64') bs.stop_gradient = stop_gradient batch_pos = layers.expand( layers.unsqueeze( layers.range( 0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size]) topk_coordinates = layers.stack([batch_pos, indices], axis=2) topk_coordinates.stop_gradient = stop_gradient score = layers.gather_nd(x, topk_coordinates) layers.increment(x=step_idx, value=1.0, in_place=True) layers.array_write(score, i=step_idx, array=scores) length_cond = layers.less_than(x=step_idx, y=max_len) layers.assign(length_cond, cond) out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0] loss = layers.reduce_mean(out) opt = fluid.optimizer.Adam(0.01) opt.minimize(loss) exe = fluid.Executor(place) data = np.random.random_integers( low=0, high=beam_size - 1, size=(batch_size, beam_size)).astype("int64") loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss]) return loss_val
def forward(self, indices, speaker_position_rate=None):
    """
    Args:
        indices (Variable): shape (B, T), dtype: int64, position indices,
            where B means the batch size, T means the time steps.
        speaker_position_rate (Variable | float, optional): position rate.
            It can be a floating point number or a Variable with shape (1,),
            then this speaker_position_rate is used for every example. It
            can also be a Variable with shape (B, ), which contains a
            speaker position rate for each utterance.
    Returns:
        out (Variable): shape (B, T, C_pos), dtype float32, position
            embedding, where C_pos means position embedding size.
    """
    batch_size, time_steps = indices.shape

    # convert speaker_position_rate to a Variable with shape (B, )
    if isinstance(speaker_position_rate, float):
        speaker_position_rate = dg.to_variable(
            np.array([speaker_position_rate]).astype("float32"))
        speaker_position_rate = F.expand(speaker_position_rate,
                                         [batch_size])
    elif isinstance(speaker_position_rate, fluid.framework.Variable) \
            and list(speaker_position_rate.shape) == [1]:
        speaker_position_rate = F.expand(speaker_position_rate,
                                         [batch_size])
    assert len(speaker_position_rate.shape) == 1 and \
        list(speaker_position_rate.shape) == [batch_size]

    weight = compute_position_embedding(self.weight,
                                        speaker_position_rate)  # (B, V, C)
    # make indices for gather_nd
    batch_id = F.expand(
        F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]),
        [1, time_steps])
    # (B, T, 2)
    gather_nd_id = F.stack([batch_id, indices], -1)
    out = F.gather_nd(weight, gather_nd_id)
    return out
def crop(x, audio_start, audio_length):
    """Crop the upsampled condition to match audio_length.

    The upsampled condition has the same time steps as the whole audio does.
    But since audios are sliced to 0.5 seconds randomly while conditions are
    not, upsampled conditions should also be sliced to exactly match the time
    steps of the audio slice.

    Args:
        x (Variable): shape(B, C, T), dtype float32, the upsampled condition.
        audio_start (Variable): shape(B, ), dtype: int64, the index of the
            starting point.
        audio_length (int): the length of the audio (number of samples it
            contains).

    Returns:
        Variable: shape(B, C, audio_length), cropped condition.
    """
    # crop each example in the batch
    slices = []
    starts = audio_start.numpy()
    for i in range(x.shape[0]):
        start = starts[i]
        end = start + audio_length
        slice = F.slice(x[i], axes=[1], starts=[start], ends=[end])
        slices.append(slice)
    out = F.stack(slices)
    return out
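# A minimal usage sketch for crop (assumes the np / dg / F aliases as above).
# Each example keeps audio_length condition frames starting at its own offset.
condition = dg.to_variable(np.random.rand(2, 80, 100).astype("float32"))  # (B, C, T)
audio_start = dg.to_variable(np.array([10, 20], dtype="int64"))
cropped = crop(condition, audio_start, audio_length=32)  # shape: [2, 80, 32]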
def forward(self, outputs, target_sizes):
    """Perform the computation.

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the
            size of each image. For evaluation, this must be the original
            image size (before any data augmentation). For visualization,
            this should be the image size after data augmentation, but
            before padding.
    """
    out_logits, out_bbox = outputs["pred_logits"], outputs["pred_boxes"]

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = L.softmax(out_logits, -1)  # [bs, num_queries, num_classes + 1]
    labels = L.argmax(prob[:, :, :], axis=-1)  # [bs, num_queries]
    scores = L.reduce_max(prob, dim=-1)  # [bs, num_queries]

    # convert to [x0, y0, x1, y1] format
    bs, num_queries, _ = out_bbox.shape
    out_bbox = L.reshape(out_bbox, (-1, 4))
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    boxes = L.reshape(boxes, (bs, num_queries, 4))
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes[:, 0], target_sizes[:, 1]
    scale_fct = L.stack([img_w, img_h, img_w, img_h], 1)  # [bs, 4]
    scale_fct = L.expand(L.unsqueeze(scale_fct, [1]), (1, num_queries, 1))
    boxes = boxes * scale_fct

    results = [{
        'scores': s,
        'labels': l,
        'boxes': b
    } for s, l, b in zip(scores.numpy(), labels.numpy(), boxes.numpy())]
    return results
def beam_search_infilling(model, q_ids, q_sids, sos_id, eos_id, attn_id, max_encode_len=640, max_decode_len=100, beam_width=5, tgt_type_id=3, length_penalty=1.0): model.eval() _, __, info = model(q_ids, q_sids) d_batch, d_seqlen = q_ids.shape state = BeamSearchState(log_probs=L.zeros([d_batch, beam_width], 'float32'), lengths=L.zeros([d_batch, beam_width], 'int64'), finished=L.zeros([d_batch, beam_width], 'int64')) outputs = [] def reorder_(t, parent_id): """reorder cache according to parent beam id""" gather_idx = L.where(parent_id != -1)[:, 0] * beam_width + L.reshape( parent_id, [-1]) t = L.gather(t, gather_idx) return t def tile_(t, times): _shapes = list(t.shape[1:]) ret = L.reshape( L.expand(L.unsqueeze(t, [1]), [ 1, times, ] + [ 1, ] * len(_shapes)), [ -1, ] + _shapes) return ret cached_k, cached_v = info['caches'] cached_k = [tile_(k, beam_width) for k in cached_k] cached_v = [tile_(v, beam_width) for v in cached_v] past_cache = (cached_k, cached_v) q_ids = tile_(q_ids, beam_width) seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True) cls_ids = L.ones([d_batch * beam_width], dtype='int64') * sos_id attn_ids = L.ones([d_batch * beam_width], dtype='int64') * attn_id # SOS ids = L.stack([cls_ids, attn_ids], -1) for step in range(max_decode_len): bias = gen_bias(q_ids, ids, step) pos_ids = D.to_variable( np.tile(np.array([[step, step + 1]], dtype=np.int64), [d_batch * beam_width, 1])) pos_ids += seqlen _, logits, info = model(ids, L.ones_like(ids) * tgt_type_id, pos_ids=pos_ids, attn_bias=bias, past_cache=past_cache) output, state = beam_search_step(state, logits[:, 1], eos_id=eos_id, beam_width=beam_width, is_first_step=(step == 0), length_penalty=length_penalty) outputs.append(output) past_cached_k, past_cached_v = past_cache cached_k, cached_v = info['caches'] cached_k = [ reorder_(L.concat([pk, k[:, :1, :]], 1), output.beam_parent_ids) for pk, k in zip(past_cached_k, cached_k) ] # concat cached cached_v = [ reorder_(L.concat([pv, v[:, :1, :]], 1), output.beam_parent_ids) for pv, v in zip(past_cached_v, cached_v) ] past_cache = (cached_k, cached_v) pred_ids_flatten = L.reshape(output.predicted_ids, [d_batch * beam_width]) ids = L.stack([pred_ids_flatten, attn_ids], 1) if state.finished.numpy().all(): break final_ids = L.stack([o.predicted_ids for o in outputs], 0) final_parent_ids = L.stack([o.beam_parent_ids for o in outputs], 0) final_ids = L.gather_tree(final_ids, final_parent_ids)[:, :, 0] # pick best beam final_ids = L.transpose(L.reshape(final_ids, [-1, d_batch * 1]), [1, 0]) return final_ids
def beam_search(self, src_word, src_pos, src_slf_attn_bias, trg_word, trg_src_attn_bias, bos_id=0, eos_id=1, beam_size=4, max_len=256): def expand_to_beam_size(tensor, beam_size): tensor = layers.reshape(tensor, [tensor.shape[0], 1] + tensor.shape[1:]) tile_dims = [1] * len(tensor.shape) tile_dims[1] = beam_size return layers.expand(tensor, tile_dims) def merge_batch_beams(tensor): return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] + tensor.shape[2:]) def split_batch_beams(tensor): return fluid.layers.reshape(tensor, shape=[-1, beam_size] + list(tensor.shape[1:])) def mask_probs(probs, finished, noend_mask_tensor): # TODO: use where_op finished = layers.cast(finished, dtype=probs.dtype) probs = layers.elementwise_mul(layers.expand( layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]), noend_mask_tensor, axis=-1) - layers.elementwise_mul( probs, (finished - 1), axis=0) return probs def gather(x, indices, batch_pos): topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2) return layers.gather_nd(x, topk_coordinates) # run encoder enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias) # constant number inf = float(1. * 1e7) batch_size = enc_output.shape[0] max_len = (enc_output.shape[1] + 20) if max_len is None else max_len vocab_size_tensor = layers.fill_constant(shape=[1], dtype="int64", value=self.trg_vocab_size) end_token_tensor = to_variable( np.full([batch_size, beam_size], eos_id, dtype="int64")) noend_array = [-inf] * self.trg_vocab_size noend_array[eos_id] = 0 noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32")) batch_pos = layers.expand( layers.unsqueeze( to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]), [1, beam_size]) predict_ids = [] parent_ids = [] ### initialize states of beam search ### log_probs = to_variable( np.array([[0.] 
+ [-inf] * (beam_size - 1)] * batch_size, dtype="float32")) finished = to_variable( np.full([batch_size, beam_size], 0, dtype="bool")) ### initialize inputs and states of transformer decoder ### ## init inputs for decoder, shaped `[batch_size*beam_size, ...]` trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id) trg_pos = layers.zeros_like(trg_word) trg_src_attn_bias = merge_batch_beams( expand_to_beam_size(trg_src_attn_bias, beam_size)) enc_output = merge_batch_beams( expand_to_beam_size(enc_output, beam_size)) ## init states (caches) for transformer, need to be updated according to selected beam caches = [{ "k": layers.fill_constant( shape=[batch_size * beam_size, self.n_head, 0, self.d_key], dtype=enc_output.dtype, value=0), "v": layers.fill_constant( shape=[batch_size * beam_size, self.n_head, 0, self.d_value], dtype=enc_output.dtype, value=0), } for i in range(self.n_layer)] for i in range(max_len): trg_pos = layers.fill_constant(shape=trg_word.shape, dtype="int64", value=i) caches = map_structure( # can not be reshaped since the 0 size lambda x: x if i == 0 else merge_batch_beams(x), caches) logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, enc_output, caches) caches = map_structure(split_batch_beams, caches) step_log_probs = split_batch_beams( fluid.layers.log(fluid.layers.softmax(logits))) step_log_probs = mask_probs(step_log_probs, finished, noend_mask_tensor) log_probs = layers.elementwise_add(x=step_log_probs, y=log_probs, axis=0) log_probs = layers.reshape(log_probs, [-1, beam_size * self.trg_vocab_size]) scores = log_probs topk_scores, topk_indices = fluid.layers.topk(input=scores, k=beam_size) beam_indices = fluid.layers.elementwise_floordiv( topk_indices, vocab_size_tensor) token_indices = fluid.layers.elementwise_mod( topk_indices, vocab_size_tensor) # update states caches = map_structure( lambda x: gather(x, beam_indices, batch_pos), caches) log_probs = gather(log_probs, topk_indices, batch_pos) finished = gather(finished, beam_indices, batch_pos) finished = layers.logical_or( finished, layers.equal(token_indices, end_token_tensor)) trg_word = layers.reshape(token_indices, [-1, 1]) predict_ids.append(token_indices) parent_ids.append(beam_indices) if layers.reduce_all(finished).numpy(): break predict_ids = layers.stack(predict_ids, axis=0) parent_ids = layers.stack(parent_ids, axis=0) finished_seq = layers.transpose( layers.gather_tree(predict_ids, parent_ids), [1, 2, 0]) finished_scores = topk_scores return finished_seq, finished_scores
def forward(self): """Build the GATNE net. """ param_attr_init = fluid.initializer.Uniform( low=-1.0, high=1.0, seed=np.random.randint(100)) embed_param_attrs = fluid.ParamAttr(name='Base_node_embed', initializer=param_attr_init) # node_embeddings base_node_embed = fl.embedding( input=fl.reshape(self.train_inputs, shape=[-1, 1]), size=[self.num_nodes, self.embedding_size], param_attr=embed_param_attrs) node_features = [] for edge_type in self.edge_types: param_attr_init = fluid.initializer.Uniform( low=-1.0, high=1.0, seed=np.random.randint(100)) embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' % edge_type, initializer=param_attr_init) features = fl.embedding( input=self.gw[edge_type].node_feat['index'], size=[self.num_nodes, self.embedding_u_size], param_attr=embed_param_attrs) node_features.append(features) # mp_output: list of embedding(self.num_nodes, dim) mp_output = self.message_passing(self.gw, self.edge_types, node_features) # U : (num_type[m], num_nodes, dim[s]) node_type_embed = fl.stack(mp_output, axis=0) # U : (num_nodes, num_type[m], dim[s]) node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2]) #gather node_type_embed from train_inputs node_type_embed = fl.gather(node_type_embed, self.train_inputs) # M_r trans_weights = fl.create_parameter( shape=[ self.edge_type_count, self.embedding_u_size, self.embedding_size // self.att_head ], attr=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)), dtype='float32', name='trans_w') # W_r trans_weights_s1 = fl.create_parameter( shape=[self.edge_type_count, self.embedding_u_size, self.dim_a], attr=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)), dtype='float32', name='trans_w_s1') # w_r trans_weights_s2 = fl.create_parameter( shape=[self.edge_type_count, self.dim_a, self.att_head], attr=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)), dtype='float32', name='trans_w_s2') trans_w = fl.gather(trans_weights, self.train_types) trans_w_s1 = fl.gather(trans_weights_s1, self.train_types) trans_w_s2 = fl.gather(trans_weights_s2, self.train_types) attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2) node_type_embed = fl.matmul(attention, node_type_embed) node_embed = base_node_embed + fl.reshape( fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size]) self.last_node_embed = fl.l2_normalize(node_embed, axis=1) nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)) nce_weight_attrs = fluid.ParamAttr(name='nce_weight', initializer=nce_weight_initializer) weight_pos = fl.embedding(input=self.train_labels, size=[self.num_nodes, self.embedding_size], param_attr=nce_weight_attrs) weight_neg = fl.embedding(input=self.train_negs, size=[self.num_nodes, self.embedding_size], param_attr=nce_weight_attrs) tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1]) pos_logits = fl.matmul(tmp_node_embed, weight_pos, transpose_y=True) # [B, 1, 1] neg_logits = fl.matmul(tmp_node_embed, weight_neg, transpose_y=True) # [B, 1, neg_num] pos_score = fl.squeeze(pos_logits, axes=[1]) pos_score = fl.clip(pos_score, min=-10, max=10) pos_score = -1.0 * fl.logsigmoid(pos_score) neg_score = fl.squeeze(neg_logits, axes=[1]) neg_score = fl.clip(neg_score, min=-10, max=10) neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score) neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True) self.loss = fl.reduce_mean(pos_score + 
neg_score)
def plot_results(self, samples, outputs, targets): # samples: [batch_size, 3, H, W] samples = [sample.numpy() for sample in samples] target_sizes = L.stack([t["size"] for t in targets], 0) results = self.postprocessor(outputs, target_sizes) for i, item in enumerate(zip(samples, results, targets)): image, result, target = item image = np.transpose(image, (1, 2, 0)) std = np.array([0.229, 0.224, 0.225]) mean = np.array([0.485, 0.456, 0.406]) image = (image * std + mean) * 255 image = image.astype(np.uint8)[:, :, ::-1] # RGB -> BGR targ_image = image.copy() pred_img = image.copy() colors = [ (0, 0, 255), (0, 255, 0), (255, 0, 0), (255, 255, 0), (255, 0, 255), (0, 255, 255), (0, 0, 128), (0, 128, 0), (128, 0, 0), (128, 128, 0), (128, 0, 128), (0, 128, 128), ] rect_num = len(target["boxes"]) colors = colors * math.ceil(rect_num / 12) h, w = target["size"].numpy() for i, item in enumerate(zip(target["labels"], target["boxes"])): l, box = item color = colors[i] box = L.unsqueeze(box, [0]) box = box_cxcywh_to_xyxy(box) # [1, 4] box = L.squeeze(box, [0]) # [4] box = (box.numpy() * np.array([w, h, w, h])).astype(np.int) left_top, bottom_down = (box[0], box[1]), (box[2], box[3]) cv2.rectangle(targ_image, left_top, bottom_down, color, 2) l = l.numpy()[0] if isinstance(self.label_to_text, dict): label_name = self.label_to_text.get(str(l), str(l)) else: if l < len(self.label_to_text): label_name = self.label_to_text[l] else: label_name = str(l) cv2.putText(targ_image, label_name, left_top, cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) rect_num = len(result["labels"]) colors = colors * math.ceil(rect_num / 12) for i, item in enumerate( zip(result["scores"], result["labels"], result["boxes"])): s, l, box = item if l == self.background: continue color = colors[i] left_top, bottom_down = (box[0], box[1]), (box[2], box[3]) cv2.rectangle(pred_img, left_top, bottom_down, color, 2) if isinstance(self.label_to_text, dict): label_name = self.label_to_text.get(str(l), str(l)) else: if l < len(self.label_to_text): label_name = self.label_to_text[l] else: label_name = str(l) cv2.putText(pred_img, label_name + " [" + str(s)[:4] + "]", left_top, cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) show_image = np.concatenate((targ_image, pred_img), 1) cv2.imwrite( os.path.join(self.output_dir, str(self.index) + ".jpg"), show_image) self.index = (self.index + 1) % self.pool_size