def forward(self, p, q, pad_mask=None):
    """
    Args:
        p (Tensor): the first forward logits of training examples.
        q (Tensor): the second forward logits of training examples.
        pad_mask (Tensor, optional): a bool tensor used to mask out padding
            positions for sequence-level tasks.

    Returns:
        loss (Tensor): the R-Drop loss of p and q.
    """
    p_loss = F.kl_div(
        F.log_softmax(p, axis=-1),
        F.softmax(q, axis=-1),
        reduction=self.reduction)
    q_loss = F.kl_div(
        F.log_softmax(q, axis=-1),
        F.softmax(p, axis=-1),
        reduction=self.reduction)

    # pad_mask is for seq-level tasks
    if pad_mask is not None:
        p_loss = paddle.masked_select(p_loss, pad_mask)
        q_loss = paddle.masked_select(q_loss, pad_mask)

    # Choose "sum" or "mean" depending on your task
    p_loss = p_loss.sum()
    q_loss = q_loss.sum()
    loss = (p_loss + q_loss) / 2
    return loss
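# For orientation, a minimal standalone sketch of the same symmetric-KL
# computation used by R-Drop. The function name, the toy shapes and the
# 'none' reduction below are illustrative assumptions, not values taken
# from the original module.
import paddle
import paddle.nn.functional as F

def rdrop_symmetric_kl(p, q, reduction='none'):
    # KL divergence in both directions between the two stochastic forward passes.
    p_loss = F.kl_div(F.log_softmax(p, axis=-1), F.softmax(q, axis=-1), reduction=reduction)
    q_loss = F.kl_div(F.log_softmax(q, axis=-1), F.softmax(p, axis=-1), reduction=reduction)
    return (p_loss.sum() + q_loss.sum()) / 2

logits_1 = paddle.randn([8, 10])  # first forward pass with dropout enabled
logits_2 = paddle.randn([8, 10])  # second forward pass with dropout enabled
print(rdrop_symmetric_kl(logits_1, logits_2))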
def forward(self, x, y):
    _, z_s = self.ques_encoder(y)
    z_s = paddle.fluid.layers.transpose(z_s, perm=[1, 0, 2])  # [B, 1, I]
    for i in range(self.hop):
        z_s_ = paddle.fluid.layers.expand(
            z_s, expand_times=[1, x.shape[1], 1])  # [B, S, I]
        s = self.FFNs_start[i](paddle.concat([x, z_s_, x * z_s_], axis=2)).squeeze(2)
        p_s = F.softmax(s, axis=1)  # [B, S]
        u_s = p_s.unsqueeze(1).bmm(x)  # [B, 1, I]
        z_e = self.SFUs_start[i](z_s, u_s)  # [B, 1, I]
        z_s_ = paddle.fluid.layers.expand(
            z_s, expand_times=[1, x.shape[1], 1])  # [B, S, I]
        e = self.FFNs_end[i](paddle.concat([x, z_s_, x * z_s_], axis=2)).squeeze(2)
        p_e = F.softmax(e, axis=1)  # [B, S]
        u_e = p_e.unsqueeze(1).bmm(x)  # [B, 1, I]
        z_s = self.SFUs_end[i](z_e, u_e)
    if self.normalize:
        if self.training:
            # In training we output log-softmax for NLL
            p_s = F.log_softmax(s, axis=1)  # [B, S]
            p_e = F.log_softmax(e, axis=1)  # [B, S]
        else:
            # ...otherwise 0-1 probabilities
            p_s = F.softmax(s, axis=1)  # [B, S]
            p_e = F.softmax(e, axis=1)  # [B, S]
    else:
        p_s = s.exp()
        p_e = e.exp()
    return p_s, p_e, z_s
def check_api(self, place=fluid.CPUPlace(), axis=None, dtype=None):
    ref_out = ref_log_softmax(self.x, axis, dtype)
    main_program = fluid.Program()
    mylogsoftmax = nn.LogSoftmax(axis)
    with fluid.program_guard(main_program):
        x = fluid.data(name='x', shape=self.x_shape)
        y = functional.log_softmax(x, axis, dtype)
    exe = fluid.Executor(place)
    out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
    self.assertTrue(np.allclose(out[0], ref_out))

    with fluid.dygraph.guard(place):
        x = fluid.dygraph.to_variable(self.x)
        y = functional.log_softmax(x, axis, dtype)
    self.assertTrue(np.allclose(y.numpy(), ref_out))
def calc_minilm_loss(loss_fct, s, t, attn_mask, num_relation_heads=0):
    # Initialize head_num
    if num_relation_heads > 0 and num_relation_heads != s.shape[1]:
        # s' shape: [bs, seq_len, head_num, head_dim]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
        # s' shape: [bs, seq_len, num_relation_heads, head_dim_new]
        s = tensor.reshape(x=s, shape=[0, 0, num_relation_heads, -1])
        # s' shape: [bs, num_relation_heads, seq_len, head_dim_new]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
    if num_relation_heads > 0 and num_relation_heads != t.shape[1]:
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])
        t = tensor.reshape(x=t, shape=[0, 0, num_relation_heads, -1])
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])

    pad_seq_len = s.shape[2]
    s_head_dim, t_head_dim = s.shape[3], t.shape[3]
    scaled_dot_product_s = tensor.matmul(
        x=s, y=s, transpose_y=True) / math.sqrt(s_head_dim)
    del s
    scaled_dot_product_s += attn_mask
    scaled_dot_product_t = tensor.matmul(
        x=t, y=t, transpose_y=True) / math.sqrt(t_head_dim)
    del t
    scaled_dot_product_t += attn_mask
    loss = loss_fct(
        F.log_softmax(scaled_dot_product_s), F.softmax(scaled_dot_product_t))
    return loss
def check_api(self, axis=-1, dtype=None):
    x = self.x.copy()
    if dtype is not None:
        x = x.astype(dtype)
    ref_out = np.apply_along_axis(ref_log_softmax, axis, x)
    with paddle.static.program_guard(paddle.static.Program()):
        x = paddle.fluid.data(name='x', shape=self.x_shape)
        y = F.log_softmax(x, axis, dtype)
        exe = paddle.static.Executor(self.place)
        out = exe.run(feed={'x': self.x}, fetch_list=[y])
    self.assertTrue(np.allclose(out[0], ref_out))

    paddle.disable_static()
    x = paddle.to_tensor(self.x)
    y = F.log_softmax(x, axis, dtype)
    self.assertTrue(np.allclose(y.numpy(), ref_out))
    paddle.enable_static()
def forward(self, logits, label):
    logits = F.normalize(logits, p=2, axis=1, epsilon=self.eps)
    wn = F.normalize(self.w, p=2, axis=0, epsilon=self.eps)
    cosine = paddle.matmul(logits, wn)
    y = paddle.zeros((logits.shape[0], self.n_classes))
    for i in range(logits.shape[0]):
        y[i, label[i]] = self.margin
    pred = F.log_softmax((cosine - y) * self.scale, -1)
    return self.nll_loss(pred, label), pred
def calc_loss(self, x, target):
    if self._label_smoothing:
        target = self._labelsmoothing(target)
        x = -F.log_softmax(x, axis=-1)
        cost = paddle.sum(x * target, axis=-1)
    else:
        cost = F.cross_entropy(x, label=target)
    avg_cost = self.reduce_loss(cost)
    return avg_cost
def _crossentropy(self, input, target):
    if self._label_smoothing:
        target = self._labelsmoothing(target)
        input = -F.log_softmax(input, axis=-1)
        cost = paddle.sum(target * input, axis=-1)
    else:
        cost = F.cross_entropy(input=input, label=target)
    avg_cost = paddle.mean(cost)
    return avg_cost
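# The label-smoothing branch above is a soft-label cross entropy: the smoothed
# target is multiplied with -log_softmax(logits) and summed over classes.
# A self-contained sketch of that computation; the smoothing factor and the
# toy shapes are illustrative assumptions.
import paddle
import paddle.nn.functional as F

def smoothed_cross_entropy(logits, labels, num_classes, eps=0.1):
    one_hot = F.one_hot(labels, num_classes)
    # Put 1 - eps on the true class and spread eps over the other classes.
    soft_target = one_hot * (1 - eps) + (1 - one_hot) * eps / (num_classes - 1)
    neg_log_probs = -F.log_softmax(logits, axis=-1)
    return paddle.mean(paddle.sum(neg_log_probs * soft_target, axis=-1))

logits = paddle.randn([4, 10])
labels = paddle.to_tensor([1, 3, 5, 7])
print(smoothed_cross_entropy(logits, labels, num_classes=10))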
def forward(self, outputs, targets, length=None):
    targets = F.one_hot(targets, outputs.shape[1])
    try:
        predictions = self.loss_fn(outputs, targets)
    except TypeError:
        predictions = self.loss_fn(outputs)
    predictions = F.log_softmax(predictions, axis=1)
    loss = self.criterion(predictions, targets) / targets.sum()
    return loss
def greedy_search(self, input_ids, logits_processors, max_length, pad_token_id,
                  eos_token_id, **model_kwargs):
    batch_size, cur_len = input_ids.shape
    origin_len = cur_len
    unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')
    scores = paddle.full(
        [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())

    while cur_len < max_length:
        # prepare model inputs & get model output
        model_inputs = self.prepare_inputs_for_generation(input_ids,
                                                          **model_kwargs)
        outputs = self(**model_inputs)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        # [batch_size, vocab_size]
        logits = logits[:, -1, :]

        # pre-process distribution
        logits = self.adjust_logits_during_generation(logits)
        logits = logits_processors(input_ids, logits)

        # greedy
        probs = F.log_softmax(logits)
        next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1)
        next_scores = paddle.index_sample(probs, next_tokens)

        if eos_token_id is not None:
            next_tokens = paddle.where(
                unfinished_flag, next_tokens,
                paddle.full_like(next_tokens, pad_token_id))

        scores = self.update_scores_for_generation(
            scores, next_scores, cur_len - origin_len, unfinished_flag)

        cur_len += 1
        input_ids = paddle.concat([input_ids, next_tokens], axis=1)

        if eos_token_id is not None:
            unfinished_flag = paddle.logical_and(
                unfinished_flag, next_tokens != eos_token_id)

        # Stop when there is a </s> in all sentences
        if not paddle.any(unfinished_flag):
            break

        model_kwargs = self.update_model_kwargs_for_generation(outputs,
                                                               model_kwargs)

    return input_ids[:, origin_len:], scores
def forward(self, x, label):
    loss_dict = {}
    if self.epsilon is not None:
        class_num = x.shape[-1]
        label = self._labelsmoothing(label, class_num)
        x = -F.log_softmax(x, axis=-1)
        loss = paddle.sum(x * label, axis=-1)
    else:
        if label.shape[-1] == x.shape[-1]:
            label = F.softmax(label, axis=-1)
            soft_label = True
        else:
            soft_label = False
        loss = F.cross_entropy(x, label=label, soft_label=soft_label)
    return loss
def forward(self, x):
    x = self.conv1(x)
    # x = self.maxpool(x)
    x = self.stage2(x)
    x = self.stage3(x)
    x = self.stage4(x)
    # global average pooling layer
    x = F.avg_pool2d(x, x.shape[-2:])
    # flatten for input to fully-connected layer
    x = x.flatten(1)
    x = self.fc(x)
    return F.log_softmax(x, axis=1)
def forward(self, confidence, predicted_locations, labels, gt_locations):
    """Compute the classification loss and the smooth L1 localization loss.

    Args:
        confidence (batch_size, num_priors, num_classes): class predictions.
        predicted_locations (batch_size, num_priors, 4): predicted locations.
        labels (batch_size, num_priors): real labels of all the priors.
        gt_locations (batch_size, num_priors, 4): real boxes corresponding to all the priors.
    """
    num_classes = confidence.shape[2]
    with paddle.no_grad():
        # derived from cross_entropy = sum(log(p))
        loss = -F.log_softmax(confidence, 2)[:, :, 0]
        mask = box_utils.hard_negative_mining(loss, labels, self.neg_pos_ratio)

    confidence = paddle.concat([
        confidence[:, :, 0].masked_select(mask).reshape([-1, 1]),
        confidence[:, :, 1].masked_select(mask).reshape([-1, 1])
    ], axis=1)
    classification_loss = F.cross_entropy(
        confidence.reshape([-1, num_classes]),
        labels.masked_select(mask),
        reduction='sum')

    pos_mask = labels > 0
    predicted_locations = predicted_locations.masked_select(
        paddle.concat([
            pos_mask.reshape(pos_mask.shape + [1]),
            pos_mask.reshape(pos_mask.shape + [1]),
            pos_mask.reshape(pos_mask.shape + [1]),
            pos_mask.reshape(pos_mask.shape + [1])
        ], axis=2)).reshape([-1, 4])
    gt_locations = gt_locations.masked_select(
        paddle.concat([
            pos_mask.reshape(pos_mask.shape + [1]),
            pos_mask.reshape(pos_mask.shape + [1]),
            pos_mask.reshape(pos_mask.shape + [1]),
            pos_mask.reshape(pos_mask.shape + [1])
        ], axis=2)).reshape([-1, 4])
    smooth_l1_loss = F.smooth_l1_loss(
        predicted_locations, gt_locations.cast('float32'), reduction='sum')
    # alternative: l2 loss
    # smooth_l1_loss = F.mse_loss(predicted_locations, gt_locations, reduction='sum')
    num_pos = gt_locations.shape[0]
    return smooth_l1_loss / num_pos, classification_loss / num_pos
def forward(self, logits):
    assert logits.ndim == 3, (
        f'the input logits must be a 3-d tensor of shape [n_spk, n_uttns, emb_dim], '
        f'but received logits.ndim = {logits.ndim}')
    logits = F.normalize(logits, p=2, axis=-1, epsilon=self.eps)
    # prototype of each speaker: mean over all but the first utterance
    proto = paddle.mean(
        logits[:, 1:, :], axis=1, keepdim=False).transpose((1, 0))  # [emb_dim, n_spk]
    query = logits[:, 0, :]  # [n_spk, emb_dim]
    similarity = paddle.matmul(query, proto) * self.s  # [n_spk, n_spk]
    label = paddle.arange(0, similarity.shape[0])
    log_sim = F.log_softmax(similarity, -1)
    return self.nll_loss(log_sim, label), log_sim
def calc_minilm_loss(loss_fct, s, t, attn_mask, num_relation_heads=0):
    """
    Calculates the loss for the Q-Q, K-K and V-V relations from MiniLMv2.

    Args:
        loss_fct (callable): Loss function for distillation. Only kl_div loss is supported now.
        s (Tensor): Q, K or V of the student.
        t (Tensor): Q, K or V of the teacher.
        attn_mask (Tensor): Attention mask for the relation.
        num_relation_heads (int): The number of relation heads. 0 means
            `num_relation_heads` equals the original head number. Defaults to 0.

    Returns:
        Tensor: MiniLM loss value.
    """
    # Initialize head_num
    if num_relation_heads > 0 and num_relation_heads != s.shape[1]:
        # s' shape: [bs, seq_len, head_num, head_dim]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
        # s' shape: [bs, seq_len, num_relation_heads, head_dim_new]
        s = tensor.reshape(x=s, shape=[0, 0, num_relation_heads, -1])
        # s' shape: [bs, num_relation_heads, seq_len, head_dim_new]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
    if num_relation_heads > 0 and num_relation_heads != t.shape[1]:
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])
        t = tensor.reshape(x=t, shape=[0, 0, num_relation_heads, -1])
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])

    s_head_dim, t_head_dim = s.shape[3], t.shape[3]
    scaled_dot_product_s = tensor.matmul(
        x=s, y=s, transpose_y=True) / math.sqrt(s_head_dim)
    del s
    scaled_dot_product_s += attn_mask
    scaled_dot_product_t = tensor.matmul(
        x=t, y=t, transpose_y=True) / math.sqrt(t_head_dim)
    del t
    scaled_dot_product_t += attn_mask
    loss = loss_fct(
        F.log_softmax(scaled_dot_product_s), F.softmax(scaled_dot_product_t))
    return loss
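# A self-contained sketch of the MiniLMv2-style relation loss on toy tensors.
# The shapes, the zero attention mask and the 'batchmean' KL reduction are
# illustrative assumptions, not values from the original distillation code.
import math
import paddle
import paddle.nn.functional as F

bs, heads, seq_len, head_dim = 2, 4, 8, 16
student_q = paddle.randn([bs, heads, seq_len, head_dim])
teacher_q = paddle.randn([bs, heads, seq_len, head_dim])
attn_mask = paddle.zeros([bs, 1, 1, seq_len])  # no padding in this toy batch

def self_relation(x):
    # Scaled dot-product of a head's states with themselves: [bs, heads, seq_len, seq_len].
    return paddle.matmul(x, x, transpose_y=True) / math.sqrt(x.shape[-1]) + attn_mask

loss = F.kl_div(
    F.log_softmax(self_relation(student_q), axis=-1),
    F.softmax(self_relation(teacher_q), axis=-1),
    reduction='batchmean')
print(loss)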
def forward(self, x, label):
    if isinstance(x, dict):
        x = x["logits"]
    if self.epsilon is not None:
        class_num = x.shape[-1]
        label = self._labelsmoothing(label, class_num)
        x = -F.log_softmax(x, axis=-1)
        loss = paddle.sum(x * label, axis=-1)
    else:
        if label.shape[-1] == x.shape[-1]:
            label = F.softmax(label, axis=-1)
            soft_label = True
        else:
            soft_label = False
        loss = F.cross_entropy(x, label=label, soft_label=soft_label)
    loss = loss.mean()
    return {"CELoss": loss}
def pointer(self, x, state):
    x_ = paddle.fluid.layers.expand(
        state.unsqueeze(1), expand_times=[1, x.shape[1], 1])
    out = paddle.concat([x, x_], axis=2)
    s0 = F.tanh(self.linear(out))
    s = self.weights(s0).reshape(shape=[x.shape[0], x.shape[1]])
    a = F.softmax(s)
    res = a.unsqueeze(1).bmm(x).squeeze(1)
    if self.normalize:
        if self.training:
            # In training we output log-softmax for NLL
            scores = F.log_softmax(s)
        else:
            # ...otherwise 0-1 probabilities
            scores = F.softmax(s)
    else:
        scores = a.exp()
    return scores, res
def __call__(self, logits, label, mode="train"):
    loss_dict = {}
    if self.epsilon is not None:
        class_num = logits.shape[-1]
        label = self._labelsmoothing(label, class_num)
        x = -F.log_softmax(logits, axis=-1)
        loss = paddle.sum(x * label, axis=-1)
    else:
        if label.shape[-1] == logits.shape[-1]:
            label = F.softmax(label, axis=-1)
            soft_label = True
        else:
            soft_label = False
        loss = F.cross_entropy(logits, label=label, soft_label=soft_label)
    loss_dict[self.name] = paddle.mean(loss)
    return loss_dict
def forward(self, pred, batch):
    pred = pred.reshape([-1, pred.shape[2]])
    max_len = batch[2].max()
    tgt = batch[1][:, 1:2 + max_len]
    tgt = tgt.reshape([-1])
    if self.smoothing:
        eps = 0.1
        n_class = pred.shape[1]
        one_hot = F.one_hot(tgt, pred.shape[1])
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prb = F.log_softmax(pred, axis=1)
        non_pad_mask = paddle.not_equal(
            tgt, paddle.zeros(tgt.shape, dtype=tgt.dtype))
        loss = -(one_hot * log_prb).sum(axis=1)
        loss = loss.masked_select(non_pad_mask).mean()
    else:
        loss = self.loss_func(pred, tgt)
    return {'loss': loss}
def forward(self, logits, targets):
    logits = F.normalize(logits, p=2, axis=1, epsilon=1e-8)
    wn = F.normalize(self.w, p=2, axis=0, epsilon=1e-8)
    cosine = logits @ wn
    sine = paddle.sqrt(1.0 - paddle.square(cosine))
    phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
    if self.easy_margin:
        phi = paddle.where(cosine > 0, phi, cosine)
    else:
        phi = paddle.where(cosine > self.th, phi, cosine - self.mm)
    target_one_hot = F.one_hot(targets, self.n_classes)
    outputs = (target_one_hot * phi) + (
        (1.0 - target_one_hot) * cosine) - target_one_hot * self.margin2
    outputs = self.scale * outputs
    pred = F.log_softmax(outputs, axis=-1)
    return self.nll_loss(pred, targets), pred
def forward(self, logit, label):
    logit = paddle.reshape(
        logit, [logit.shape[0], logit.shape[1], -1])  # N,C,H,W => N,C,H*W
    logit = paddle.transpose(logit, [0, 2, 1])  # N,C,H*W => N,H*W,C
    logit = paddle.reshape(logit, [-1, logit.shape[2]])  # N,H*W,C => N*H*W,C
    label = paddle.reshape(label, [-1, 1])
    range_ = paddle.arange(0, label.shape[0])
    range_ = paddle.unsqueeze(range_, axis=-1)
    label = paddle.cast(label, dtype='int64')
    label = paddle.concat([range_, label], axis=-1)
    logpt = F.log_softmax(logit)
    logpt = paddle.gather_nd(logpt, label)
    pt = paddle.exp(logpt.detach())
    loss = -1 * (1 - pt)**self.gamma * logpt
    loss = paddle.mean(loss)
    return loss
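# The core of this focal loss is the (1 - pt)**gamma factor that down-weights
# well-classified positions. A minimal standalone sketch on a flat batch of
# logits; gamma and the toy shapes are illustrative assumptions.
import paddle
import paddle.nn.functional as F

def focal_loss(logits, labels, gamma=2.0):
    log_probs = F.log_softmax(logits, axis=-1)
    # Log-probability of the true class for each sample.
    logpt = paddle.index_sample(log_probs, labels.unsqueeze(-1)).squeeze(-1)
    pt = paddle.exp(logpt.detach())
    return paddle.mean(-((1 - pt) ** gamma) * logpt)

logits = paddle.randn([6, 5])
labels = paddle.to_tensor([0, 1, 2, 3, 4, 0])
print(focal_loss(logits, labels))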
def forward(self, x, label):
    assert len(x.shape) == len(label.shape), \
        "x and label shape length should be the same but got {} for x.shape and {} for label.shape".format(
            x.shape, label.shape)
    if self.epsilon is not None:
        class_num = x.shape[-1]
        label = self._labelsmoothing(label, class_num)
        x = -F.log_softmax(x, axis=self.axis)
        loss = paddle.sum(x * label, axis=self.axis)
    else:
        if label.shape[self.axis] == x.shape[self.axis]:
            if self.label_act == "softmax":
                label = F.softmax(label, axis=self.axis)
            soft_label = True
        else:
            soft_label = False
        loss = F.cross_entropy(
            x, label=label, soft_label=soft_label, axis=self.axis)
    loss = loss.mean()
    return loss
def forward(self, inputs):
    # shared layers
    x = F.relu(self.conv1(inputs))
    x = F.relu(self.conv2(x))
    x = F.relu(self.conv3(x))
    # action policy head
    x_act = F.relu(self.act_conv1(x))
    x_act = paddle.reshape(
        x_act, [-1, 4 * self.board_height * self.board_width])
    x_act = F.log_softmax(self.act_fc1(x_act))
    # state value head
    x_val = F.relu(self.val_conv1(x))
    x_val = paddle.reshape(
        x_val, [-1, 2 * self.board_height * self.board_width])
    x_val = F.relu(self.val_fc1(x_val))
    x_val = F.tanh(self.val_fc2(x_val))
    return x_act, x_val
def valid_one_epoch(self, epoch):
    losses = []
    accs = []
    for i in range(self.model_num):
        if self.use_data_parallel:
            self.parallel_models[i].eval()
        else:
            self.models[i].eval()
        losses.append(AvgrageMeter())
        accs.append(AvgrageMeter())

    for _, (images, labels) in enumerate(self.valid_loader):
        images, labels = to_variable(images), to_variable(labels)
        batch_size = images.shape[0]

        logits = []
        if self.use_data_parallel:
            for model in self.parallel_models:
                logits.append(model(images))
        else:
            for model in self.models:
                logits.append(model(images))

        for i in range(self.model_num):
            gt_loss = self.models[i].loss(logits[i], labels)
            kl_loss = 0
            for j in range(self.model_num):
                if i != j:
                    x = F.log_softmax(logits[i], axis=1)
                    y = fluid.layers.softmax(logits[j], axis=1)
                    kl_loss += fluid.layers.kldiv_loss(
                        x, y, reduction='batchmean')
            loss = gt_loss
            if self.model_num > 1:
                loss += kl_loss / (self.model_num - 1)
            prec = fluid.layers.accuracy(input=logits[i], label=labels, k=1)
            losses[i].update(loss.numpy(), batch_size)
            accs[i].update(prec.numpy() * 100, batch_size)
    return losses, accs
def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                 memory_key_padding_mask):
    tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
    dec_seq = self.embedding(dec_seq).transpose([1, 0, 2])
    dec_seq = self.positional_encoding(dec_seq)
    tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[0])
    dec_output = self.decoder(
        dec_seq,
        enc_output,
        tgt_mask=tgt_mask,
        tgt_key_padding_mask=tgt_key_padding_mask,
        memory_key_padding_mask=memory_key_padding_mask,
    ).transpose([1, 0, 2])
    dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
    word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
    word_prob = word_prob.reshape([n_active_inst, n_bm, -1])
    return word_prob
def forward(self, logit_1, logit_2, label=None):
    """
    Calculate the KL loss. If the label is not None, it considers the
    ignore_index in label and calculates the masked loss.

    Args:
        logit_1 (Tensor): Logit tensor, the data type is float32 or float64.
            The shape is (N, C), where C is the number of classes; if the shape
            is more than 2D, it is (N, C, D1, D2, ..., Dk), k >= 1.
        logit_2 (Tensor): Logit tensor, the data type is float32 or float64.
            The shape of logit_2 is the same as logit_1.
        label (Tensor, optional): Label tensor, the data type is int64. The
            shape is (N), where each value is 0 <= label[i] <= C-1; if the
            shape is more than 2D, it is (N, D1, D2, ..., Dk), k >= 1.

    Returns:
        (Tensor): The average loss.
    """
    if logit_1.shape != logit_2.shape:
        raise ValueError(
            'The shape of logit_1 = {} must be the same as the shape of logit_2 = {}.'
            .format(logit_1.shape, logit_2.shape))

    logit_1 = F.log_softmax(logit_1 / self.temperature, axis=1)
    logit_2 = F.softmax(logit_2 / self.temperature, axis=1)
    loss = self.kl_loss(logit_1, logit_2)
    loss = loss * self.temperature * self.temperature

    if label is None:
        avg_loss = paddle.mean(loss)
    else:
        mask = label != self.ignore_index
        mask = paddle.cast(mask, 'float32')
        mask = paddle.unsqueeze(mask, axis=1)
        label.stop_gradient = True
        mask.stop_gradient = True
        loss = loss * mask
        avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
    return avg_loss
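# The loss above is the standard temperature-scaled distillation objective:
# both distributions are softened with T and the result is rescaled by T*T.
# A standalone sketch of the unmasked case; the temperature, the toy shapes
# and the 'mean' KL reduction are illustrative assumptions.
import paddle
import paddle.nn.functional as F

def kd_kl_loss(student_logits, teacher_logits, temperature=4.0):
    log_p_student = F.log_softmax(student_logits / temperature, axis=1)
    p_teacher = F.softmax(teacher_logits / temperature, axis=1)
    loss = F.kl_div(log_p_student, p_teacher, reduction='mean')
    # Rescale by T^2 so gradient magnitudes stay comparable to the hard-label loss.
    return loss * temperature * temperature

student_logits = paddle.randn([4, 19])
teacher_logits = paddle.randn([4, 19])
print(kd_kl_loss(student_logits, teacher_logits))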
def forward_test(self, src):
    bs = src.shape[0]
    if self.encoder is not None:
        src = self.positional_encoding(src.transpose([1, 0, 2]))
        memory = self.encoder(src)
    else:
        memory = src.squeeze(2).transpose([2, 0, 1])
    dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
    for len_dec_seq in range(1, 25):
        src_enc = memory.clone()
        tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
        dec_seq_embed = self.embedding(dec_seq).transpose([1, 0, 2])
        dec_seq_embed = self.positional_encoding(dec_seq_embed)
        tgt_mask = self.generate_square_subsequent_mask(
            dec_seq_embed.shape[0])
        output = self.decoder(
            dec_seq_embed,
            src_enc,
            tgt_mask=tgt_mask,
            memory_mask=None,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=None)
        dec_output = output.transpose([1, 0, 2])
        dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
        word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
        word_prob = word_prob.reshape([1, bs, -1])
        preds_idx = word_prob.argmax(axis=2)
        if paddle.equal_all(
                preds_idx[-1],
                paddle.full(preds_idx[-1].shape, 3, dtype='int64')):
            break
        preds_prob = word_prob.max(axis=2)
        dec_seq = paddle.concat(
            [dec_seq, preds_idx.reshape([-1, 1])], axis=1)
    return dec_seq
def _test_base(self, run_ipu=True):
    scope = fluid.core.Scope()
    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    SEED = self.SEED
    main_prog.random_seed = SEED
    startup_prog.random_seed = SEED

    with fluid.scope_guard(scope):
        with paddle.static.program_guard(main_prog, startup_prog):
            x = paddle.static.data(
                name=self.feed_list[0],
                shape=self.feed_shape[0],
                dtype=self.feed_dtype[0])
            out = F.log_softmax(x, **self.attrs)
            fetch_list = [out.name]

        if run_ipu:
            place = paddle.IPUPlace()
        else:
            place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        if run_ipu:
            feed_list = self.feed_list
            ipu_strategy = compiler.get_ipu_strategy()
            ipu_strategy.is_training = self.is_training
            program = compiler.IPUCompiledProgram(
                main_prog,
                ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
        else:
            program = main_prog

        result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
        return result[0]
def soft_cross_entropy(inp, target):
    inp_likelihood = F.log_softmax(inp, axis=-1)
    target_prob = F.softmax(target, axis=-1)
    return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))
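# Usage note: the helper above takes two logit tensors of the same shape and
# returns a scalar soft-label cross entropy (the second argument is treated as
# a teacher whose softmax output serves as the soft target). The shapes below
# are illustrative, and the call relies on soft_cross_entropy defined above.
import paddle

student_logits = paddle.randn([8, 128])  # student predictions
teacher_logits = paddle.randn([8, 128])  # teacher predictions used as soft targets
loss = soft_cross_entropy(student_logits, teacher_logits)  # scalar Tensor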
def beam_search(self, input_ids, beam_scorer, logits_processors, max_length,
                pad_token_id, eos_token_id, **model_kwargs):
    batch_size = len(beam_scorer._beam_hyps)
    num_beams = beam_scorer.num_beams

    batch_beam_size, cur_len = input_ids.shape
    origin_len = cur_len

    assert (
        num_beams * batch_size == batch_beam_size
    ), "Batch dimension of `input_ids` should be {}, but received {}.".format(
        num_beams * batch_size, batch_beam_size)

    beam_scores = paddle.zeros(
        (batch_size, num_beams), dtype=paddle.get_default_dtype())
    beam_scores[:, 1:] = -1e9
    beam_scores = paddle.reshape(beam_scores, [-1])

    while cur_len < max_length:
        # prepare model inputs & get model output
        model_inputs = self.prepare_inputs_for_generation(input_ids,
                                                          **model_kwargs)
        outputs = self(**model_inputs)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        # [batch_size, vocab_size]
        logits = logits[:, -1, :]

        # pre-process distribution
        logits = self.adjust_logits_during_generation(logits)
        logits = logits_processors(input_ids, logits)

        # beam search
        # [batch_size * num_beams, vocab_size]
        next_scores = F.log_softmax(logits)
        next_scores = next_scores + beam_scores.unsqueeze(-1)

        # reshape for beam search
        vocab_size = next_scores.shape[-1]
        next_scores = next_scores.reshape(
            [batch_size, num_beams * vocab_size])

        next_scores, next_tokens = paddle.topk(
            next_scores, 2 * num_beams, axis=1)

        next_indices = next_tokens // vocab_size
        next_tokens = next_tokens % vocab_size

        # stateless
        beam_outputs = beam_scorer.process(
            input_ids,
            next_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id)
        beam_scores = beam_outputs["next_beam_scores"]
        beam_next_tokens = beam_outputs["next_beam_tokens"]
        beam_idx = beam_outputs["next_beam_indices"]

        cur_len += 1
        input_ids = paddle.concat(
            [paddle.index_select(input_ids, beam_idx),
             beam_next_tokens.unsqueeze(-1)],
            axis=-1)

        if beam_scorer.is_done:
            break
        model_kwargs = self.update_model_kwargs_for_generation(outputs,
                                                               model_kwargs)
        if model_kwargs["cache"] is not None:
            # reorder the cache
            model_kwargs["cache"] = map_structure(
                lambda x: paddle.index_select(x, beam_idx),
                model_kwargs["cache"])

    pred_ids, scores = beam_scorer.finalize(
        input_ids,
        beam_scores,
        next_tokens,
        next_indices,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id)
    return pred_ids[:, origin_len:], scores