def lossweighed(ce_loss, labels):
    # `args` and `weight` come from the enclosing scope:
    # `weight` is a per-class weight tensor of shape [num_labels, 1].
    one_hot = fluid.one_hot(input=labels, depth=args["num_labels"])
    lw = fluid.layers.matmul(one_hot, weight)
    lw = fluid.layers.reduce_sum(lw, dim=1)
    loss = fluid.layers.elementwise_mul(lw, ce_loss)
    loss = fluid.layers.mean(loss)
    return loss
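# A minimal NumPy sketch (not part of the snippet above) of what the weighted
# loss computes: each sample's cross-entropy is scaled by the weight of its
# class, i.e. loss_i = weight[labels_i] * ce_loss_i. The `weight` argument
# here is a hypothetical per-class weight vector standing in for the free
# variable used above.
import numpy as np

def weighted_ce_numpy(ce_loss, labels, weight):
    one_hot = np.eye(len(weight))[labels]   # [N, num_labels]
    lw = (one_hot * weight).sum(axis=1)     # picks weight[labels_i]
    return np.mean(lw * ce_loss)

# ce = np.array([0.3, 1.2]); y = np.array([0, 2]); w = np.array([1., 2., 4.])
# weighted_ce_numpy(ce, y, w) == np.mean([1. * 0.3, 4. * 1.2])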
def test_api_with_dygraph(self):
    depth = 10
    label = np.array([np.random.randint(0, depth - 1)
                      for i in range(6)]).reshape([6, 1])
    with fluid.dygraph.guard():
        one_hot_label = fluid.one_hot(
            input=fluid.dygraph.to_variable(label), depth=depth)
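# For reference, fluid.one_hot maps integer ids in [0, depth) to rows of the
# identity matrix; a minimal NumPy equivalent of the call above:
import numpy as np

label = np.array([[1], [4], [0]])        # [N, 1] integer ids
one_hot = np.eye(10)[label.squeeze(-1)]  # [N, 10]; one_hot[i, label[i]] == 1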
def create_loss_op(self, predict, label, epsilon=1e-7):
    """Compute loss with tensors.

    Args:
        predict: model output tensor activated by softmax
        label: a non-sparse tensor

    Returns:
        loss: cross-entropy loss
    """
    if self.loss_type == "nl" and self.model_type == "train":
        one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
        one_hot_label = FL.squeeze(one_hot_label, axes=[-2])
        # negative-learning loss: -log(1 - p_y)
        neg_prob = 1 - predict
        log_neg_prob = FL.log(
            fluid.layers.clip(neg_prob, min=epsilon, max=1.))
        ce_loss = -1 * log_neg_prob * one_hot_label
        cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)
    else:  # PL or evaluation
        cost = FL.cross_entropy(predict, label)
    loss = FL.mean(cost)
    return loss
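# A small NumPy sketch of the negative-learning ("nl") branch above: instead
# of maximizing log p(y), it maximizes log(1 - p(y)), pushing probability away
# from the (possibly noisy) label. The clipping mirrors the epsilon guard above.
import numpy as np

def nl_loss_numpy(predict, label, epsilon=1e-7):
    # predict: [N, C] softmax outputs; label: [N] integer ids
    neg_prob = np.clip(1.0 - predict, epsilon, 1.0)
    return np.mean(-np.log(neg_prob)[np.arange(len(label)), label])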
def soft_dice_loss(logits, labels):
    probs = L.softmax(logits, axis=-1)
    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    intersection = L.reduce_sum(probs * one_hot, dim=-1)
    # union = L.reduce_sum(probs, axis=-1) + L.reduce_sum(labels, axis=-1)
    # NOTE: with the union term commented out, this reduces to 1 - p_y.
    loss = 1 - intersection
    return L.reduce_mean(loss)
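# A sketch of the usual soft dice with the union restored (an assumption about
# the intended form, following the commented-out line above):
import numpy as np

def soft_dice_numpy(probs, one_hot, eps=1e-7):
    intersection = (probs * one_hot).sum(axis=-1)
    union = probs.sum(axis=-1) + one_hot.sum(axis=-1)
    return np.mean(1.0 - 2.0 * intersection / (union + eps))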
def chunk_softmax(logits, labels, topk=10):
    after_exp = L.exp(logits)
    out, _ = L.argsort(after_exp, axis=-1)  # ascending sort of exp(logits)
    # normalize by the sum of the top-k exponentials only
    denorm = L.reduce_sum(out[:, -topk:], dim=-1, keep_dim=True)
    probs = after_exp / denorm
    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    loss = -L.reduce_sum(one_hot * L.log(probs)) / logits.shape[0]
    return loss
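# The NumPy analogue of chunk_softmax: exponentials are normalized by the sum
# of only the top-k entries rather than the full row sum, so in-top-k classes
# receive slightly inflated probabilities.
import numpy as np

def chunk_softmax_numpy(logits, labels, topk=10):
    after_exp = np.exp(logits)  # [N, V]
    denorm = np.sort(after_exp, axis=-1)[:, -topk:].sum(-1, keepdims=True)
    probs = after_exp / denorm
    return -np.log(probs[np.arange(len(labels)), labels]).mean()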
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape
    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(
        F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        # first step considers only beam 0
        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:, 0, :]
    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat(
        [L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    # gather new beam state according to new beam id
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape)

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state
def _labelsmoothing(self, target):
    if target.shape[-1] != self._class_dim:
        one_hot_target = fluid.one_hot(input=target, depth=self._class_dim)
    else:
        one_hot_target = target
    soft_target = fluid.layers.label_smooth(label=one_hot_target,
                                            epsilon=self._epsilon,
                                            dtype="float32")
    return soft_target
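# fluid.layers.label_smooth (with no prior distribution) implements the
# standard uniform smoothing y_smooth = (1 - epsilon) * y_onehot + epsilon / K;
# a one-line NumPy check:
import numpy as np

def label_smooth_numpy(one_hot_target, epsilon):
    K = one_hot_target.shape[-1]
    return (1.0 - epsilon) * one_hot_target + epsilon / K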
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape
    # batch size is 1 in this hub module, so the first dim, bsz * beam_size,
    # equals beam_size
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # suppress [UNK] by zeroing its logit
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(
        F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        # first step considers only beam 0
        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:, 0, :]
    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat(
        [L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    # gather new beam state according to new beam id
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape)

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state
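# After flattening scores to [B, W * V], a single top-k recovers both the
# parent beam and the token, as in the two lines above: for a flat index i,
# beam = i // V and word = i % V. A tiny NumPy illustration:
import numpy as np

W, V = 2, 5
allscore = np.random.rand(1, W * V)
idx = np.argsort(allscore, axis=-1)[:, -W:]  # top-W flat indices
next_beam_id = idx // V                      # which beam each came from
next_word_id = idx % V                       # which token to emit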
def cross_entropy_label_smooth(preds, targets, epsilon):
    preds = fluid.layers.softmax(preds)
    targets_one_hot = fluid.one_hot(input=targets, depth=args.class_num)
    targets_smooth = fluid.layers.label_smooth(targets_one_hot,
                                               epsilon=epsilon,
                                               dtype="float32")
    loss = fluid.layers.cross_entropy(input=preds,
                                      label=targets_smooth,
                                      soft_label=True)
    return loss
def req_cost(self, program, score):
    score = fluid.one_hot(score, CLASSIFY_NUM)
    loss = program.current_block().create_var(name="cosnn_loss_tmp",
                                              dtype="float32",
                                              shape=[1])
    layers.py_func(func=_gt_score_loss,
                   x=[self.layers_out, score],
                   out=loss,
                   backward_func=_backward_gt_score)
    # loss = layers.cross_entropy(self.layers_out, score)
    return layers.mean(loss)
def _run(self, depth):
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    one_hot_label = fluid.one_hot(input=label, depth=depth)

    place = fluid.NPUPlace(0)
    label_data = np.array([np.random.randint(0, 10 - 1)
                           for i in range(6)]).reshape([6, 1])

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    ret = exe.run(feed={'label': label_data},
                  fetch_list=[one_hot_label],
                  return_numpy=False)
def test_api_with_dygraph(self):
    depth = 10
    label = np.array([np.random.randint(0, depth - 1)
                      for i in range(6)]).reshape([6, 1])
    with fluid.dygraph.guard():
        one_hot_label = fluid.one_hot(
            input=fluid.dygraph.to_variable(label), depth=depth)
        one_hot_label = paddle.nn.functional.one_hot(
            fluid.dygraph.to_variable(label), depth)
        with _test_eager_guard():
            one_hot_label = paddle.nn.functional.one_hot(
                paddle.to_tensor(label), depth)
def build_program(self, backward=False, dtype=None):
    import paddle.fluid as fluid

    self.name = "one_hot"
    with fluid.program_guard(self.main_program, self.startup_program):
        input = fluid.data(name='input',
                           shape=config.input_shape,
                           dtype='int32',
                           lod_level=0)
        input.stop_gradient = False
        result = fluid.one_hot(input=input, depth=config.depth)

        self.feed_vars = [input]
        self.fetch_vars = [result]
def arc_margin_product(self, input, label, out_dim, m, s, easy_margin=False):
    # input = fluid.layers.l2_normalize(input, axis=1)
    input_norm = fluid.layers.sqrt(
        fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
    input = fluid.layers.elementwise_div(input, input_norm, axis=0)

    if self.weight is None:
        self.weight = fluid.layers.create_parameter(
            shape=[self.class_dim, input.shape[1]],
            dtype='float32',
            name='weight_norm',
            attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Xavier()))

    # weight = fluid.layers.l2_normalize(weight, axis=1)
    weight_norm = fluid.layers.sqrt(
        fluid.layers.reduce_sum(fluid.layers.square(self.weight), dim=1))
    weight = fluid.layers.elementwise_div(self.weight, weight_norm, axis=0)
    weight = fluid.layers.transpose(weight, perm=[1, 0])
    cosine = fluid.layers.mul(input, weight)
    sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine) + 1e-6)

    cos_m = math.cos(m)
    sin_m = math.sin(m)
    phi = cosine * cos_m - sine * sin_m  # cos(theta + m)

    th = math.cos(math.pi - m)
    mm = math.sin(math.pi - m) * m
    if easy_margin:
        phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
    else:
        phi = self.paddle_where_more_than(cosine, th, phi, cosine - mm)

    one_hot = fluid.one_hot(input=label, depth=out_dim)
    one_hot = fluid.layers.squeeze(input=one_hot, axes=[1])
    output = fluid.layers.elementwise_mul(one_hot, phi) + \
        fluid.layers.elementwise_mul((1.0 - one_hot), cosine)
    output = output * s
    return output
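# The margin term above relies on the angle-addition identity
# cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m); a quick NumPy check of
# the phi computation used by arc_margin_product:
import numpy as np

m = 0.5
theta = np.linspace(0.1, 3.0, 5)
cosine, sine = np.cos(theta), np.sin(theta)
phi = cosine * np.cos(m) - sine * np.sin(m)
assert np.allclose(phi, np.cos(theta + m))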
def arc_margin_product(self, input, label, out_dim, s=32.0, m=0.50, mode=2):
    input_norm = fluid.layers.sqrt(
        fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
    input = fluid.layers.elementwise_div(input, input_norm, axis=0)

    weight = fluid.layers.create_parameter(
        shape=[out_dim, input.shape[1]],
        dtype='float32',
        name='weight_norm',
        attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.Xavier(),
            regularizer=fluid.regularizer.L2Decay(4e-4)))
    weight_norm = fluid.layers.sqrt(
        fluid.layers.reduce_sum(fluid.layers.square(weight), dim=1))
    weight = fluid.layers.elementwise_div(weight, weight_norm, axis=0)
    weight = fluid.layers.transpose(weight, perm=[1, 0])
    cosine = fluid.layers.mul(input, weight)
    sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine))

    cos_m = math.cos(m)
    sin_m = math.sin(m)
    phi = cosine * cos_m - sine * sin_m

    th = math.cos(math.pi - m)
    mm = math.sin(math.pi - m) * m
    if mode == 1:
        phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
    elif mode == 2:
        phi = self.paddle_where_more_than(cosine, th, phi, cosine - mm)
    else:
        pass

    one_hot = fluid.one_hot(input=label, depth=out_dim)
    output = fluid.layers.elementwise_mul(one_hot, phi) + \
        fluid.layers.elementwise_mul((1.0 - one_hot), cosine)
    output = output * s
    return output
def __init__(self, pretrain_path, N, K, max_length, hidden_size, att_dim,
             induction_iters, relation_size):
    """
    Args:
        pretrain_path: str. Path for word embedding and word id.
        N: int. N-way.
        K: int. K-shot.
        max_length: int.
        hidden_size: int.
        att_dim: int.
        induction_iters: int.
        relation_size: int.
    """
    totalQ = fluid.data(name="totalQ", shape=[None], dtype="int32")  # total query
    total_Q = totalQ[0]
    support = fluid.data(name="support", shape=[None, N, K, max_length],
                         dtype="int64")  # [B, N, K, T]
    support_len = fluid.data(name="support_len", shape=[None, N, K],
                             dtype="int64")  # [B, N, K]
    query = fluid.data(name="query", shape=[None, None, max_length],
                       dtype="int64")  # [B, totalQ, T]
    query_len = fluid.data(name="query_len", shape=[None, None],
                           dtype="int64")  # [B, totalQ]
    label = fluid.data(name="label", shape=[None, None],
                       dtype="int64")  # [B, totalQ]

    # Must be 3 data readers.
    # See https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/static_mode/use_py_reader.html
    self.train_reader = fluid.io.DataLoader.from_generator(
        feed_list=[totalQ, support, support_len, query, query_len, label],
        capacity=8, iterable=True)
    self.val_reader = fluid.io.DataLoader.from_generator(
        feed_list=[totalQ, support, support_len, query, query_len, label],
        capacity=8, iterable=True)
    self.test_reader = fluid.io.DataLoader.from_generator(
        feed_list=[totalQ, support, support_len, query, query_len, label],
        capacity=8, iterable=True)

    # 1. Encoder
    word_vec, vocab_size, embed_size = self.__load_embed_matrix(pretrain_path)

    support = fluid.layers.reshape(support, shape=[-1, max_length])  # [BNK, T]
    support_len = fluid.layers.reshape(support_len, shape=[-1])  # [BNK]
    support_emb = self.encoder_module(
        support, support_len, max_length, word_vec, vocab_size,
        embed_size, hidden_size, att_dim)  # [BNK, 2H]
    support_emb = fluid.layers.reshape(
        support_emb, shape=[-1, N, K, 2 * hidden_size])  # [B, N, K, 2H]

    query = fluid.layers.reshape(query, shape=[-1, max_length])  # [B*totalQ, T]
    query_len = fluid.layers.reshape(query_len, shape=[-1])  # [B*totalQ]
    query_emb = self.encoder_module(
        query, query_len, max_length, word_vec, vocab_size,
        embed_size, hidden_size, att_dim)  # [B*totalQ, 2H]
    query_emb = fluid.layers.reshape(
        query_emb, shape=[-1, total_Q, 2 * hidden_size])  # [B, totalQ, 2H]

    # 2. Induction
    class_emb = self.induction_module(
        support_emb, N, K, induction_iters, hidden_size)  # [B, N, 1, 2H]

    # 3. Relation
    relation_score = self.relation_module(
        class_emb, query_emb, N, total_Q, hidden_size,
        relation_size)  # [B, totalQ, N]

    # Return
    label_onehot = fluid.one_hot(label, depth=N)  # [B, totalQ, N]
    self.loss = fluid.layers.mse_loss(relation_score, label_onehot)  # [1]
    self.mean_acc = fluid.layers.accuracy(
        input=fluid.layers.reshape(relation_score, shape=[-1, N]),
        label=fluid.layers.reshape(label, shape=[-1, 1]))  # [1]
    self.prediction = fluid.layers.argmax(relation_score, axis=-1)  # [B, totalQ]
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:  # mask_type == 'empty'
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(1, len(tokenizer.vocab),
                                          size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous position ids
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  # corrupted
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***   src, tgt, attn
        src   00,  01,  11
        tgt   10,  11,  12
        attn  20,  21,  22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig. 3 of https://arxiv.org/abs/2001.11314
        '''
        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)
        '''
        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn('src', unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
        propeller.data.TextColumn('tgt', unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset(
        'train', data_dir=os.path.join(args.data_dir, 'train'),
        shuffle=False, repeat=True, use_gz=False) \
        .map(map_fn)

    dev_ds = feature_column.build_dataset(
        'dev', data_dir=os.path.join(args.data_dir, 'dev'),
        shuffle=False, repeat=False, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.eval_bsz) \
        .map(after_padding)

    log.debug('shard %d of %d' %
              (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(
            args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(
        args.lr, int(args.warmup_proportion * args.max_steps),
        args.max_steps),
        parameter_list=model.parameters(),
        weight_decay=args.wd,
        grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]
    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
         tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
         mask_attn_2_srctgtattn, tgt_labels) = data

        _, __, info = model(src_ids,
                            sent_ids=src_sids,
                            pos_ids=src_pids,
                            attn_bias=mask_src_2_src,
                            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(tgt_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_tgt_2_srctgt,
                            past_cache=(cached_k, cached_v),
                            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                        epsilon=args.label_smooth)
        loss, _, __ = model(attn_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_attn_2_srctgtattn,
                            past_cache=(past_cache_k, past_cache_v),
                            tgt_labels=tgt_labels,
                            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()
        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 \
                and D.parallel.Env().dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None \
                and step > args.skip_eval_steps \
                and step % args.eval_steps == 0:
            assert os.path.exists(args.predict_output_dir), \
                'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
def softmax_with_loss(logit, label, ignore_mask=None, num_classes=2,
                      weight=None):
    ignore_mask = fluid.layers.cast(ignore_mask, 'float32')
    label = fluid.layers.elementwise_min(
        label,
        fluid.layers.assign(np.array([num_classes - 1], dtype=np.int32)))
    logit = fluid.layers.transpose(logit, [0, 2, 3, 1])
    logit = fluid.layers.reshape(logit, [-1, num_classes])
    label = fluid.layers.reshape(label, [-1, 1])
    label = fluid.layers.cast(label, 'int64')
    ignore_mask = fluid.layers.reshape(ignore_mask, [-1, 1])
    if weight is None:
        loss, probs = fluid.layers.softmax_with_cross_entropy(
            logit, label, ignore_index=cfg.DATASET.IGNORE_INDEX,
            return_softmax=True)
    else:
        label = fluid.layers.squeeze(label, axes=[-1])
        label_one_hot = fluid.one_hot(input=label, depth=num_classes)
        if isinstance(weight, list):
            assert len(weight) == num_classes, \
                "weight length must equal num of classes"
            weight = fluid.layers.assign(np.array([weight], dtype='float32'))
        elif isinstance(weight, str):
            assert weight.lower() == 'dynamic', \
                'if weight is string, must be dynamic!'
            tmp = []
            total_num = fluid.layers.cast(
                fluid.layers.shape(label)[0], 'float32')
            for i in range(num_classes):
                cls_pixel_num = fluid.layers.reduce_sum(label_one_hot[:, i])
                ratio = total_num / (cls_pixel_num + 1)
                tmp.append(ratio)
            weight = fluid.layers.concat(tmp)
            weight = weight / fluid.layers.reduce_sum(weight) * num_classes
        elif isinstance(weight, fluid.layers.Variable):
            pass
        else:
            raise ValueError(
                'Expect weight is a list, string or Variable, but receive {}'
                .format(type(weight)))
        weight = fluid.layers.reshape(weight, [1, num_classes])
        weighted_label_one_hot = fluid.layers.elementwise_mul(
            label_one_hot, weight)
        probs = fluid.layers.softmax(logit)
        # weighted_label_one_hot = weighted_label_one_hot * (1 - probs)
        loss = fluid.layers.cross_entropy(
            probs, weighted_label_one_hot, soft_label=True,
            ignore_index=cfg.DATASET.IGNORE_INDEX)
        weighted_label_one_hot.stop_gradient = True

    loss = loss * ignore_mask
    avg_loss = fluid.layers.mean(loss) / (
        fluid.layers.mean(ignore_mask) + cfg.MODEL.DEFAULT_EPSILON)

    label.stop_gradient = True
    ignore_mask.stop_gradient = True
    return avg_loss
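# A NumPy sketch of the 'dynamic' weighting branch above: each class weight is
# proportional to total_pixels / (pixels_of_class + 1), then rescaled so the
# weights sum to num_classes.
import numpy as np

def dynamic_class_weights(label_one_hot):
    total_num = float(label_one_hot.shape[0])
    cls_pixel_num = label_one_hot.sum(axis=0)      # [num_classes]
    weight = total_num / (cls_pixel_num + 1.0)
    return weight / weight.sum() * label_one_hot.shape[1]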
def get_reg_loss(pred_reg, reg_label, fg_mask, point_num, loc_scope,
                 loc_bin_size, num_head_bin, anchor_size,
                 get_xz_fine=True, get_y_by_bin=False, loc_y_scope=0.5,
                 loc_y_bin_size=0.25, get_ry_fine=False):
    """
    Bin-based 3D bounding box regression loss. See
    https://arxiv.org/abs/1812.04244 for more details.

    :param pred_reg: (N, C)
    :param reg_label: (N, 7) [dx, dy, dz, h, w, l, ry]
    :param loc_scope: constant
    :param loc_bin_size: constant
    :param num_head_bin: constant
    :param anchor_size: (N, 3) or (3)
    :param get_xz_fine:
    :param get_y_by_bin:
    :param loc_y_scope:
    :param loc_y_bin_size:
    :param get_ry_fine:
    :return:
    """
    fg_num = fluid.layers.cast(fluid.layers.reduce_sum(fg_mask),
                               dtype=pred_reg.dtype)
    fg_num = fluid.layers.clip(fg_num, min=1.0, max=point_num)
    fg_scale = float(point_num) / fg_num

    per_loc_bin_num = int(loc_scope / loc_bin_size) * 2
    loc_y_bin_num = int(loc_y_scope / loc_y_bin_size) * 2

    reg_loss_dict = {}

    # xz localization loss
    x_offset_label, y_offset_label, z_offset_label = \
        reg_label[:, 0:1], reg_label[:, 1:2], reg_label[:, 2:3]
    x_shift = fluid.layers.clip(x_offset_label + loc_scope, 0.,
                                loc_scope * 2 - 1e-3)
    z_shift = fluid.layers.clip(z_offset_label + loc_scope, 0.,
                                loc_scope * 2 - 1e-3)
    x_bin_label = fluid.layers.cast(x_shift / loc_bin_size, dtype='int64')
    z_bin_label = fluid.layers.cast(z_shift / loc_bin_size, dtype='int64')

    x_bin_l, x_bin_r = 0, per_loc_bin_num
    z_bin_l, z_bin_r = per_loc_bin_num, per_loc_bin_num * 2
    start_offset = z_bin_r

    loss_x_bin = fluid.layers.softmax_with_cross_entropy(
        pred_reg[:, x_bin_l:x_bin_r], x_bin_label)
    loss_x_bin = fluid.layers.reduce_mean(loss_x_bin * fg_mask) * fg_scale
    loss_z_bin = fluid.layers.softmax_with_cross_entropy(
        pred_reg[:, z_bin_l:z_bin_r], z_bin_label)
    loss_z_bin = fluid.layers.reduce_mean(loss_z_bin * fg_mask) * fg_scale
    reg_loss_dict['loss_x_bin'] = loss_x_bin
    reg_loss_dict['loss_z_bin'] = loss_z_bin
    loc_loss = loss_x_bin + loss_z_bin

    if get_xz_fine:
        x_res_l, x_res_r = per_loc_bin_num * 2, per_loc_bin_num * 3
        z_res_l, z_res_r = per_loc_bin_num * 3, per_loc_bin_num * 4
        start_offset = z_res_r

        x_res_label = x_shift - (
            fluid.layers.cast(x_bin_label, dtype=x_shift.dtype) *
            loc_bin_size + loc_bin_size / 2.)
        z_res_label = z_shift - (
            fluid.layers.cast(z_bin_label, dtype=z_shift.dtype) *
            loc_bin_size + loc_bin_size / 2.)
        x_res_norm_label = x_res_label / loc_bin_size
        z_res_norm_label = z_res_label / loc_bin_size

        x_bin_onehot = fluid.one_hot(x_bin_label[:, 0], depth=per_loc_bin_num)
        z_bin_onehot = fluid.one_hot(z_bin_label[:, 0], depth=per_loc_bin_num)

        loss_x_res = fluid.layers.smooth_l1(
            fluid.layers.reduce_sum(
                pred_reg[:, x_res_l:x_res_r] * x_bin_onehot,
                dim=1, keep_dim=True),
            x_res_norm_label)
        loss_x_res = fluid.layers.reduce_mean(loss_x_res * fg_mask) * fg_scale
        loss_z_res = fluid.layers.smooth_l1(
            fluid.layers.reduce_sum(
                pred_reg[:, z_res_l:z_res_r] * z_bin_onehot,
                dim=1, keep_dim=True),
            z_res_norm_label)
        loss_z_res = fluid.layers.reduce_mean(loss_z_res * fg_mask) * fg_scale
        reg_loss_dict['loss_x_res'] = loss_x_res
        reg_loss_dict['loss_z_res'] = loss_z_res
        loc_loss += loss_x_res + loss_z_res

    # y localization loss
    if get_y_by_bin:
        y_bin_l, y_bin_r = start_offset, start_offset + loc_y_bin_num
        y_res_l, y_res_r = y_bin_r, y_bin_r + loc_y_bin_num
        start_offset = y_res_r

        y_shift = fluid.layers.clip(y_offset_label + loc_y_scope, 0.,
                                    loc_y_scope * 2 - 1e-3)
        y_bin_label = fluid.layers.cast(y_shift / loc_y_bin_size,
                                        dtype='int64')
        y_res_label = y_shift - (
            fluid.layers.cast(y_bin_label, dtype=y_shift.dtype) *
            loc_y_bin_size + loc_y_bin_size / 2.)
        y_res_norm_label = y_res_label / loc_y_bin_size

        y_bin_onehot = fluid.one_hot(y_bin_label[:, 0], depth=per_loc_bin_num)

        loss_y_bin = fluid.layers.cross_entropy(pred_reg[:, y_bin_l:y_bin_r],
                                                y_bin_label)
        loss_y_bin = fluid.layers.reduce_mean(loss_y_bin * fg_mask) * fg_scale
        loss_y_res = fluid.layers.smooth_l1(
            fluid.layers.reduce_sum(
                pred_reg[:, y_res_l:y_res_r] * y_bin_onehot,
                dim=1, keep_dim=True),
            y_res_norm_label)
        loss_y_res = fluid.layers.reduce_mean(loss_y_res * fg_mask) * fg_scale

        reg_loss_dict['loss_y_bin'] = loss_y_bin
        reg_loss_dict['loss_y_res'] = loss_y_res
        loc_loss += loss_y_bin + loss_y_res
    else:
        y_offset_l, y_offset_r = start_offset, start_offset + 1
        start_offset = y_offset_r

        loss_y_offset = fluid.layers.smooth_l1(
            fluid.layers.reduce_sum(pred_reg[:, y_offset_l:y_offset_r],
                                    dim=1, keep_dim=True),
            y_offset_label)
        loss_y_offset = fluid.layers.reduce_mean(
            loss_y_offset * fg_mask) * fg_scale
        reg_loss_dict['loss_y_offset'] = loss_y_offset
        loc_loss += loss_y_offset

    # angle loss
    ry_bin_l, ry_bin_r = start_offset, start_offset + num_head_bin
    ry_res_l, ry_res_r = ry_bin_r, ry_bin_r + num_head_bin

    ry_label = reg_label[:, 6:7]

    if get_ry_fine:
        # divide pi/2 into several bins
        angle_per_class = (np.pi / 2) / num_head_bin

        ry_label = ry_label % (2 * np.pi)  # 0 ~ 2pi
        opposite_flag = fluid.layers.logical_and(ry_label > np.pi * 0.5,
                                                 ry_label < np.pi * 1.5)
        opposite_flag = fluid.layers.cast(opposite_flag, dtype=ry_label.dtype)
        shift_angle = (ry_label + opposite_flag * np.pi +
                       np.pi * 0.5) % (2 * np.pi)  # (0 ~ pi)
        shift_angle.stop_gradient = True

        shift_angle = fluid.layers.clip(
            shift_angle - np.pi * 0.25, min=1e-3,
            max=np.pi * 0.5 - 1e-3)  # (0, pi/2)

        # bin center is (5, 10, 15, ..., 85)
        ry_bin_label = fluid.layers.cast(shift_angle / angle_per_class,
                                         dtype='int64')
        ry_res_label = shift_angle - (
            fluid.layers.cast(ry_bin_label, dtype=shift_angle.dtype) *
            angle_per_class + angle_per_class / 2)
        ry_res_norm_label = ry_res_label / (angle_per_class / 2)
    else:
        # divide 2pi into several bins
        angle_per_class = (2 * np.pi) / num_head_bin
        heading_angle = ry_label % (2 * np.pi)  # 0 ~ 2pi

        shift_angle = (heading_angle + angle_per_class / 2) % (2 * np.pi)
        shift_angle.stop_gradient = True
        ry_bin_label = fluid.layers.cast(shift_angle / angle_per_class,
                                         dtype='int64')
        ry_res_label = shift_angle - (
            fluid.layers.cast(ry_bin_label, dtype=shift_angle.dtype) *
            angle_per_class + angle_per_class / 2)
        ry_res_norm_label = ry_res_label / (angle_per_class / 2)

    ry_bin_onehot = fluid.one_hot(ry_bin_label[:, 0], depth=num_head_bin)
    loss_ry_bin = fluid.layers.softmax_with_cross_entropy(
        pred_reg[:, ry_bin_l:ry_bin_r], ry_bin_label)
    loss_ry_bin = fluid.layers.reduce_mean(loss_ry_bin * fg_mask) * fg_scale
    loss_ry_res = fluid.layers.smooth_l1(
        fluid.layers.reduce_sum(
            pred_reg[:, ry_res_l:ry_res_r] * ry_bin_onehot,
            dim=1, keep_dim=True),
        ry_res_norm_label)
    loss_ry_res = fluid.layers.reduce_mean(loss_ry_res * fg_mask) * fg_scale

    reg_loss_dict['loss_ry_bin'] = loss_ry_bin
    reg_loss_dict['loss_ry_res'] = loss_ry_res
    angle_loss = loss_ry_bin + loss_ry_res

    # size loss
    size_res_l, size_res_r = ry_res_r, ry_res_r + 3
    assert pred_reg.shape[1] == size_res_r, \
        '%d vs %d' % (pred_reg.shape[1], size_res_r)

    anchor_size_var = fluid.layers.zeros(shape=[3], dtype=reg_label.dtype)
    fluid.layers.assign(np.array(anchor_size).astype('float32'),
                        anchor_size_var)
    size_res_norm_label = (reg_label[:, 3:6] - anchor_size_var) / \
        anchor_size_var
    size_res_norm_label = fluid.layers.reshape(size_res_norm_label,
                                               shape=[-1, 1], inplace=True)
    size_res_norm = pred_reg[:, size_res_l:size_res_r]
    size_res_norm = fluid.layers.reshape(size_res_norm, shape=[-1, 1],
                                         inplace=True)
    size_loss = fluid.layers.smooth_l1(size_res_norm, size_res_norm_label)
    size_loss = fluid.layers.reshape(size_loss, shape=[-1, 3])
    size_loss = fluid.layers.reduce_mean(size_loss * fg_mask) * fg_scale

    # Total regression loss
    reg_loss_dict['loss_loc'] = loc_loss
    reg_loss_dict['loss_angle'] = angle_loss
    reg_loss_dict['loss_size'] = size_loss

    return loc_loss, angle_loss, size_loss, reg_loss_dict
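# The bin-based targets above can be summarized in NumPy: an offset is shifted
# into [0, 2 * scope), assigned to a bin, and the residual to the bin center
# is regressed in normalized form.
import numpy as np

def encode_bin_residual(offset, scope, bin_size):
    shift = np.clip(offset + scope, 0.0, 2 * scope - 1e-3)
    bin_label = (shift / bin_size).astype(np.int64)
    res = shift - (bin_label * bin_size + bin_size / 2.0)
    return bin_label, res / bin_size  # bin class + normalized residual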
def _build_decoder(self, z_mean=None, z_log_var=None, enc_output=None,
                   mode='train', beam_size=10):
    dec_input = layers.dropout(self.tar_emb,
                               dropout_prob=self.dec_dropout_in,
                               dropout_implementation="upscale_in_train")

    # `output_layer` will be used within BeamSearchDecoder
    output_layer = lambda x: layers.fc(x,
                                       size=self.tar_vocab_size,
                                       num_flatten_dims=len(x.shape) - 1,
                                       name="output_w")

    # `sample_output_layer` samples an id from the logits distribution
    # instead of argmax(logits); it will be used within BeamSearchDecoder
    sample_output_layer = lambda x: layers.unsqueeze(
        fluid.one_hot(
            layers.unsqueeze(
                layers.sampling_id(
                    layers.softmax(layers.squeeze(output_layer(x), [1])),
                    dtype='int'), [1]),
            depth=self.tar_vocab_size), [1])

    if mode == 'train':
        latent_z = self._sampling(z_mean, z_log_var)
    else:
        latent_z = layers.gaussian_random_batch_size_like(
            self.tar, shape=[-1, self.latent_size])
    dec_first_hidden_cell = layers.fc(latent_z,
                                      2 * self.hidden_size * self.num_layers,
                                      name='fc_hc')
    dec_first_hidden, dec_first_cell = layers.split(dec_first_hidden_cell, 2)
    if self.num_layers > 1:
        dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
        dec_first_cell = layers.split(dec_first_cell, self.num_layers)
    else:
        dec_first_hidden = [dec_first_hidden]
        dec_first_cell = [dec_first_cell]
    dec_initial_states = [
        [h, c] for h, c in zip(dec_first_hidden, dec_first_cell)
    ]
    dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                           self.param_attr_initializer,
                           self.param_attr_scale, self.dec_dropout_out)

    if mode == 'train':
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=dec_input,
                            initial_states=dec_initial_states,
                            sequence_length=self.tar_sequence_length)
        dec_output = output_layer(dec_output)
        return dec_output
    elif mode == 'greedy':
        start_token = 1
        end_token = 2
        max_length = 100
        beam_search_decoder = BeamSearchDecoder(
            dec_cell, start_token, end_token, beam_size=1,
            embedding_fn=self.tar_embeder, output_fn=output_layer)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=dec_initial_states,
                                    max_step_num=max_length)
        return outputs
    elif mode == 'sampling':
        start_token = 1
        end_token = 2
        max_length = 100
        beam_search_decoder = BeamSearchDecoder(
            dec_cell, start_token, end_token, beam_size=1,
            embedding_fn=self.tar_embeder, output_fn=sample_output_layer)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=dec_initial_states,
                                    max_step_num=max_length)
        return outputs
    else:
        print("mode not supported:", mode)
def finetune(self,
             train_path,
             dev_path=None,
             save_dir="ernie_gen_result",
             init_ckpt_path=None,
             use_gpu=True,
             max_steps=500,
             batch_size=8,
             max_encode_len=50,
             max_decode_len=50,
             learning_rate=5e-5,
             warmup_proportion=0.1,
             weight_decay=0.1,
             noise_prob=0,
             label_smooth=0,
             beam_width=5,
             length_penalty=1.0,
             log_interval=100,
             save_interval=200):
    """
    Finetune with the specified dataset.

    Args:
        train_path(str): the train dataset path.
        dev_path(str): the dev dataset path.
        save_dir(str): the model params and dev dataset predict result save path.
        init_ckpt_path(str): incremental training load path.
        use_gpu(bool): use gpu or not.
        max_steps(int): max training steps.
        batch_size(int): the batch size.
        max_encode_len(int): the max encode length.
        max_decode_len(int): the max decode length.
        learning_rate(float): the learning rate.
        warmup_proportion(float): the warmup proportion.
        weight_decay(float): the weight decay magnitude.
        noise_prob(float): the noise probability. see the ernie gen paper for details.
        label_smooth(float): the label smooth magnitude.
        beam_width(int): the beam size during evaluating the dev dataset.
        length_penalty(float): the length penalty during evaluating the dev dataset.
        log_interval(int): the log interval.
        save_interval(int): the save interval. dev set will be evaluated after saving.

    Return:
        result(dict): A Dictionary of shape::
            {
                last_save_path(str): last model save path.
                last_ppl(float): last model ppl.
            }
    """
    self.max_encode_len = max_encode_len
    self.max_decode_len = max_decode_len
    self.noise_prob = noise_prob

    place = F.CUDAPlace(0) if use_gpu else F.CPUPlace()

    with F.dygraph.guard(place):
        if init_ckpt_path is not None:
            logger.info('loading checkpoint from %s' % init_ckpt_path)
            sd, _ = D.load_dygraph(init_ckpt_path)
            self.model.set_dict(sd)

        feature_column = propeller.data.FeatureColumns([
            propeller.data.LabelColumn('id'),
            propeller.data.TextColumn('src',
                                      unk_id=self.tokenizer.unk_id,
                                      vocab_dict=self.tokenizer.vocab,
                                      tokenizer=self.tokenizer.tokenize),
            propeller.data.TextColumn('tgt',
                                      unk_id=self.tokenizer.unk_id,
                                      vocab_dict=self.tokenizer.vocab,
                                      tokenizer=self.tokenizer.tokenize),
        ])

        train_ds = feature_column.build_dataset(
            'train', data_file=train_path, shuffle=False,
            repeat=True, use_gz=False) \
            .map(self._map_fn).shuffle(10000).padded_batch(batch_size) \
            .map(self._after_padding)
        train_ds.data_shapes = [[None, None]] * 7 + \
            [[None, None, None]] * 3 + [[None]]
        train_ds.data_types = ['int64'] * 11

        if dev_path:
            dev_ds = feature_column.build_dataset(
                'dev', data_file=dev_path, shuffle=False,
                repeat=False, use_gz=False) \
                .map(self._map_fn) \
                .padded_batch(1) \
                .map(self._after_padding)
            dev_ds.data_shapes = [[None, None]] * 7 + \
                [[None, None, None]] * 3 + [[None]]
            dev_ds.data_types = ['int64'] * 11

        vocab_size, _ = self.model.word_emb.weight.shape
        g_clip = F.clip.GradientClipByGlobalNorm(1.0)
        opt = AdamW(learning_rate=LinearDecay(
            learning_rate, int(warmup_proportion * max_steps), max_steps),
            parameter_list=self.model.parameters(),
            weight_decay=weight_decay,
            grad_clip=g_clip)

        loss = None
        save_path = None
        ppl = None

        if save_dir and not os.path.exists(save_dir):
            os.makedirs(save_dir)
        for step, data in enumerate(train_ds.start(place)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels) = data

            _, __, info = self.model(src_ids,
                                     sent_ids=src_sids,
                                     pos_ids=src_pids,
                                     attn_bias=mask_src_2_src,
                                     encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = self.model(tgt_ids,
                                     sent_ids=tgt_sids,
                                     pos_ids=tgt_pids,
                                     attn_bias=mask_tgt_2_srctgt,
                                     past_cache=(cached_k, cached_v),
                                     encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            if label_smooth > 0.:
                tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                            epsilon=label_smooth)
            loss, _, __ = self.model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=L.where(attn_ids == self.tokenizer.vocab['[MASK]']))

            loss.backward()
            opt.minimize(loss)
            self.model.clear_gradients()

            if step % log_interval == 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                logger.info(
                    '[step %d / %d]train loss %.5f, ppl %.5f, elr %.3e' %
                    (step, max_steps, loss_np, ppl, opt.current_step_lr()))
            if save_dir and step % save_interval == 0 and step > 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                save_name = "step_%s_ppl_%.5f" % (step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                F.save_dygraph(self.model.state_dict(), save_path)

                if dev_path:
                    logger.info('evaluating...')
                    res = self._evaluate(dev_ds, place, beam_width,
                                         length_penalty)
                    output_path = os.path.join(
                        save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                    logger.info('save the predict result in %s' % output_path)
                    with open(output_path, 'w') as fout:
                        fout.write('\n'.join(res))
            if step > max_steps:
                break

        if loss:
            loss_np = loss.numpy()
            ppl = np.exp(loss_np)
            logger.info('[final step %d]train loss %.5f, ppl %.5f, elr %.3e' %
                        (step, loss_np, ppl, opt.current_step_lr()))
            if save_dir:
                save_name = "step_%s_ppl_%.5f" % (step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                F.save_dygraph(self.model.state_dict(), save_path)

                if dev_path:
                    logger.info('evaluating...')
                    res = self._evaluate(dev_ds, place, beam_width,
                                         length_penalty)
                    output_path = os.path.join(
                        save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                    logger.info('save the predict result in %s' % output_path)
                    with open(output_path, 'w') as fout:
                        fout.write('\n'.join(res))

        result = {
            "last_save_path": "%s.pdparams" % save_path,
            "last_ppl": ppl[0],
        }

        return result
def test_bad_x():
    label = fluid.layers.data(name="label",
                              shape=[4],
                              append_batch_size=False,
                              dtype="float32")
    one_hot_label = fluid.one_hot(input=label, depth=4)
def focal_loss(logits, label, gamma=1):
    probs = L.softmax(logits, axis=-1)
    one_hot = F.one_hot(label, depth=probs.shape[-1])
    loss = -L.reduce_sum(
        one_hot * ((1.0 - probs) ** gamma) * L.log(probs)) / logits.shape[0]
    return loss
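# NumPy form of the focal loss above: the usual cross-entropy term is scaled
# by (1 - p_y)^gamma, so well-classified samples (p_y near 1) are
# down-weighted.
import numpy as np

def focal_loss_numpy(probs, label, gamma=1.0):
    p_y = probs[np.arange(len(label)), label]
    return np.mean(-((1.0 - p_y) ** gamma) * np.log(p_y))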
def _init_train(self):

    instances = self.instances
    Backbone = self.Backbone
    bb_conf = self.bb_conf
    bb_name = self.bb_name
    dev_count = self.dev_count
    num_instances = len(instances)
    mrs = self.mrs

    # set first_target/main task instance
    main_inst = None
    for inst in instances:
        if inst.is_target:
            main_inst = inst
            inst.is_first_target = True
            break
    main_conf = main_inst.config
    if not os.path.exists(main_conf['save_path']):
        os.makedirs(main_conf['save_path'])
        os.makedirs(os.path.join(main_conf['save_path'], 'ckpt'))

    # prepare backbone
    train_backbone = Backbone(bb_conf, phase='train')
    pred_backbone = Backbone(bb_conf, phase='pred')

    # create reader and task,
    # then check i/o across reader, backbone and task_layer
    task_attrs = []
    pred_task_attrs = []
    for inst in instances:
        train_reader = inst.Reader(inst.config, phase='train')
        inst.reader['train'] = train_reader
        train_parad = inst.Paradigm(inst.config, phase='train',
                                    backbone_config=bb_conf)
        inst.task_layer['train'] = train_parad
        task_attr_from_reader = _encode_inputs(
            train_parad.inputs_attrs['reader'], inst.name)
        task_attrs.append(task_attr_from_reader)

        _check_io(train_backbone.inputs_attr, train_reader.outputs_attr,
                  in_name=bb_name + '_backbone', out_name='reader.train')
        _check_io(train_parad.inputs_attrs['reader'],
                  train_reader.outputs_attr,
                  in_name='task_paradigm.train.reader',
                  out_name='reader.train')
        _check_io(train_parad.inputs_attrs['backbone'],
                  train_backbone.outputs_attr,
                  in_name='task_paradigm.train.backbone',
                  out_name=bb_name + '_backbone')

        if inst.is_target:
            if 'pred_file' not in inst.config:
                inst.config['pred_file'] = ''
            pred_reader = inst.Reader(inst.config, phase='pred')
            pred_parad = inst.Paradigm(inst.config, phase='pred',
                                       backbone_config=bb_conf)
            inst.task_layer['pred'] = pred_parad
            task_attr_from_reader = _encode_inputs(
                pred_parad.inputs_attrs['reader'], inst.name)
            pred_task_attrs.append(task_attr_from_reader)
            _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr,
                      in_name=bb_name + '_backbone', out_name='reader.pred')
            _check_io(pred_parad.inputs_attrs['reader'],
                      pred_reader.outputs_attr,
                      in_name='task_paradigm.pred.reader',
                      out_name='reader.pred')
            _check_io(pred_parad.inputs_attrs['backbone'],
                      pred_backbone.outputs_attr,
                      in_name='task_paradigm.pred.backbone',
                      out_name=bb_name + '_backbone')

    # merge reader input attrs from backbone and task_instances
    joint_input_names, joint_shape_and_dtypes, name_to_position = \
        merge_input_attrs(train_backbone.inputs_attr, task_attrs)
    pred_joint_input_names, pred_joint_shape_and_dtypes, _ = \
        merge_input_attrs(pred_backbone.inputs_attr, pred_task_attrs,
                          insert_taskid=False, insert_batchsize=False,
                          insert_seqlen=False,
                          insert_batchsize_x_seqlen=False)
    # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]

    if DEBUG:
        print('----- for debug -----')
        print('joint input names:')
        print(joint_input_names)
        print('joint input shape and dtypes:')
        print(joint_shape_and_dtypes)

    # load data
    for inst in instances:
        print(inst.name + ": preparing data...", end='')
        inst.reader['train'].load_data()
        print('ok!')

    # merge dataset iterators and create net input vars
    iterators = []
    prefixes = []
    mrs = []
    for inst in instances:
        iterators.append(inst.reader['train'].iterator())
        prefixes.append(inst.name)
        mrs.append(inst.mix_ratio)

    joint_iterator_fn = create_joint_iterator_fn(
        iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position,
        dev_count=dev_count, verbose=VERBOSE, return_type='dict')
    self._joint_iterator_fn = joint_iterator_fn

    input_attrs = [[i, j, k] for i, (j, k) in
                   zip(joint_input_names, joint_shape_and_dtypes)]
    pred_input_attrs = [[i, j, k] for i, (j, k) in
                        zip(pred_joint_input_names,
                            pred_joint_shape_and_dtypes)]
    # net_inputs = create_net_inputs(input_attrs, async=True,
    #                                iterator_fn=joint_iterator_fn,
    #                                dev_count=dev_count, n_prefetch=3)
    net_inputs = create_net_inputs(input_attrs, async=False)
    self._net_inputs = net_inputs

    # build backbone and task layers
    train_prog = fluid.default_main_program()
    train_init_prog = fluid.default_startup_program()
    bb_output_vars = train_backbone.build(net_inputs,
                                          scope_name='__paddlepalm_')
    assert sorted(bb_output_vars.keys()) == \
        sorted(train_backbone.outputs_attr.keys())

    pred_prog = fluid.Program()
    pred_init_prog = fluid.Program()

    with fluid.program_guard(main_program=pred_prog,
                             startup_program=pred_init_prog):
        pred_net_inputs = create_net_inputs(pred_input_attrs)
        pred_bb_output_vars = pred_backbone.build(
            pred_net_inputs, scope_name='__paddlepalm_')

    fluid.framework.switch_main_program(train_prog)
    fluid.framework.switch_startup_program(train_init_prog)

    task_output_vars = {}
    for inst in instances:
        task_inputs = {'backbone': bb_output_vars}
        task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
        task_inputs['reader'] = task_inputs_from_reader

        scope = inst.task_reuse_scope + '/'
        with fluid.unique_name.guard(scope):
            output_vars = inst.build_task_layer(task_inputs, phase='train',
                                                scope=scope)
            output_vars = {inst.name + '/' + key: val
                           for key, val in output_vars.items()}
            old = len(task_output_vars)  # for debug
            task_output_vars.update(output_vars)
            assert len(task_output_vars) - old == len(output_vars)  # for debug

        # prepare predict vars for saving inference model
        if inst.is_target:
            with fluid.program_guard(pred_prog, pred_init_prog):
                cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
                inst.pred_input = cur_inputs
                pred_task_inputs = {'backbone': pred_bb_output_vars,
                                    'reader': cur_inputs}
                scope = inst.task_reuse_scope + '/'
                with fluid.unique_name.guard(scope):
                    inst.build_task_layer(pred_task_inputs, phase='pred',
                                          scope=scope)

    bb_fetches = {k: v.name for k, v in bb_output_vars.items()}
    task_fetches = {k: v.name for k, v in task_output_vars.items()}
    fetches = task_fetches
    fetches['__task_id'] = net_inputs['__task_id'].name

    # compute loss: select the active task's loss via its one-hot task id
    task_id_var = net_inputs['__task_id']
    task_id_vec = fluid.one_hot(task_id_var, num_instances)
    losses = fluid.layers.concat(
        [task_output_vars[inst.name + '/loss'] for inst in instances], axis=0)
    loss = layers.reduce_sum(task_id_vec * losses)

    main_reader = main_inst.reader['train']

    num_examples = main_reader.num_examples
    for inst in instances:
        max_train_steps = int(
            main_conf['num_epochs'] * inst.mix_ratio *
            (num_examples // main_conf['batch_size'] // dev_count))
        if inst.is_target:
            print('{}: expected train steps {}.'.format(inst.name,
                                                        max_train_steps))
        inst.steps_pur_epoch = inst.reader['train'].num_examples // \
            main_conf['batch_size'] // dev_count
        inst.expected_train_steps = max_train_steps

    global_max_train_steps = int(
        main_conf['num_epochs'] * sum(mrs) *
        (num_examples // main_conf['batch_size'] // dev_count))
    print('Estimated overall train steps {}.'.format(global_max_train_steps))

    if 'warmup_proportion' in main_conf and \
            main_conf['warmup_proportion'] > 0:
        warmup_steps = int(global_max_train_steps *
                           main_conf['warmup_proportion'])
        print('Warmup steps: ' + str(warmup_steps))
    else:
        warmup_steps = 0

    # build optimizer
    if 'optimizer' in main_conf:
        optim_mod = importlib.import_module(
            OPTIMIZER_DIR + '.' + main_conf['optimizer'])
        optimize = getattr(optim_mod, OPTIMIZE_METHOD)
        optimize(loss, main_conf, max_train_steps, warmup_steps,
                 fluid.default_main_program())

        loss.persistable = True
        if main_conf.get('use_ema', False):
            assert 'ema_decay' in main_conf, \
                "ema_decay should be set when use_ema is enabled."
            ema = fluid.optimizer.ExponentialMovingAverage(
                main_conf['ema_decay'])
            ema.update()

    # prepare for train
    self.train_backbone = train_backbone
    self.train_program = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
    self.saver_program = fluid.default_main_program()

    self.main_inst = main_inst
    self.fetches = fetches
    self.has_init_train = True
    self.has_init_pred = True

    self.exe.run(fluid.default_startup_program())
    print("\nRandomly initialize parameters...\n")
input_layer7, out_logits7 = model7.x2paddle_net(input=adv_image)
out7 = fluid.layers.softmax(out_logits7[0])

model8 = models.__dict__[model_name8]()
input_layer8, out_logits8 = model8.x2paddle_net(input=adv_image)
out8 = fluid.layers.softmax(out_logits8[0])

model9 = models.__dict__[model_name9]()
input_layer9, out_logits9 = model9.x2paddle_net(input=adv_image)
out9 = fluid.layers.softmax(out_logits9[0])

place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

one_hot_label = fluid.one_hot(input=label, depth=121)
one_hot_label2 = fluid.one_hot(input=label2, depth=121)
smooth_label = fluid.layers.label_smooth(label=one_hot_label,
                                         epsilon=0.1,
                                         dtype="float32")[0]
smooth_label2 = fluid.layers.label_smooth(label=one_hot_label2,
                                          epsilon=0.1,
                                          dtype="float32")[0]

# Try three loss functions.
# The first one:
loss_logp = -1 * fluid.layers.log(1 - fluid.layers.matmul(
    out1, one_hot_label[0], transpose_y=True)) \
    - 1 * fluid.layers.log(1 - fluid.layers.matmul(
        out2, one_hot_label[0], transpose_y=True)) \
def beam_search():
    """Beam search function"""

    max_len = layers.fill_constant(shape=[1], dtype=start_tokens.dtype,
                                   value=self.max_out_len, force_cpu=True)
    min_len = layers.fill_constant(shape=[1], dtype=start_tokens.dtype,
                                   value=self.min_out_len)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-INF)
    step_idx = layers.fill_constant(shape=[1], dtype=start_tokens.dtype,
                                    value=0, force_cpu=True)
    step_next_idx = layers.fill_constant(shape=[1], dtype=start_tokens.dtype,
                                         value=1, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx)
    scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwritten at each step.
    # caches contains states of history steps in decoder self-attention
    # and static encoder output projections in encoder-decoder attention
    # to reduce redundant computation.
    caches = [
        {
            "k":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "v":  # for self attention
            layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                dtype=enc_words_output.dtype,
                value=0),
            "static_k_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_v_word":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_words_output.dtype),
            "static_k_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype),
            "static_v_sent":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_sents_output.dtype)
        } for i in range(self._dec_n_layer)
    ]

    trigram_blocking = TrigramBlocking(start_tokens, self.tokenizer,
                                       use_fp16=self._use_fp16,
                                       beam_size=self.beam_size)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
        # inplace reshape here which actually changes the shape of pre_ids.
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_words_attn_bias = layers.gather(tgt_src_words_attn_bias,
                                                index=parent_idx)
        pre_src_sents_attn_bias = layers.gather(tgt_src_sents_attn_bias,
                                                index=parent_idx)
        pre_graph_attn_bias = layers.gather(graph_attn_bias, index=parent_idx)
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_sents_attn_bias,  # can't use lod tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)

        logits = self.decode(
            dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                       pre_src_sents_attn_bias, pre_graph_attn_bias),
            enc_words_output=enc_words_output,
            enc_sents_output=enc_sents_output,
            caches=caches,
            gather_idx=parent_idx)

        # prevent generating end token if length less than min_out_len
        eos_index = layers.fill_constant(shape=[layers.shape(logits)[0]],
                                         dtype='int64',
                                         value=self.eos_idx)
        eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(logits, eos_val, axis=0)

        # topK reduction across beams, also contains special handling of
        # end beams and end sentences (batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)

        # Roll back previous scores for the length penalty: previous scores
        # have already been length-penalized, so before applying this
        # timestep's penalty we undo the old one. Because of this, we store
        # the length-penalized score in `scores` while calculating with the
        # un-penalized score.
        # -> safe for step_idx == 0 (initialization state), because the
        #    previous score is 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.len_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)

        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(
                func=trigram_blocking.blocking_forward,
                x=[trigram_blocking.cand_seq,
                   trigram_blocking.id2is_full_token],
                out=trigram_blocking.delta_score_out,
                backward_func=None)
            layers.Print(trigram_blocking.delta_score_out, summarize=100,
                         message="trigram_blocking.delta_score_out")
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)
        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.len_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)

        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_idx,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states (caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
        layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
        layers.assign(pre_graph_attn_bias, graph_attn_bias)

        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)
    return finished_ids, finished_scores
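# The roll-back arithmetic above in NumPy: stored scores are length-penalized
# (GNMT penalty lp(t) = ((5 + t) / 6) ** alpha), so the raw log-prob is
# recovered by multiplying with lp(t) before adding the new token's log-prob
# and re-dividing by lp(t + 1).
import numpy as np

def lp(t, alpha):
    return ((5.0 + t) / 6.0) ** alpha

def rescore(pre_score, step_log_prob, t, alpha):
    raw = pre_score * lp(t, alpha)            # undo the previous penalty
    return (raw + step_log_prob) / lp(t + 1, alpha)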
def fast_decode(self):
    """create model for inference"""
    if self.task_type == "dialog":
        emb_num = 4
    else:
        emb_num = 3
    input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                   [[-1, self.max_seq_len, self.max_seq_len]]
    input_dtypes = ['int64'] * emb_num + ['float32']
    input_lod_levels = [0] * emb_num + [0]

    shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1],
                             [-1, 1, self.max_seq_len], [-1, 1]]
    dtypes = input_dtypes + ['int64', 'int64', 'float32', 'int32',
                             'float32', 'int64']
    lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

    inputs = self.to_tensor(shapes, dtypes, lod_levels)
    pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                  capacity=70,
                                                  iterable=False)

    emb_ids = {}
    for key, value in zip(self.emb_keys, inputs[:emb_num]):
        emb_ids[key] = value
    input_mask = inputs[emb_num]
    tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = \
        inputs[-6:]

    unimo = UNIMOModel(emb_ids=emb_ids,
                       input_mask=input_mask,
                       config=self.gene_config,
                       task_type=self.task_type,
                       decoding=True,
                       gather_idx=parent_idx)

    max_len = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype,
                                   value=self.max_out_len, force_cpu=True)
    min_len = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype,
                                   value=self.min_out_len, force_cpu=True)
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
    step_idx = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype,
                                    value=0, force_cpu=True)
    step_next_idx = layers.fill_constant(shape=[1], dtype=tgt_ids.dtype,
                                         value=1, force_cpu=True)

    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)

    ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
    pos_biases = layers.array_write(tgt_pos, step_idx)
    scores = layers.array_write(init_scores, step_idx)
    tgt_masks = layers.array_write(tgt_input_mask, step_idx)

    trigram_blocking = TrigramBlocking(tgt_ids, self.tokenizer,
                                       beam_size=self.beam_size)

    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)

        def gen_batch_like(value, dtype="int64", shape=[-1, 1, 1],
                           is_scalar=True):
            """generate batch"""
            if is_scalar:
                return layers.fill_constant_batch_size_like(
                    input=parent_idx, value=value, shape=shape, dtype=dtype)
            else:
                return layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=parent_idx, value=1, shape=shape, dtype=dtype),
                    y=value,
                    axis=0)

        tmp_mask = layers.array_read(tgt_masks, i=step_idx)
        tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
        append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
        pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

        pre_pos = gen_batch_like(step_idx, is_scalar=False)
        pre_pos = pre_pos + pos_bias  # pos start from 2
        pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype)

        dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos}
        if self.task_type == "dialog":
            role_ids = gen_batch_like(0)
            turn_ids = gen_batch_like(0)
            dec_emb_ids["role_embedding"] = role_ids
            dec_emb_ids["turn_embedding"] = turn_ids
        else:
            dec_emb_ids["sent_embedding"] = pre_sent

        dec_out = unimo.encode(emb_ids=dec_emb_ids,
                               input_mask=pre_mask,
                               gather_idx=parent_idx)
        fc_out = self.cal_logit(dec_out, None)

        # prevent generating end token if length less than min_out_len
        eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]],
                                         dtype='int64',
                                         value=self.eos_id)
        eos_index = fluid.one_hot(eos_index, depth=self.vocab_size)
        less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                dtype='float32')
        less_val = layers.elementwise_mul(less_cond, neg_inf)
        eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
        revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0)

        # topK reduction across beams, also contains special handling of
        # end beams and end sentences (batch reduction)
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(revised_logits), k=self.beam_size)

        # Roll back previous scores for the length penalty: previous scores
        # have already been length-penalized, so before applying this
        # timestep's penalty we undo the old one. Because of this, we store
        # the length-penalized score in `scores` while calculating with the
        # un-penalized score.
        # -> safe for step_idx == 0 (initialization state), because the
        #    previous score is 0
        pre_timestep_length_penalty = fluid.layers.pow(
            ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
            self.length_penalty)
        pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
            pre_scores, pre_timestep_length_penalty)

        # calc trigram-blocking delta scores for current alive sequence
        if self.block_trigram:
            trigram_blocking.update_seq(pre_ids, parent_idx)
            trigram_blocking.expand_cand_seq(topk_indices)
            fluid.layers.py_func(
                func=trigram_blocking.blocking_forward,
                x=[trigram_blocking.cand_seq,
                   trigram_blocking.id2is_full_token],
                out=trigram_blocking.delta_score_out,
                backward_func=None)
            pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                x=trigram_blocking.delta_score_out,
                y=pre_scores_wo_len_penalty,
                axis=0)
        # => [N, topk]
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores_wo_len_penalty,
                                             axis=0)
        cur_timestep_length_penalty = layers.pow(
            ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
            self.length_penalty)
        curr_scores = layers.elementwise_div(accu_scores,
                                             cur_timestep_length_penalty)

        # beam_search op uses lod to differentiate branches.
        curr_scores = layers.lod_reset(curr_scores, pre_ids)
        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=curr_scores,
            beam_size=self.beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)

        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.increment(x=step_next_idx, value=1.0, in_place=True)
        # cell states (caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_masks)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)

        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=self.beam_size, end_id=self.eos_id)

    graph_vars = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "data_ids": data_ids
    }

    for k, v in graph_vars.items():
        v.persistable = True

    return pyreader, graph_vars