def test_norm(self):
    """Compare dygraph InstanceNorm output against a NumPy reference.

    Builds identity affine parameters (scale=1, bias=0) so the check
    isolates the normalization itself, then runs on every configured place.
    """
    self.init_test_case()
    inputs = np.random.random((2, 3, 5, 5)).astype(np.float32)
    shape = inputs.shape
    n, c, h, w = shape[0], shape[1], shape[2], shape[3]
    scale_shape = [c]
    # Instance norm keeps one mean/variance per (sample, channel) pair.
    mean_shape = [n * c]
    scale = np.ones(scale_shape).astype(np.float32)
    bias = np.zeros(scale_shape).astype(np.float32)
    mean, variance = _cal_mean_variance(inputs, self.epsilon, mean_shape)
    out_np, _, _ = _reference_instance_norm_naive(inputs, scale, bias,
                                                  self.epsilon, mean,
                                                  variance)
    for place in self.places:
        with fluid.dygraph.guard(place):
            instance_norm = fluid.dygraph.InstanceNorm(3,
                                                       param_attr=True,
                                                       bias_attr=True)
            outputs = instance_norm(to_variable(inputs))
            self.assertTrue(
                np.allclose(outputs.numpy(), out_np, atol=1e-6))
def validation():
    """Run inference on a single example with the trained regressor.

    Loads the saved 'LR_model' checkpoint, predicts on one sample from the
    housing dataset, de-normalizes the prediction and prints it next to the
    ground-truth label.  Relies on module-level ``max_values``/``min_values``/
    ``avg_values`` computed during preprocessing.
    """
    # Enter the dygraph (imperative) execution context.
    with dygraph.guard():
        # Instantiate the linear-regression model defined elsewhere.
        model = Regressor("Regressor")
        # Load the saved parameters; the argument is the checkpoint path.
        model_dict, _ = fluid.load_dygraph('LR_model')
        model.load_dict(model_dict)
        # Switch to evaluation mode once, after parameters are loaded.
        # (The original called eval() both before and after loading — the
        # first call was redundant and its comment wrongly said it enabled
        # *training* mode.)
        model.eval()
        # Load a single test sample; the argument is the dataset path.
        test_data, label = load_one_example('./work/housing.data')
        # Convert the NumPy input into a dygraph variable.
        test_data = dygraph.to_variable(test_data)
        results = model(test_data)
        # Undo the min-max normalization applied during training.
        results = results * (max_values[-1] - min_values[-1]) + avg_values[-1]
        print("Inference result is {}, the corresponding label is {}".format(
            results.numpy(), label))
def get_part_mask(densepose_map): """ Obtain mask of different body parts of humans. This is done by looking at the body part map from DensePose. Args: densepose_map (NxCxHxW tensor): DensePose map. Returns: mask (NxKxHxW tensor): Body part mask, where K is the number of parts. """ # Group of body parts. Each group contains IDs of body labels in DensePose. # The 9 groups here are: background, torso, hands, feet, upper legs, lower legs, # upper arms, lower arms, head. part_groups = [[0], [1, 2], [3, 4], [5, 6], [7, 9, 8, 10], [11, 13, 12, 14], [15, 17, 16, 18], [19, 21, 20, 22], [23, 24]] n_parts = len(part_groups) densepose_map = densepose_map.numpy() need_reshape = len(densepose_map.shape) == 4 if need_reshape: bo, t, h, w = densepose_map.shape densepose_map = np.reshape(densepose_map, (-1, h, w)) b, h, w = densepose_map.shape part_map = (densepose_map / 2 + 0.5) * 24 assert np.all(part_map >= 0) and np.all(part_map < 25) mask = np.zeros((b, n_parts, h, w)).astype("bool") for i in range(n_parts): for j in part_groups[i]: # Account for numerical errors. mask[:, i] = np.logical_or( mask[:, i], np.logical_and((part_map > j - 0.1), (part_map < j + 0.1))) if need_reshape: mask = np.reshape(mask, (bo, t, -1, h, w)) mask = dg.to_variable(mask.astype("float32")) return mask
def func_test_buffer_not_persistable_assign(self):
    """Check buffer bookkeeping when re-assigning a non-persistable buffer.

    Assigning None removes the buffer; re-assigning a Tensor restores it;
    assigning a Parameter converts the attribute into a parameter instead.
    """
    with fluid.dygraph.guard():
        net = fluid.Layer()
        var1 = to_variable(np.zeros([1]))
        net.register_buffer("buffer_name", var1, persistable=False)

        # Assigning Nones will remove the buffer, but allow to re-assign
        # to remark it as buffer.
        net.buffer_name = None
        self.assertEqual(len(net.buffers()), 0)
        self.assertEqual(len(net.state_dict()), 0)
        net.buffer_name = var1
        self.assertEqual(len(net.buffers()), 1)
        # Non-persistable buffers never appear in state_dict.
        self.assertEqual(len(net.state_dict()), 0)

        # Re-assign a ParamBase will remove the buffer.
        if in_dygraph_mode():
            net.buffer_name = EagerParamBase([2, 2], 'float32')
        else:
            net.buffer_name = ParamBase([2, 2], 'float32')
        self.assertEqual(len(net.buffers()), 0)
        # The parameter, unlike the buffer, does show up in state_dict.
        self.assertEqual(len(net.state_dict()), 1)
def infer(model, infer_data, max_seq_len=300, is_tensor=True, logits_softmax=True): """ 用dygraph模型预测 [IN] model: dygraph模型结构 infer_data: list[(input1[, input2, ...])], 待预测数据 max_seq_len: int, 最大长度 is_tensor: boolean, true则infer_data已经是paddle可处理的tensor logits_softmax: boolean, true则预测结果为softmax后的logits [OUT] pred: list[float], 预测结果 """ # 在这个with域内ernie不会进行梯度计算; with D.base._switch_tracer_mode_guard_(is_train=False): # 控制模型进入eval模式,这将会关闭所有的dropout; model.eval() # 如果infer_data没有转tensor 则转为paddle接收的tensor if not is_tensor: infer_data = D.to_variable(np.array(infer_data)) logits = model(infer_data, logits_softmax=logits_softmax) # TODO: 返回rate值 pred = L.argmax(logits, -1).numpy() # 进入train模式 model.train() return pred
def paddle_nn_layer(self):
    """Run the Conv2DTranspose configuration under test in dygraph mode.

    Returns the layer output as a NumPy array for comparison with a
    reference implementation.
    """
    x_var = dg.to_variable(self.input)
    # output_size is dropped when an explicit output_padding is used —
    # presumably because Conv2DTranspose treats them as mutually
    # exclusive ways of fixing the output shape (TODO confirm).
    if self.output_padding != 0:
        output_size = None
    else:
        output_size = self.output_size
    conv = nn.Conv2DTranspose(self.num_channels,
                              self.num_filters,
                              self.filter_size,
                              padding=self.padding,
                              output_padding=self.output_padding,
                              stride=self.stride,
                              dilation=self.dilation,
                              groups=self.groups,
                              data_format=self.data_format)
    # Install the test fixture's deterministic weights/bias.
    conv.weight.set_value(self.weight)
    if not self.no_bias:
        conv.bias.set_value(self.bias)
    y_var = conv(x_var, output_size)
    y_np = y_var.numpy()
    return y_np
def func_test_register_buffer_with_error(self):
    """Verify register_buffer rejects invalid names, types and collisions.

    Covers: non-string name, Parameter instead of Tensor, '.' in name,
    empty name, and clashes with existing attributes/parameters.
    """
    with fluid.dygraph.guard():
        net = fluid.Layer()
        var = to_variable(np.zeros([1]))

        with self.assertRaisesRegexp(TypeError,
                                     "name of buffer should be a string"):
            net.register_buffer(12, var)

        # Parameters must not be registered as buffers.
        with self.assertRaisesRegexp(TypeError,
                                     "buffer should be a Paddle.Tensor"):
            if in_dygraph_mode():
                net.register_buffer("buffer_name",
                                    EagerParamBase([2, 2], 'float32'))
            else:
                net.register_buffer("buffer_name",
                                    ParamBase([2, 2], 'float32'))

        with self.assertRaisesRegexp(KeyError,
                                     "name of buffer can not contain"):
            net.register_buffer("buffer.name", var)

        with self.assertRaisesRegexp(KeyError,
                                     "name of buffer can not be empty"):
            net.register_buffer("", var)

        # A plain attribute with the same name blocks registration.
        net.attr_name = 10
        with self.assertRaisesRegexp(KeyError, "already exists"):
            net.register_buffer("attr_name", var)

        del net.attr_name
        # A parameter with the same name blocks registration as well.
        if in_dygraph_mode():
            net.attr_name = EagerParamBase([2, 2], 'float32')
        else:
            net.attr_name = ParamBase([2, 2], 'float32')
        with self.assertRaisesRegexp(KeyError, "already exists"):
            net.register_buffer("attr_name", var)
def forward(self, inputs):
    """Unroll the GRU over the time axis (axis 1) of `inputs`.

    Returns the hidden states of every step concatenated along axis 1;
    when `self.is_reverse` is set the sequence is processed (and the
    result re-ordered) back-to-front.
    """
    # Use `to_variable` to create a copy of global h_0 created not in `DynamicGRU`,
    # to avoid modify it because `h_0` is both used in other `DynamicGRU`.
    hidden = to_variable(self.h_0)
    hidden.stop_gradient = True
    res = []
    for i in range(inputs.shape[1]):
        if self.is_reverse:
            j = fluid.layers.shape(inputs)[1] - 1 - i
        else:
            # TODO(Aurelius84): In while block, if the var created in parent block
            # participates in the calculation of gradient, the result of gradient
            # is incorrect because each step scope always returns the same value
            # generated by last step. Here we add 0 to create `j` in while block to
            # avoid this bug, and working on fixing it in next PR.
            j = i + 0
        # FIXME(Aurelius84): see above explanation.
        hidden = fluid.layers.scale(hidden, 1)

        # See above explanation.
        # input_ = inputs[:, i:i+1, :]  # original code
        input_ = fluid.layers.slice(inputs,
                                    axes=[1],
                                    starts=[j],
                                    ends=[j + 1])
        input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]],
                                      inplace=False)
        hidden, reset, gate = self.gru_unit(input_, hidden)
        # Re-insert the time axis so the step outputs can be concatenated.
        hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]],
                                       inplace=False)
        res.append(hidden_)
    if self.is_reverse:
        res = res[::-1]
    res = fluid.layers.concat(res, axis=1)
    return res
def forward(self,
            tgt,
            memory,
            tgt_mask=None,
            memory_mask=None,
            pos=None,
            query_pos=None):
    """Run the decoder layer stack with `tgt` attending to `memory`.

    A boolean-style `memory_mask` is converted into an additive attention
    mask broadcast over heads; target self-attention masking is not
    implemented.  Returns either the stacked per-layer outputs (when
    `self.return_intermediate` is set) or the final output with a leading
    singleton axis.
    """
    output = tgt
    intermediate = []

    assert tgt_mask is None, "Not implement compute tgt_mask's attn_mask."
    if memory_mask is not None:
        bs, tgt_length = tgt.shape[:2]
        memory_length = memory.shape[1]
        # Build an additive mask: masked positions get -1e8 so their
        # attention weights vanish after softmax.
        attn_mask = L.zeros([bs, tgt_length, memory_length],
                            dtype="float32")
        memory_mask = L.expand(
            L.unsqueeze(memory_mask, [1]),
            (1, tgt_length, 1))  # [bs, tgt_length, memory_length]
        # Masking is done in NumPy because boolean assignment is easier there.
        attn_mask = attn_mask.numpy()
        memory_mask = memory_mask.numpy()
        attn_mask[memory_mask] = -1e8
        attn_mask = dg.to_variable(attn_mask)
        attn_mask = L.expand(
            L.unsqueeze(attn_mask, [1]),
            (1, self.nhead, 1, 1))  # [bs, nhead, tgt_length, memory_length]
        memory_mask = attn_mask

    for layer in self.layers:
        output = layer(output,
                       memory,
                       tgt_mask=tgt_mask,
                       memory_mask=memory_mask,
                       pos=pos,
                       query_pos=query_pos)
        if self.return_intermediate:
            intermediate.append(self.norm(output))

    if self.norm is not None:
        output = self.norm(output)
        if self.return_intermediate:
            # Replace the last stored entry with the normed final output.
            intermediate.pop()
            intermediate.append(output)

    if self.return_intermediate:
        return L.stack(intermediate)

    return L.unsqueeze(output, [0])
def evaluate(env):
    """Evaluate the saved model on the test dataset and log loss/metric/throughput."""
    args = env.args
    # Punctuation ids — presumably excluded from scoring by
    # epoch_evaluate (TODO confirm against its implementation).
    puncts = dygraph.to_variable(env.puncts, zero_copy=False)

    logging.info("Load the dataset")
    evaluates = Corpus.load(args.test_data_path, env.fields)
    dataset = TextDataset(evaluates, env.fields, args.buckets)
    # set the data loader
    dataset.loader = batchify(dataset, args.batch_size)
    logging.info(f"{len(dataset)} sentences, "
                 f"{len(dataset.loader)} batches, "
                 f"{len(dataset.buckets)} buckets")

    logging.info("Load the model")
    model = load(args.model_path)

    logging.info("Evaluate the dataset")
    start = datetime.datetime.now()
    loss, metric = epoch_evaluate(args, model, dataset.loader, puncts)
    total_time = datetime.datetime.now() - start
    logging.info(f"Loss: {loss:.4f} {metric}")
    logging.info(f"{total_time}s elapsed, "
                 f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
def __init__(self, dict_dim, batch_size, seq_len):
    """GRU text classifier: embedding -> DynamicGRU -> two FC layers -> softmax.

    Args:
        dict_dim: vocabulary size (embedding table has dict_dim + 1 rows).
        batch_size: fixed batch size, needed to pre-build the initial
            hidden state h_0.
        seq_len: sequence length (stored; not used in construction here).
    """
    super(GRU, self).__init__()
    self.dict_dim = dict_dim
    self.emb_dim = 128
    self.hid_dim = 128
    self.fc_hid_dim = 96
    self.class_dim = 2  # binary classification
    self.batch_size = batch_size
    self.seq_len = seq_len
    self.embedding = Embedding(
        size=[self.dict_dim + 1, self.emb_dim],
        dtype='float32',
        # Boosted learning rate for the (dense) embedding table.
        param_attr=fluid.ParamAttr(learning_rate=30),
        is_sparse=False)
    # Zero initial hidden state handed to the recurrent unit.
    h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
    h_0 = to_variable(h_0)
    # fc1 produces the 3*hid_dim pre-activations the GRU unit expects.
    self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
    self._fc2 = Linear(input_dim=self.hid_dim,
                       output_dim=self.fc_hid_dim,
                       act="tanh")
    self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
                                 output_dim=self.class_dim,
                                 act="softmax")
    self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0)
def compute_position_embedding(radians, speaker_position_rate):
    """Compute sin/cos interleaved matrix from the radians.

    Args:
        radians (Variable): shape(n_vocab, embed_dim), dtype float32, the radians matrix.
        speaker_position_rate (Variable): shape(B, ), speaker positioning rate.

    Returns:
        Variable: shape(B, n_vocab, embed_dim), the sin, cos interleaved matrix.
    """
    _, embed_dim = radians.shape
    batch_size = speaker_position_rate.shape[0]
    # Broadcast the per-speaker rate over (n_vocab, embed_dim).
    speaker_position_rate = F.unsqueeze(speaker_position_rate, [1, 2])
    scaled_radians = speaker_position_rate * radians

    # 1 on odd embedding dims, 0 on even dims: selects cos vs sin per dim.
    odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
    odd_mask = dg.to_variable(odd_mask)

    out = odd_mask * F.cos(scaled_radians) \
        + (1 - odd_mask) * F.sin(scaled_radians)
    # Row 0 is forced to all zeros — presumably the padding position;
    # TODO confirm against the embedding lookup that consumes this.
    out = F.concat(
        [F.zeros((batch_size, 1, embed_dim), radians.dtype), out[:, 1:, :]],
        axis=1)
    return out
def test_with_different_input(self):
    """Check declarative program caching: cache hits for same-shaped inputs,
    a new program only when non-tensor arguments change."""
    with fluid.dygraph.guard(fluid.CPUPlace()):
        x_data = np.ones([16, 10]).astype('float32')
        y_data = np.ones([10]).astype('float32') * 2
        z_data = np.ones([10]).astype('float32') * 2.2

        foo = declarative(foo_func)

        # [16, 10] + [10] (varbase)
        out_1 = foo(to_variable(x_data), to_variable(y_data))
        self.assertTrue(np.allclose(x_data + y_data, out_1.numpy()))
        self.assertTrue(len(foo.program_cache) == 1)
        self.assertTrue(len(foo.program_cache.concrete_programs()) == 1)
        first_program = foo.program_cache.last()

        # [16, 10] + [10] (numpy)
        out_2 = foo(to_variable(x_data), y_data)
        self.assertTrue(np.allclose(x_data + y_data, out_2.numpy()))
        self.assertTrue(len(foo.program_cache) == 1)

        # [16, 10] + [10] (numpy)
        out_3 = foo(to_variable(x_data), z_data)
        self.assertTrue(np.allclose(x_data + z_data, out_3.numpy()))
        # hit cache program
        self.assertTrue(len(foo.program_cache) == 1)

        # [16, 10] + [10] (numpy) with other different arguments (c=3)
        out_4 = foo(to_variable(x_data), z_data, 3)
        self.assertTrue(np.allclose(x_data + z_data, out_4.numpy()))
        # create a new program
        self.assertTrue(len(foo.program_cache) == 2)

        # test for recent program
        foo(to_variable(x_data), y_data)
        recent_program = foo.program_cache.last()
        self.assertTrue(first_program == recent_program)
def run_main(self, np_arr, place):
    """Round-trip a NumPy array through to_variable and assert equality."""
    with guard(place):
        tensor = to_variable(np_arr)
        round_tripped = tensor.numpy()
        self.assertTrue(np.array_equal(np_arr, round_tripped))
def train(to_static):
    """Train YOLOv3 on fake data for a fixed number of iterations.

    Args:
        to_static: whether to run through the dygraph-to-static
            ProgramTranslator (the test compares both modes).

    Returns:
        np.ndarray of the smoothed loss recorded each iteration.
    """
    program_translator = ProgramTranslator()
    program_translator.enable(to_static)
    random.seed(0)
    np.random.seed(0)
    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        # Fixed seeds so static and dygraph runs are comparable.
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        model = YOLOv3(3, is_train=True)

        # Piecewise-decayed learning rate with linear warmup.
        boundaries = cfg.lr_steps
        gamma = cfg.lr_gamma
        step_num = len(cfg.lr_steps)
        learning_rate = cfg.learning_rate
        values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

        lr = fluid.dygraph.PiecewiseDecay(boundaries=boundaries,
                                          values=values,
                                          begin=0)

        lr = fluid.layers.linear_lr_warmup(
            learning_rate=lr,
            warmup_steps=cfg.warm_up_iter,
            start_lr=0.0,
            end_lr=cfg.learning_rate,
        )

        optimizer = fluid.optimizer.Momentum(
            learning_rate=lr,
            regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
            momentum=cfg.momentum,
            parameter_list=model.parameters())

        start_time = time.time()
        snapshot_loss = 0
        snapshot_time = 0
        total_sample = 0

        input_size = cfg.input_size
        shuffle = True
        shuffle_seed = None
        total_iter = cfg.max_iter
        mixup_iter = total_iter - cfg.no_mixup_iter

        train_reader = FakeDataReader().reader()

        smoothed_loss = SmoothedValue()
        ret = []
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            # Each sample is (image, gt_box, gt_label, gt_score).
            img = np.array([x[0] for x in data]).astype('float32')
            img = to_variable(img)

            gt_box = np.array([x[1] for x in data]).astype('float32')
            gt_box = to_variable(gt_box)

            gt_label = np.array([x[2] for x in data]).astype('int32')
            gt_label = to_variable(gt_label)

            gt_score = np.array([x[3] for x in data]).astype('float32')
            gt_score = to_variable(gt_score)

            loss = model(img, gt_box, gt_label, gt_score, None, None)
            smoothed_loss.add_value(np.mean(loss.numpy()))
            snapshot_loss += loss.numpy()
            snapshot_time += start_time - prev_start_time
            total_sample += 1

            print("Iter {:d}, loss {:.6f}, time {:.5f}".format(
                iter_id, smoothed_loss.get_mean_value(),
                start_time - prev_start_time))
            ret.append(smoothed_loss.get_mean_value())

            loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

        return np.array(ret)
def train(args, fake_data_reader, to_static):
    """Train TSM-ResNet on fake data, logging loss and top-1/top-5 accuracy.

    Args:
        args: parsed command-line arguments (config path, use_gpu, ...).
        fake_data_reader: reader factory producing synthetic batches.
        to_static: whether to enable dygraph-to-static translation.

    Returns:
        list of [loss, acc1, acc5] values recorded every iteration.
    """
    program_translator = ProgramTranslator()
    program_translator.enable(to_static)

    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    random.seed(0)
    np.random.seed(0)
    with fluid.dygraph.guard(place):
        # Fixed seeds so static and dygraph runs are comparable.
        paddle.seed(1000)
        paddle.framework.random._manual_program_seed(1000)

        video_model = TSM_ResNet("TSM", train_config, 'Train')

        optimizer = create_optimizer(train_config.TRAIN,
                                     video_model.parameters())

        train_reader = fake_data_reader.create_reader()

        ret = []
        for epoch in range(train_config.TRAIN.epoch):
            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            for batch_id, data in enumerate(train_reader()):
                x_data = np.array([item[0] for item in data])
                y_data = np.array([item[1] for item in data]).reshape([-1, 1])

                imgs = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True
                outputs = video_model(imgs)
                loss = fluid.layers.cross_entropy(input=outputs,
                                                  label=labels,
                                                  ignore_index=-1)
                avg_loss = fluid.layers.mean(loss)
                acc_top1 = fluid.layers.accuracy(input=outputs,
                                                 label=labels,
                                                 k=1)
                acc_top5 = fluid.layers.accuracy(input=outputs,
                                                 label=labels,
                                                 k=5)

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                total_loss += avg_loss.numpy()[0]
                total_acc1 += acc_top1.numpy()[0]
                total_acc5 += acc_top5.numpy()[0]
                total_sample += 1

                print('TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.
                      format(epoch, batch_id,
                             avg_loss.numpy()[0],
                             acc_top1.numpy()[0], acc_top5.numpy()[0]))
                ret.extend([
                    avg_loss.numpy()[0],
                    acc_top1.numpy()[0],
                    acc_top5.numpy()[0]
                ])

            print(
                'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'
                .format(epoch, total_loss / total_sample,
                        total_acc1 / total_sample,
                        total_acc5 / total_sample))
        return ret
# 定义外层循环 for epoch_id in range(EPOCH_NUM): # 在每轮迭代开始之前,将训练数据的顺序随机的打乱 np.random.shuffle(training_data) # 将训练数据进行拆分,每个batch包含10条数据 mini_batches = [ training_data[k:k + BATCH_SIZE] for k in range(0, len(training_data), BATCH_SIZE) ] # 定义内层循环 for iter_id, mini_batch in enumerate(mini_batches): x = np.array(mini_batch[:, :-1]).astype('float32') # 获得当前批次训练数据 y = np.array(mini_batch[:, -1:]).astype('float32') # 获得当前批次训练标签(真实房价) # 将numpy数据转为飞桨动态图variable形式 house_features = dygraph.to_variable(x) prices = dygraph.to_variable(y) # 前向计算 predicts = model(house_features) # 计算损失 loss = fluid.layers.square_error_cost(predicts, label=prices) avg_loss = fluid.layers.mean(loss) if iter_id % 20 == 0: print("epoch: {}, iter: {}, loss is: {}".format( epoch_id, iter_id, avg_loss.numpy())) # 反向传播 avg_loss.backward() # 最小化loss,更新参数
def alignments(args):
    """Extract attention alignments from a trained TransformerTTS model.

    For every utterance in the LJSpeech-style metadata.csv, runs the model
    teacher-forced on the ground-truth mel spectrogram and pickles the
    resulting {fname: alignment} mapping to `args.output + '.txt'`.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(csv_path,
                            sep="|",
                            header=None,
                            quoting=csv.QUOTE_NONE,
                            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]

            # init input: text ids and 1-based text positions.
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            # Ground-truth mel spectrogram, transposed to (time, mels).
            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input),
                                               [0])
            mel_lens = mel_input.shape[1]

            # Causal decoder self-attention mask: upper-triangular positions
            # receive a large negative additive bias.
            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0),
                np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
def generate(self, texts, use_gpu=False, beam_width=5):
    """
    Get the continuation of the input poetry.

    Args:
         texts(list): the front part of a poetry.
         use_gpu(bool): whether use gpu to predict or not
         beam_width(int): the beam search width.

    Returns:
         results(list): the poetry continuations.
    """
    # Validate: a non-empty list of non-empty strings.
    if texts and isinstance(texts, list) and all(texts) and all(
        [isinstance(text, str) for text in texts]):
        predicted_data = texts
    else:
        raise ValueError(
            "The input texts should be a list with nonempty string elements."
        )
    for i, text in enumerate(texts):
        # Truncate over-long lines to the model's supported line length.
        if len(text) > self.line:
            logger.warning(
                'The input text: %s, contains more than %i characters, which will be cut off'
                % (text, self.line))
            texts[i] = text[:self.line]

        # Warn once per text about non-Chinese characters (CJK range check).
        for char in text:
            if not '\u4e00' <= char <= '\u9fff':
                logger.warning(
                    'The input text: %s, contains non-Chinese characters, which may result in magic output'
                    % text)
                break

    # GPU requires CUDA_VISIBLE_DEVICES to be set; otherwise fall back to CPU.
    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
        )
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with fluid.dygraph.guard(place):
        self.model.eval()
        results = []
        for text in predicted_data:
            sample_results = []
            ids, sids = self.tokenizer.encode(text)
            src_ids = D.to_variable(np.expand_dims(ids, 0))
            src_sids = D.to_variable(np.expand_dims(sids, 0))
            output_ids = beam_search_infilling(
                self.model,
                src_ids,
                src_sids,
                eos_id=self.tokenizer.sep_id,
                sos_id=self.tokenizer.cls_id,
                attn_id=self.tokenizer.vocab['[MASK]'],
                max_decode_len=80,
                max_encode_len=20,
                beam_width=beam_width,
                tgt_type_id=1)
            output_str = self.rev_lookup(output_ids[0].numpy())

            for ostr in output_str.tolist():
                # Keep only tokens before the first [SEP].
                if '[SEP]' in ostr:
                    ostr = ostr[:ostr.index('[SEP]')]
                sample_results.append("".join(ostr))
            results.append(sample_results)
    return results
# --- Training loop fragment: shuffle, batch, then fine-tune the model. ---
random.shuffle(train_features)
train_batch_data = batchify(train_features, args.bsz, args.max_seqlen)
if args.debug:
    # Dump the first batch for inspection.
    print(len(train_batch_data))
    print(train_batch_data[0])
    token_ids, seg_ids, labels = train_batch_data[0]
    for r1, r2, r3 in zip(token_ids, seg_ids, labels):
        print(r1)
        print(r2)
        print(r3)
        print(convert_ids_to_tokens(tokenizer.vocab, r1))
for step, d in enumerate(tqdm(train_batch_data, desc='training')):
    ids, sids, labels = d
    # print(ids.shape, sids.shape, labels.shape)
    ids, sids, labels = FD.to_variable(ids), FD.to_variable(
        sids), FD.to_variable(labels)
    loss, logits = model(ids, sids, labels=labels)
    if args.ohem_ratio > 0:
        # Online hard example mining: keep only the N largest per-sample
        # losses and average those.
        labels = L.reshape(labels, [-1, 1])
        loss = L.softmax_with_cross_entropy(logits, labels)
        N = int(args.bsz * args.ohem_ratio)
        top_loss = L.argsort(loss, axis=0)[0][-N:]
        if args.debug:
            print(loss)
            print(top_loss)
            print(N)
        loss = L.reduce_sum(top_loss) / N
    loss.backward()
    global_step += 1
    # NOTE(review): body of this periodic branch is outside this chunk.
    if step % 1000 == 0 and step > 0:
def scale_loss(self, loss):
    """
    Scale the loss. In data parallel mode, the loss should be scale with
    the number of trainers. If not in data parallel mode, return the loss
    directly.

    Args:
        loss(Variable): The loss of the current Model.

    Returns:
        Variable: the scaled loss.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. enable dynamic mode
                paddle.disable_static()

                # 2. initialize parallel environment
                dist.init_parallel_env()

                # 3. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 4. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss = dp_layer.scale_loss(loss)
                loss.backward()
                dp_layer.apply_collective_grads()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                # 1. start by ``paddle.distributed.spawn`` (default)
                dist.spawn(train, nprocs=2)
                # 2. start by ``paddle.distributed.launch``
                # train()
    """
    # Outside data-parallel mode no scaling is needed.
    if not self._is_data_parallel_mode():
        return loss

    # Divide by the trainer count so that averaged gradients across
    # trainers match single-process training.
    loss_scale = to_variable(
        np.array([self._strategy.nranks]).astype("float32"))
    loss_scale.stop_gradient = True
    loss = loss / loss_scale
    return loss
def train(env):
    """Train the dependency parser with early stopping on the dev metric.

    Loads train/dev/test corpora, trains up to args.epochs epochs, saves the
    model whenever the dev metric improves (after a warm-up period), stops
    after args.patience epochs without improvement, then reports the best
    epoch's test score.
    """
    args = env.args

    logging.info("loading data.")
    train = Corpus.load(args.train_data_path, env.fields)
    dev = Corpus.load(args.valid_data_path, env.fields)
    test = Corpus.load(args.test_data_path, env.fields)
    logging.info("init dataset.")
    train = TextDataset(train, env.fields, args.buckets)
    dev = TextDataset(dev, env.fields, args.buckets)
    test = TextDataset(test, env.fields, args.buckets)
    logging.info("set the data loaders.")
    train.loader = batchify(train, args.batch_size, args.use_data_parallel,
                            True)
    dev.loader = batchify(dev, args.batch_size)
    test.loader = batchify(test, args.batch_size)

    logging.info(f"{'train:':6} {len(train):5} sentences, "
                 f"{len(train.loader):3} batches, "
                 f"{len(train.buckets)} buckets")
    logging.info(f"{'dev:':6} {len(dev):5} sentences, "
                 f"{len(dev.loader):3} batches, "
                 f"{len(train.buckets)} buckets")
    logging.info(f"{'test:':6} {len(test):5} sentences, "
                 f"{len(test.loader):3} batches, "
                 f"{len(train.buckets)} buckets")

    logging.info("Create the model")
    model = Model(args, env.WORD.embed)

    # init parallel strategy
    if args.use_data_parallel:
        strategy = dygraph.parallel.prepare_context()
        model = dygraph.parallel.DataParallel(model, strategy)

    # Gradient clipping differs by device (per-tensor norm on GPU,
    # global norm on CPU).
    if args.use_cuda:
        grad_clip = fluid.clip.GradientClipByNorm(clip_norm=args.clip)
    else:
        grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.clip)
    decay = dygraph.ExponentialDecay(learning_rate=args.lr,
                                     decay_steps=args.decay_steps,
                                     decay_rate=args.decay)
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=decay,
        beta1=args.mu,
        beta2=args.nu,
        epsilon=args.epsilon,
        parameter_list=model.parameters(),
        grad_clip=grad_clip)

    total_time = datetime.timedelta()
    best_e, best_metric = 1, Metric()
    # Punctuation ids — presumably excluded from scoring by
    # epoch_evaluate (TODO confirm).
    puncts = dygraph.to_variable(env.puncts, zero_copy=False)

    logging.info("start training.")
    for epoch in range(1, args.epochs + 1):
        start = datetime.datetime.now()
        # train one epoch and update the parameter
        logging.info(f"Epoch {epoch} / {args.epochs}:")
        epoch_train(args, model, optimizer, train.loader, epoch)
        if args.local_rank == 0:
            loss, dev_metric = epoch_evaluate(args, model, dev.loader,
                                              puncts)
            logging.info(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = epoch_evaluate(args, model, test.loader,
                                               puncts)
            logging.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience // 10:
                best_e, best_metric = epoch, dev_metric
                save(args.model_path, args, model, optimizer)
                logging.info(f"{t}s elapsed (saved)\n")
            else:
                logging.info(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= args.patience:
                break
    if args.local_rank == 0:
        # Reload the best checkpoint and report its test score.
        model = load(args.model_path, model)
        loss, metric = epoch_evaluate(args, model, test.loader, puncts)

        logging.info(
            f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        logging.info(
            f"the score of test at epoch {best_e} is {metric.score:.2%}")
        logging.info(f"average time of each epoch is {total_time / epoch}s")
        logging.info(f"{total_time}s elapsed")
def __init__(self, args, vocab_size, num_labels, length=None):
    """Build the lexical-analysis network: embedding -> stacked BiGRU -> FC -> CRF.

    Args:
        args: hyper-parameter namespace (word_emb_dim, grnn_hidden_dim,
            bigru_num, batch_size, optional emb/crf learning rates, ...).
        vocab_size: size of the word vocabulary.
        num_labels: number of output tag labels.
        length: unused here; kept for interface compatibility.
    """
    super(lex_net, self).__init__()
    """
    define the lexical analysis network structure
    word: stores the input of the model
    for_infer: a boolean value, indicating if the model to be created is for training or predicting.

    return:
        for infer: return the prediction
        otherwise: return the prediction
    """
    self.word_emb_dim = args.word_emb_dim
    self.vocab_size = vocab_size
    self.num_labels = num_labels
    self.grnn_hidden_dim = args.grnn_hidden_dim
    # Fall back to 1.0 when the learning-rate attribute is absent from args.
    self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
        args) else 1.0
    # BUG FIX: this previously read args.emb_learning_rate even though it
    # checked for — and is meant to use — the CRF-specific learning rate.
    self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
        args) else 1.0
    self.bigru_num = args.bigru_num
    self.init_bound = 0.1
    #self.IS_SPARSE = True

    self.word_embedding = Embedding(
        size=[self.vocab_size, self.word_emb_dim],
        dtype='float32',
        #is_sparse=self.IS_SPARSE,
        param_attr=fluid.ParamAttr(learning_rate=self.emb_lr,
                                   name="word_emb",
                                   initializer=fluid.initializer.Uniform(
                                       low=-self.init_bound,
                                       high=self.init_bound)))

    # Zero initial hidden state shared by every BiGRU layer.
    h_0 = np.zeros((args.batch_size, self.grnn_hidden_dim), dtype="float32")
    h_0 = to_variable(h_0)

    # First layer consumes embeddings (hidden_dim wide); subsequent layers
    # consume the previous BiGRU's concatenated fwd+bwd output (2x wide).
    self.bigru_units = []
    for i in range(self.bigru_num):
        if i == 0:
            self.bigru_units.append(
                self.add_sublayer(
                    "bigru_units%d" % i,
                    BiGRU(self.grnn_hidden_dim,
                          self.grnn_hidden_dim,
                          self.init_bound,
                          h_0=h_0)))
        else:
            self.bigru_units.append(
                self.add_sublayer(
                    "bigru_units%d" % i,
                    BiGRU(self.grnn_hidden_dim * 2,
                          self.grnn_hidden_dim,
                          self.init_bound,
                          h_0=h_0)))

    # Emission projection from BiGRU features to label scores.
    self.fc = Linear(input_dim=self.grnn_hidden_dim * 2,
                     output_dim=self.num_labels,
                     param_attr=fluid.ParamAttr(
                         initializer=fluid.initializer.Uniform(
                             low=-self.init_bound, high=self.init_bound),
                         regularizer=fluid.regularizer.L2DecayRegularizer(
                             regularization_coeff=1e-4)))  #,
    #num_flatten_dims=2)

    # CRF layers share the transition parameter learning rate.
    self.linear_chain_crf = Linear_chain_crf(param_attr=fluid.ParamAttr(
        name='linear_chain_crfw', learning_rate=self.crf_lr),
                                             size=self.num_labels)

    self.crf_decoding = Crf_decoding(param_attr=fluid.ParamAttr(
        name='crfw', learning_rate=self.crf_lr),
                                     size=self.num_labels)
# Vectorized id -> token lookup over the (inverse) vocabulary table.
@np.vectorize
def rev_lookup(i):
    return rev_dict[i]


# --- Demo: fill the [MASK] slots of a Chinese sentence with ERNIE cloze. ---
ernie = ErnieCloze.from_pretrained(model_dir)
ernie.eval()
ids, _ = tokenizer.encode(
    '戊[MASK]变法,又称百日维新,是 [MASK] [MASK] [MASK] 、梁启超等维新派人士通过光绪帝进行 的一场资产阶级改良。')
mask_id = tokenizer.mask_id
print(ids)
# Add the batch dimension and run the model.
ids = np.expand_dims(ids, 0)
ids = D.to_variable(ids)
logits = ernie(ids).numpy()
# Greedy decoding: most likely token at every position.
output_ids = np.argmax(logits, -1)
seg_txt = rev_lookup(output_ids)
print(''.join(seg_txt))


def predict_mask(sentence_with_mask):
    """
    predict multi masks, support top5, multi mask
    :param sentence_with_mask:
    :return:
    """
    # NOTE(review): the rest of this function lies outside this chunk.
    ids, id_types = tokenizer.encode(sentence_with_mask)
    mask_id = tokenizer.mask_id
def train_bmn(args, place, to_static):
    """Train the BMN model on fake data, collecting losses for verification.

    Args:
        args: training configuration (epoch count, log/valid intervals, ...).
        place: fluid place (CPU/GPU) to run on.
        to_static: whether to enable dygraph-to-static translation.

    Returns:
        np.ndarray of the per-iteration (and validation) loss components,
        used to compare static vs dygraph execution.
    """
    program_translator.enable(to_static)
    loss_data = []

    with fluid.dygraph.guard(place):
        # Fixed seeds so static and dygraph runs are comparable.
        paddle.manual_seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)
        global local_random
        local_random = np.random.RandomState(SEED)

        bmn = BMN(args)
        adam = optimizer(args, parameter_list=bmn.parameters())

        train_reader = fake_data_reader(args, 'train')

        for epoch in range(args.epoch):
            for batch_id, data in enumerate(train_reader()):
                # Each sample: (feature, iou_map, start, end) ground truths.
                video_feat = np.array([item[0]
                                       for item in data]).astype(DATATYPE)
                gt_iou_map = np.array([item[1]
                                       for item in data]).astype(DATATYPE)
                gt_start = np.array([item[2]
                                     for item in data]).astype(DATATYPE)
                gt_end = np.array([item[3] for item in data]).astype(DATATYPE)

                x_data = to_variable(video_feat)
                gt_iou_map = to_variable(gt_iou_map)
                gt_start = to_variable(gt_start)
                gt_end = to_variable(gt_end)
                gt_iou_map.stop_gradient = True
                gt_start.stop_gradient = True
                gt_end.stop_gradient = True

                pred_bm, pred_start, pred_end = bmn(x_data)

                loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func(
                    pred_bm, pred_start, pred_end, gt_iou_map, gt_start,
                    gt_end, args)
                avg_loss = fluid.layers.mean(loss)

                avg_loss.backward()
                adam.minimize(avg_loss)
                bmn.clear_gradients()
                # log loss data to verify correctness
                loss_data += [
                    avg_loss.numpy()[0],
                    tem_loss.numpy()[0],
                    pem_reg_loss.numpy()[0],
                    pem_cls_loss.numpy()[0]
                ]

                if args.log_interval > 0 and (batch_id % args.log_interval
                                              == 0):
                    print('[TRAIN] Epoch {}, iter {} '.format(
                        epoch, batch_id) +
                          '\tLoss = {}, \ttem_loss = {}, \tpem_reg_loss = {}, \tpem_cls_loss = {}'.format(
                              '%f' % avg_loss.numpy()[0], '%f' % tem_loss.numpy()[0], \
                              '%f' % pem_reg_loss.numpy()[0], '%f' % pem_cls_loss.numpy()[0]))

                # validation
                if batch_id % args.valid_interval == 0 and batch_id > 0:
                    bmn.eval()
                    val_loss_data = val_bmn(bmn, args)
                    bmn.train()
                    loss_data += val_loss_data

                if batch_id == args.train_batch_num:
                    # Persist the model: jit.save for static, state_dict
                    # checkpoint for plain dygraph.
                    if to_static:
                        fluid.dygraph.jit.save(bmn, args.infer_dir)
                    else:
                        fluid.dygraph.save_dygraph(bmn.state_dict(),
                                                   args.dy_param_path)
                    break
        return np.array(loss_data)
# --- Prediction fragment: restore checkpoint, run eval, pickle results. ---
if args.init_checkpoint is not None:
    print('loading checkpoint from %s' % args.init_checkpoint)
    sd, _ = FD.load_dygraph(args.init_checkpoint)
    model.set_dict(sd)

test_batch_data = batchify(test_features, args.bsz, args.max_seqlen)
if args.debug:
    # Dump the first batch for inspection.
    print(len(test_batch_data))
    print(test_batch_data[0])
    token_ids, seg_ids, labels = test_batch_data[0]
    for r1, r2 in zip(token_ids[:5], seg_ids[:5]):
        print(r1)
        print(r2)
        print(convert_ids_to_tokens(tokenizer.vocab, r1))

y_pred = []
# Disable gradient tracing for inference.
with FD.base._switch_tracer_mode_guard_(is_train=False):
    model.eval()
    for step, d in enumerate(tqdm(test_batch_data, desc='predicting')):
        ids, sids, _ = d
        ids, sids = FD.to_variable(ids), FD.to_variable(sids)
        _, logits = model(ids, sids)
        #print('\n'.join(map(str, logits.numpy().tolist())))
        y_pred += L.softmax(logits, -1).numpy().tolist()
        if args.debug and len(y_pred) > 5:
            break

print(len(y_pred), y_pred[:5])
print(test_segs[:5])
# Persist segments and predicted probabilities together.
with open(args.save_path, 'wb') as f:
    pickle.dump({'segs': test_segs, 'y_pred': y_pred}, f)
def run_main(self, np_arr, place):
    """Verify a numpy array survives a round trip through to_variable.

    Args:
        np_arr: numpy array to convert.
        place: fluid place (CPU/GPU) to run the dygraph guard under.
    """
    with guard(place):
        # Build a layer inside the guard, as the original test did.
        layer = Embedding(size=[10, 10])
        tensor = to_variable(np_arr)
        round_tripped = tensor.numpy()
        self.assertTrue(np.array_equal(np_arr, round_tripped))
def forward(self, outputs, targets):
    """ Performs the matching

    Params:
        outputs: This is a dict that contains at least these entries:
            "pred_logits": Tensor of dim [batch_size, num_queries, num_classes]
                with the classification logits
            "pred_boxes": Tensor of dim [batch_size, num_queries, 4]
                with the predicted box coordinates
        targets: This is a list of targets (len(targets) == batch_size),
            where each target is a dict containing:
            "labels": Tensor of dim [num_target_boxes] (where num_target_boxes
                is the number of ground-truth objects in the target)
                containing the class labels
            "boxes": Tensor of dim [num_target_boxes, 4] containing the
                target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    # Matching is a pure assignment problem: no gradients flow through it.
    with dg.no_grad():
        bs, num_queries, num_classes = outputs["pred_logits"].shape

        # We flatten to compute the cost matrices in a batch
        out_prob = L.reshape(
            outputs["pred_logits"],
            [-1, num_classes])  # [batch_size * num_queries, num_classes]
        out_prob = L.softmax(
            out_prob, axis=-1)  # [batch_size * num_queries, num_classes]
        out_bbox = L.reshape(outputs["pred_boxes"],
                             [-1, 4])  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = L.concat([v["labels"] for v in targets]).astype(
            "int64")  # [batch_size * num_target_boxes_i]
        tgt_bbox = L.concat([v["boxes"] for v in targets]).astype(
            "float32")  # [batch_size * num_target_boxes_i]

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it as 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, so it can be omitted.
        # Done in numpy: fancy indexing by a column vector is not expressible here
        # with the fluid ops used elsewhere in this method.
        cost_class = -out_prob.numpy()[:, tgt_ids.numpy(
        )]  # [batch_size * num_queries, num_all_target_boxes]
        cost_class = dg.to_variable(cost_class)

        # Compute the L1 cost between boxes
        num_all_target_boxes = tgt_bbox.shape[0]
        expanded_out_bbox = L.expand(
            L.unsqueeze(out_bbox, [1]),
            [1, num_all_target_boxes, 1
             ])  # [batch_size * num_queries, num_all_target_boxes, 4]
        expanded_tgt_bbox = L.expand(
            L.unsqueeze(tgt_bbox, [0]),
            [bs * num_queries, 1, 1
             ])  # [batch_size * num_queries, num_all_target_boxes, 4]
        cost_bbox = F.loss.l1_loss(
            expanded_out_bbox, expanded_tgt_bbox, reduction='none'
        )  # [batch_size * num_queries, num_all_target_boxes, 4]
        cost_bbox = L.reduce_mean(
            cost_bbox, -1)  # [batch_size * num_queries, num_all_target_boxes]

        # Compute the giou cost between boxes (negated: higher IoU = lower cost)
        cost_giou = -generalied_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                        box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix: weighted sum of the three cost terms
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = L.reshape(
            C, [bs, num_queries, -1
                ])  # [batch_size, num_queries, num_all_target_boxes]

        # Split the cost along the target axis per batch element, then solve
        # each assignment problem independently with the Hungarian algorithm.
        sizes = [len(v["boxes"]) for v in targets]
        indices = [
            linear_sum_assignment(c[i].numpy())
            for i, c in enumerate(L.split(C, sizes, dim=-1))
        ]
        return [(dg.to_variable(i.astype("int64")),
                 dg.to_variable(j.astype("int64"))) for i, j in indices]
def setUp(self):
    """Enter dygraph mode, enable the program translator, and build the shared input."""
    paddle.disable_static()
    program_trans.enable(True)
    ones = np.ones([4, 10]).astype('float32')
    self.x = to_variable(ones)
def train(self, train_data_list, eval_data_list,
          model_save_path=None, best_model_save_path=None,
          epochs=5, batch_size=32, learning_rate=5e-5,
          max_seq_len=300, max_ensure=False, print_step=50,
          load_best_model=True, **kwargs):
    """Train the dygraph model.

    [IN]
        train_data_list: list[(input1[, input2, ...], label)], training data
        eval_data_list: list[(input1[, input2, ...], label)], evaluation data
        model_save_path: string, path to save the per-epoch model
        best_model_save_path: string, path to save the best model
        epochs: int, number of training epochs
        batch_size: int, batch size
        learning_rate: float, optimizer learning rate
        max_seq_len: int, maximum sequence length
        max_ensure: boolean, if True always pad to max_seq_len
        print_step: int, log training status every print_step steps
        load_best_model: boolean, if True resume from the best saved model
        **kwargs: forwarded to get_loss/evaluate
    [OUT]
        best score achieved during training (via self.get_best_score())
    """
    logging.info("train model start")
    train_start_time = time.time()
    # Load the best model so far, if requested.
    if load_best_model:
        self.load_model(best_model_save_path)
    # Enter train mode.
    self.model.train()
    # Initialize the optimizer.
    self.init_optimizer(learning_rate)

    def train_data_reader():
        return gen_batch_data(train_data_list, batch_size, max_seq_len, max_ensure)

    cur_train_step = 0
    for cur_epoch in range(epochs):
        # Shuffle the data each epoch for best training results.
        np.random.shuffle(train_data_list)
        train_data_batch = F.contrib.reader.distributed_batch_reader(train_data_reader)() \
            if self.parallelized else train_data_reader()
        for cur_train_batch in train_data_batch:
            cur_train_step += 1
            cur_train_batch = [D.to_variable(x) for x in cur_train_batch]
            loss = self.get_loss(*cur_train_batch, **kwargs)

            if self.parallelized:
                # With multiple cards, normalize the per-card loss.
                loss = self.model.scale_loss(loss)

            # Backpropagation.
            loss.backward()

            if self.parallelized:
                # With multiple cards, gather the gradients from each card.
                # Note: gradient updates require LoDTensor, i.e. dense matrices.
                # For example, the embedding layer's is_sparse must be False,
                # otherwise updates are sparse and multi-card training fails.
                self.model.apply_collective_grads()

            self.optimizer.minimize(loss)
            # Clear gradients.
            self.model.clear_gradients()
            if cur_train_step % print_step == 0:
                speed = cur_train_step / (time.time() - train_start_time)
                logging.info('train epoch %d, step %d: loss %.5f, speed %.2f step/s' % \
                    (cur_epoch, cur_train_step, loss.numpy(), speed))

        if model_save_path is not None:
            # Save the model each epoch.
            logging.info("save model at epoch {}".format(cur_epoch))
            self.save_model(model_save_path + "_epoch{}".format(cur_epoch))

        # Compute accuracy on the evaluation set.
        cur_eval_res = self.evaluate(eval_data_list, batch_size=batch_size,
                                     max_seq_len=max_seq_len, **kwargs)
        is_best = self.check_if_best(cur_eval_res)
        if best_model_save_path is not None and is_best:
            # If this is currently the best model, save it as the best model.
            logging.info("cur best score, save model at epoch {} as best model".format(cur_epoch))
            self.save_model(best_model_save_path)
    logging.info("train model cost time %.4fs" % (time.time() - train_start_time))
    return self.get_best_score()