def train():
    trainer_count = fluid.dygraph.parallel.Env().nranks
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if args.use_data_parallel else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()
        if args.benchmark:
            args.epoch = 1
        processor = reader.SentaProcessor(
            data_dir=args.data_dir,
            vocab_path=args.vocab_path,
            random_seed=args.random_seed)
        num_labels = len(processor.get_labels())
        num_train_examples = processor.get_num_examples(phase="train")
        # max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)
        eval_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='dev',
            epoch=args.epoch,
            shuffle=False)
        cnn_net = nets.CNN("cnn_net", args.vocab_size, args.batch_size,
                           args.padding_size)
        if args.use_data_parallel:
            cnn_net = fluid.dygraph.parallel.DataParallel(cnn_net, strategy)
        if args.use_data_parallel:
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        length = len(list(enumerate(train_data_generator())))
        for eop in range(args.epoch):
            time_begin = time.time()
            batch_time = AverageMeter('Time', ':6.3f')
            data_time = AverageMeter('Data', ':6.9f')
            progress = ProgressMeter(length, batch_time, data_time,
                                     prefix="epoch: [{}]".format(eop))
            end = Tools.time()
            for batch_id, data in enumerate(train_data_generator()):
                data_time.update(Tools.time() - end)
                steps += 1
                doc = to_variable(
                    np.array([
                        np.pad(x[0][0:args.padding_size],
                               (0, args.padding_size - len(x[0][0:args.padding_size])),
                               'constant',
                               constant_values=(args.vocab_size)) for x in data
                    ]).astype('int64').reshape(-1, 1))
                label = to_variable(
                    np.array([x[1] for x in data]).astype('int64').reshape(
                        args.batch_size, 1))
                cnn_net.train()
                avg_cost, prediction, acc = cnn_net(doc, label)
                if args.use_data_parallel:
                    avg_cost = cnn_net.scale_loss(avg_cost)
                    avg_cost.backward()
                    cnn_net.apply_collective_grads()
                else:
                    avg_cost.backward()
                batch_time.update(Tools.time() - end)
                np_mask = (doc.numpy() != args.vocab_size).astype('int32')
                word_num = np.sum(np_mask)
                sgd_optimizer.minimize(avg_cost)
                cnn_net.clear_gradients()
                total_cost.append(avg_cost.numpy() * word_num)
                total_acc.append(acc.numpy() * word_num)
                total_num_seqs.append(word_num)
                if steps % args.skip_steps == 0:
                    time_end = time.time()
                    used_time = time_end - time_begin
                    progress.print(batch_id + 1)
                    # print("step: %d, ave loss: %f, "
                    #       "ave acc: %f, speed: %f steps/s" %
                    #       (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                    #        np.sum(total_acc) / np.sum(total_num_seqs),
                    #        args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()
                if steps % args.validation_steps == 0:
                    total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
                    cnn_net.eval()
                    eval_steps = 0
                    for eval_batch_id, eval_data in enumerate(
                            eval_data_generator()):
                        eval_np_doc = np.array([
                            np.pad(x[0][0:args.padding_size],
                                   (0, args.padding_size - len(x[0][0:args.padding_size])),
                                   'constant',
                                   constant_values=(args.vocab_size))
                            for x in eval_data
                        ]).astype('int64').reshape(1, -1)
                        eval_label = to_variable(
                            np.array([x[1] for x in eval_data]).astype('int64')
                            .reshape(args.batch_size, 1))
                        eval_doc = to_variable(eval_np_doc.reshape(-1, 1))
                        eval_avg_cost, eval_prediction, eval_acc = cnn_net(
                            eval_doc, eval_label)
                        eval_np_mask = (
                            eval_np_doc != args.vocab_size).astype('int32')
                        eval_word_num = np.sum(eval_np_mask)
                        total_eval_cost.append(eval_avg_cost.numpy() * eval_word_num)
                        total_eval_acc.append(eval_acc.numpy() * eval_word_num)
                        total_eval_num_seqs.append(eval_word_num)
                        eval_steps += 1
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("Final validation result: step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps,
                           np.sum(total_eval_cost) / np.sum(total_eval_num_seqs),
                           np.sum(total_eval_acc) / np.sum(total_eval_num_seqs),
                           eval_steps / used_time))
                    time_begin = time.time()
                if steps % args.save_steps == 0:
                    save_path = "save_dir_" + str(steps)
                    print('save model to: ' + save_path)
                    fluid.dygraph.save_persistables(cnn_net.state_dict(),
                                                    save_path)
                end = Tools.time()
def train_one_epoch(self, epoch):
    losses = []
    accs = []
    for i in range(self.model_num):
        if self.use_data_parallel:
            self.parallel_models[i].train()
        else:
            self.models[i].train()
        losses.append(AvgrageMeter())
        accs.append(AvgrageMeter())

    for step_indx, (images, labels) in enumerate(self.train_loader):
        images, labels = to_variable(images), to_variable(labels)
        batch_size = images.shape[0]

        logits = []
        if self.use_data_parallel:
            for model in self.parallel_models:
                logits.append(model(images))
        else:
            for model in self.models:
                logits.append(model(images))

        log_msg = 'Train Epoch {}, Step {}'.format(epoch, step_indx)
        for i in range(self.model_num):
            gt_loss = self.models[i].loss(logits[i], labels)
            kl_loss = 0
            for j in range(self.model_num):
                if i != j:
                    x = F.log_softmax(logits[i], axis=1)
                    y = fluid.layers.softmax(logits[j], axis=1)
                    kl_loss += fluid.layers.kldiv_loss(
                        x, y, reduction='batchmean')
            loss = gt_loss
            if (self.model_num > 1):
                loss += kl_loss / (self.model_num - 1)

            prec = fluid.layers.accuracy(input=logits[i], label=labels, k=1)
            losses[i].update(loss.numpy(), batch_size)
            accs[i].update(prec.numpy() * 100, batch_size)

            if self.use_data_parallel:
                loss = self.parallel_models[i].scale_loss(loss)
                loss.backward()
                self.parallel_models[i].apply_collective_grads()
            else:
                loss.backward()
            self.optimizers[i].minimize(loss)
            if self.use_data_parallel:
                self.parallel_models[i].clear_gradients()
            else:
                self.models[i].clear_gradients()

            log_msg += ', model{}_loss: {:.3f}'.format(i + 1, losses[i].avg[0])
        if step_indx % self.log_freq == 0:
            logger.info(log_msg)
    return losses, accs
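# For context on the mutual-learning term above:
# kldiv_loss(log_softmax(logits[i]), softmax(logits[j]), reduction='batchmean')
# evaluates KL(p_j || p_i) averaged over the batch. A minimal NumPy sketch of
# that same computation follows; the function name and sample values are
# illustrative, not taken from the original code.
import numpy as np

def dml_kl(logits_i, logits_j):
    """KL(p_j || p_i), averaged over the batch ('batchmean' reduction)."""
    log_pi = logits_i - np.log(np.exp(logits_i).sum(axis=1, keepdims=True))
    pj = np.exp(logits_j) / np.exp(logits_j).sum(axis=1, keepdims=True)
    return (pj * (np.log(pj) - log_pi)).sum(axis=1).mean()

a = np.array([[2.0, 0.5, 0.1], [0.3, 1.2, 0.7]])
b = np.array([[1.8, 0.6, 0.2], [0.2, 1.1, 0.9]])
print(dml_kl(a, b))  # small positive value: the two heads nearly agree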
def train(args):
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    use_data_parallel = False
    trainer_count = fluid.dygraph.parallel.Env().nranks
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if use_data_parallel else fluid.CUDAPlace(0)

    with fluid.dygraph.guard(place):
        if use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        video_model = NonLocal("NonLocal", train_config, mode="train")
        optimizer = create_optimizer(train_config.TRAIN,
                                     video_model.parameters())
        if use_data_parallel:
            video_model = fluid.dygraph.parallel.DataParallel(video_model,
                                                              strategy)

        bs_denominator = 1
        if args.use_gpu:
            # check number of GPUs
            gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
            if gpus == "":
                pass
            else:
                gpus = gpus.split(",")
                num_gpus = len(gpus)
                assert num_gpus == train_config.TRAIN.num_gpus, \
                    "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                    "should be the same as that " \
                    "set in {}({})".format(
                        num_gpus, args.config, train_config.TRAIN.num_gpus)
            bs_denominator = train_config.TRAIN.num_gpus
        train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                            bs_denominator)

        train_reader = NonlocalReader(name="NONLOCAL", mode="train",
                                      cfg=train_config)
        train_reader = train_reader.create_reader()
        if use_data_parallel:
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)

        for epoch in range(train_config.TRAIN.epoch):
            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            for batch_id, data in enumerate(train_reader()):
                x_data = np.array([item[0] for item in data]).astype('float32')
                y_data = np.array([item[1] for item in data]).astype('int64')
                x_data = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True
                outputs = video_model(x_data, train_config)

                loss = fluid.layers.cross_entropy(outputs, labels,
                                                  soft_label=False,
                                                  ignore_index=-100)
                loss = fluid.layers.reduce_sum(loss, dim=-1)
                avg_loss = fluid.layers.mean(loss)
                acc_top1 = fluid.layers.accuracy(input=outputs, label=labels, k=1)
                acc_top5 = fluid.layers.accuracy(input=outputs, label=labels, k=5)

                if use_data_parallel:
                    avg_loss = video_model.scale_loss(avg_loss)
                    avg_loss.backward()
                    video_model.apply_collective_grads()
                else:
                    avg_loss.backward()
                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                total_loss += avg_loss.numpy()[0]
                total_acc1 += acc_top1.numpy()[0]
                total_acc5 += acc_top5.numpy()[0]
                total_sample += 1

                print('TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.
                      format(epoch, batch_id, avg_loss.numpy()[0],
                             acc_top1.numpy()[0], acc_top5.numpy()[0]))
            print('TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'.
                  format(epoch, total_loss / total_sample,
                         total_acc1 / total_sample, total_acc5 / total_sample))
            video_model.eval()
            val(epoch, video_model, valid_config, args)

        if fluid.dygraph.parallel.Env().local_rank == 0:
            fluid.dygraph.save_dygraph(video_model.state_dict(), "final")
    logger.info('[TRAIN] training finished')
def train():
    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 90
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
        processor = reader.SentaProcessor(data_dir=args.data_dir,
                                          vocab_path=args.vocab_path,
                                          random_seed=args.random_seed)
        num_labels = len(processor.get_labels())
        num_train_examples = processor.get_num_examples(phase="train")
        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
        if not args.ce:
            train_data_generator = processor.data_generator(
                batch_size=args.batch_size,
                phase='train',
                epoch=args.epoch,
                shuffle=True)
            eval_data_generator = processor.data_generator(
                batch_size=args.batch_size,
                phase='dev',
                epoch=args.epoch,
                shuffle=False)
        else:
            train_data_generator = processor.data_generator(
                batch_size=args.batch_size,
                phase='train',
                epoch=args.epoch,
                shuffle=False)
            eval_data_generator = processor.data_generator(
                batch_size=args.batch_size,
                phase='dev',
                epoch=args.epoch,
                shuffle=False)
        cnn_net = nets.CNN("cnn_net", args.vocab_size, args.batch_size,
                           args.padding_size)
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        for eop in range(args.epoch):
            time_begin = time.time()
            for batch_id, data in enumerate(train_data_generator()):
                enable_profile = steps > args.profile_steps
                with profile_context(enable_profile):
                    steps += 1
                    doc = to_variable(
                        np.array([
                            np.pad(x[0][0:args.padding_size],
                                   (0, args.padding_size - len(x[0][0:args.padding_size])),
                                   'constant',
                                   constant_values=(args.vocab_size))
                            for x in data
                        ]).astype('int64').reshape(-1, 1))
                    label = to_variable(
                        np.array([x[1] for x in data]).astype('int64').reshape(
                            args.batch_size, 1))
                    cnn_net.train()
                    avg_cost, prediction, acc = cnn_net(doc, label)
                    avg_cost.backward()
                    np_mask = (doc.numpy() != args.vocab_size).astype('int32')
                    word_num = np.sum(np_mask)
                    sgd_optimizer.minimize(avg_cost)
                    cnn_net.clear_gradients()
                    total_cost.append(avg_cost.numpy() * word_num)
                    total_acc.append(acc.numpy() * word_num)
                    total_num_seqs.append(word_num)
                    if steps % args.skip_steps == 0:
                        time_end = time.time()
                        used_time = time_end - time_begin
                        print("step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               np.sum(total_acc) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        total_cost, total_acc, total_num_seqs = [], [], []
                        time_begin = time.time()
                    if steps % args.validation_steps == 0:
                        total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
                        cnn_net.eval()
                        eval_steps = 0
                        for eval_batch_id, eval_data in enumerate(
                                eval_data_generator()):
                            eval_np_doc = np.array([
                                np.pad(x[0][0:args.padding_size],
                                       (0, args.padding_size - len(x[0][0:args.padding_size])),
                                       'constant',
                                       constant_values=(args.vocab_size))
                                for x in eval_data
                            ]).astype('int64').reshape(1, -1)
                            eval_label = to_variable(
                                np.array([x[1] for x in eval_data]).astype(
                                    'int64').reshape(args.batch_size, 1))
                            eval_doc = to_variable(eval_np_doc.reshape(-1, 1))
                            eval_avg_cost, eval_prediction, eval_acc = cnn_net(
                                eval_doc, eval_label)
                            eval_np_mask = (
                                eval_np_doc != args.vocab_size).astype('int32')
                            eval_word_num = np.sum(eval_np_mask)
                            total_eval_cost.append(eval_avg_cost.numpy() * eval_word_num)
                            total_eval_acc.append(eval_acc.numpy() * eval_word_num)
                            total_eval_num_seqs.append(eval_word_num)
                            eval_steps += 1
                        time_end = time.time()
                        used_time = time_end - time_begin
                        print("Final validation result: step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (steps,
                               np.sum(total_eval_cost) / np.sum(total_eval_num_seqs),
                               np.sum(total_eval_acc) / np.sum(total_eval_num_seqs),
                               eval_steps / used_time))
                        time_begin = time.time()
                        if args.ce:
                            print("kpis\ttrain_loss\t%0.3f" %
                                  (np.sum(total_eval_cost) /
                                   np.sum(total_eval_num_seqs)))
                            print("kpis\ttrain_acc\t%0.3f" %
                                  (np.sum(total_eval_acc) /
                                   np.sum(total_eval_num_seqs)))
                    if steps % args.save_steps == 0:
                        save_path = "save_dir_" + str(steps)
                        print('save model to: ' + save_path)
                        fluid.dygraph.save_persistables(cnn_net.state_dict(),
                                                        save_path)
                if enable_profile:
                    print('save profile result into /tmp/profile_file')
                    return
NUM_CLASSES = 7

if __name__ == '__main__':
    with fluid.dygraph.guard():
        model = YOLOv3('yolov3', num_classes=NUM_CLASSES, is_train=False)
        model_state_dict, _ = fluid.load_dygraph(WEIGHT_FILE)
        model.load_dict(model_state_dict)
        model.eval()

        total_results = []
        test_loader = single_image_data_loader(IMAGE_NAME, mode='test')
        for i, data in enumerate(test_loader()):
            img_name, img_data, img_scale_data = data
            img = to_variable(img_data)
            img_scale = to_variable(img_scale_data)

            outputs = model.forward(img)
            bboxes, scores = model.get_pred(outputs,
                                            im_shape=img_scale,
                                            anchors=ANCHORS,
                                            anchor_masks=ANCHOR_MASKS,
                                            valid_thresh=VALID_THRESH)

            bboxes_data = bboxes.numpy()
            scores_data = scores.numpy()
            results = multiclass_nms(bboxes_data, scores_data,
                                     score_thresh=VALID_THRESH,
                                     nms_thresh=NMS_THRESH,
                                     pre_nms_topk=NMS_TOPK)
            # NOTE: the source snippet is truncated after pre_nms_topk; the
            # call is closed here so it parses, but any remaining arguments
            # and the use of `results` are not shown in the original.
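# The snippet above ends at the multiclass_nms call. As a point of reference,
# here is a minimal NumPy sketch of single-class greedy non-maximum
# suppression, the core operation such a helper performs. The function name,
# box values, and the 0.45 threshold are illustrative assumptions, not taken
# from the original code.
import numpy as np

def nms(boxes, scores, nms_thresh=0.45):
    """Greedy NMS over [x1, y1, x2, y2] boxes; returns indices of kept boxes."""
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        rest = order[1:]
        # intersection of the top box with all remaining boxes
        xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
        inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_r = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
        iou = inter / (area_i + area_r - inter)
        order = rest[iou <= nms_thresh]  # drop boxes overlapping the kept one
    return keep

boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], 'float32')
scores = np.array([0.9, 0.8, 0.7], 'float32')
print(nms(boxes, scores))  # [0, 2]: the second box is suppressed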
def func_testSetNumpyBeforeTrain(self):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200

    with fluid.dygraph.guard():
        paddle.seed(seed)
        paddle.framework.random._manual_program_seed(seed)
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        bd = []
        lr_arr = [0.0]
        # this is a fake lr decay strategy
        for i in range(1, 10):
            bd.append(100 * i)
            # set lr to 0.0 so parameters are not updated
            new_lr = 0.0
            lr_arr.append(new_lr)

        place = fluid.CPUPlace(
        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
        adam = Adam(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr),
                    beta1=0.8,
                    beta2=0.6,
                    parameter_list=ptb_model.parameters())
        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        np_opti_dict = {}
        np_state_dict = {}

        for k, v in self.opti_dict.items():
            if isinstance(v, (core.VarBase, core.eager.Tensor)):
                np_opti_dict[v.name] = v.numpy()
            else:
                np_opti_dict[k] = v

        for k, v in self.state_dict.items():
            np_state_dict[k] = v.numpy()

        adam.set_state_dict(np_opti_dict)
        ptb_model.set_state_dict(np_state_dict)

        for i in range(1):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))

            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell)

            dy_loss.backward()
            adam.minimize(dy_loss)
            ptb_model.clear_gradients()

            opti_dict = adam.state_dict()
            for k, v in opti_dict.items():
                if k == "global_step":
                    self.assertTrue(
                        np.array_equal(v.numpy(),
                                       self.base_opti[v.name] + 1))
                if k.find("beta1_pow_acc_0") > 0:
                    self.assertTrue(
                        np.array_equal(v.numpy(),
                                       self.base_opti[v.name] * adam._beta1))
                if k.find("beta2_pow_acc_0") > 0:
                    self.assertTrue(
                        np.array_equal(v.numpy(),
                                       self.base_opti[v.name] * adam._beta2))

            # check parameters
            state_dict = ptb_model.state_dict()
            for k, v in state_dict.items():
                new_t = v.numpy()
                base_t = self.model_base[k]
                self.assertTrue(np.array_equal(new_t, base_t))
def train(args):
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    local_rank = fluid.dygraph.parallel.Env().local_rank
    use_data_parallel = args.use_data_parallel
    trainer_count = fluid.dygraph.parallel.Env().nranks

    if not args.use_gpu:
        place = fluid.CPUPlace()
    elif not args.use_data_parallel:
        place = fluid.CUDAPlace(0)
    else:
        # (data_parallel step 1/6)
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)

    # load pretrained weights
    assert os.path.exists(args.weights), \
        "Given dir {} not exist.".format(args.weights)
    pre_state_dict = fluid.load_program_state(args.weights)
    # for key in pre_state_dict.keys():
    #     print('pre_state_dict.key: {}'.format(key))

    with fluid.dygraph.guard(place):
        # 1. init model
        video_model = TSM_ResNet("TSM", train_config)

        # 2. set weights
        param_state_dict = {}
        model_dict = video_model.state_dict()
        for key in model_dict.keys():
            weight_name = model_dict[key].name
            if weight_name in pre_state_dict.keys(
            ) and weight_name != "fc_0.w_0" and weight_name != "fc_0.b_0":
                print('succ Load weight: {}, shape: {}'.format(
                    weight_name, pre_state_dict[weight_name].shape))
                param_state_dict[key] = pre_state_dict[weight_name]
            else:
                print('fail Load weight: {}'.format(weight_name))
                param_state_dict[key] = model_dict[key]
        video_model.set_dict(param_state_dict)

        # 3. init optimizer
        optimizer = create_optimizer(train_config.TRAIN,
                                     video_model.parameters())
        if use_data_parallel:
            # (data_parallel steps 2, 3/6)
            strategy = fluid.dygraph.parallel.prepare_context()
            video_model = fluid.dygraph.parallel.DataParallel(
                video_model, strategy)

        # 4. load checkpoint
        if args.checkpoint:
            assert os.path.exists(args.checkpoint + ".pdparams"), \
                "Given dir {}.pdparams not exist.".format(args.checkpoint)
            assert os.path.exists(args.checkpoint + ".pdopt"), \
                "Given dir {}.pdopt not exist.".format(args.checkpoint)
            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
            video_model.set_dict(para_dict)
            optimizer.set_dict(opti_dict)

        # 5. reader
        bs_denominator = 1
        if args.use_gpu:
            gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
            if gpus == "":
                pass
            else:
                gpus = gpus.split(",")
                num_gpus = len(gpus)
                assert num_gpus == train_config.TRAIN.num_gpus, \
                    "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                    "should be the same as that " \
                    "set in {}({})".format(
                        num_gpus, args.config, train_config.TRAIN.num_gpus)
            bs_denominator = train_config.TRAIN.num_gpus
        train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                            bs_denominator)

        train_reader = UCF101Reader(name="TSM", mode="train", cfg=train_config)
        train_reader = train_reader.create_reader()
        if use_data_parallel:
            # (data_parallel step 4/6)
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)

        # 6. train loop
        reader_cost_averager = TimeAverager()
        batch_cost_averager = TimeAverager()
        for epoch in range(train_config.TRAIN.epoch):
            epoch_start = time.time()

            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0

            # 6.1 for each batch, call model(), backward(), and minimize()
            batch_start = time.time()
            for batch_id, data in enumerate(train_reader()):
                t1 = time.time()
                reader_cost_averager.record(t1 - batch_start)

                x_data = np.array([item[0] for item in data])
                y_data = np.array([item[1] for item in data]).reshape([-1, 1])
                imgs = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True

                t2 = time.time()
                outputs = video_model(imgs)
                t3 = time.time()

                loss = fluid.layers.cross_entropy(input=outputs,
                                                  label=labels,
                                                  ignore_index=-1)
                avg_loss = fluid.layers.mean(loss)
                acc_top1 = fluid.layers.accuracy(input=outputs, label=labels, k=1)
                acc_top5 = fluid.layers.accuracy(input=outputs, label=labels, k=5)
                current_step_lr = optimizer.current_step_lr()

                if use_data_parallel:
                    # (data_parallel step 5/6)
                    avg_loss = video_model.scale_loss(avg_loss)
                    avg_loss.backward()
                    video_model.apply_collective_grads()
                else:
                    avg_loss.backward()
                t4 = time.time()

                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                avg_loss_value = avg_loss.numpy()[0]
                acc_top1_value = acc_top1.numpy()[0]
                acc_top5_value = acc_top5.numpy()[0]
                total_loss += avg_loss_value
                total_acc1 += acc_top1_value
                total_acc5 += acc_top5_value
                total_sample += 1

                t5 = time.time()
                batch_cost_averager.record(
                    t5 - batch_start,
                    num_samples=train_config.TRAIN.batch_size)
                if batch_id % args.log_interval == 0:
                    print(
                        'TRAIN Epoch: %d, iter: %d, loss: %.5f, acc1: %.5f, '
                        'acc5: %.5f, lr: %.5f, forward_cost: %.5f s, '
                        'backward_cost: %.5f s, minimize_cost: %.5f s, '
                        'to_variable_cost: %.5f s, batch_cost: %.5f sec, '
                        'reader_cost: %.5f sec, ips: %.5f samples/sec'
                        % (epoch, batch_id, avg_loss_value, acc_top1_value,
                           acc_top5_value, current_step_lr, t3 - t2, t4 - t3,
                           t5 - t4, t2 - t1,
                           batch_cost_averager.get_average(),
                           reader_cost_averager.get_average(),
                           batch_cost_averager.get_ips_average()))
                    batch_cost_averager.reset()
                    reader_cost_averager.reset()
                batch_start = time.time()

            train_epoch_cost = time.time() - epoch_start
            print(
                'TRAIN End, Epoch {}, avg_loss= {:.5f}, avg_acc1= {:.5f}, '
                'avg_acc5= {:.5f}, lr={:.5f}, epoch_cost: {:.5f} sec'
                .format(epoch, total_loss / total_sample,
                        total_acc1 / total_sample, total_acc5 / total_sample,
                        current_step_lr, train_epoch_cost))

            # 6.2 save checkpoint
            if local_rank == 0:
                if not os.path.isdir(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                model_path = os.path.join(
                    args.model_save_dir,
                    args.model_path_pre + "_epoch{}".format(epoch))
                fluid.dygraph.save_dygraph(video_model.state_dict(), model_path)
                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
                print('save_dygraph End, Epoch {}/{} '.format(
                    epoch, train_config.TRAIN.epoch))

            # 6.3 validation
            video_model.eval()
            val(epoch, video_model, valid_config, args)

        # 7. save final model
        if local_rank == 0:
            model_path = os.path.join(args.model_save_dir,
                                      args.model_path_pre + "_final")
            fluid.dygraph.save_dygraph(video_model.state_dict(), model_path)
            fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)

    logger.info('[TRAIN] training finished')
def test_forward_hook_return_value(self):
    seed = 90

    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for place in places:
        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})

            input_word = np.array(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7,
                 8]).reshape(6, 3).astype('int64')
            input_word1 = input_word * 2
            input_word = input_word.reshape((-1, 3, 1))
            input_word1 = input_word1.reshape((-1, 3, 1))
            y_data = np.array(
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8,
                 9]).reshape(6, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))

            input = base.to_variable(input_word)
            input1 = base.to_variable(input_word1)
            y = base.to_variable(y_data)

            simplenet = SimpleNet(
                hidden_size=20,
                vocab_size=32,
                num_steps=3,
                init_scale=0.1,
                is_sparse=False,
                dtype="float32")

            # origin, don't register any hook
            outs_origin = simplenet(input, y)
            outs_origin1 = simplenet(input1, y)

            # register forward_pre_hook
            forward_pre_hook_handle1 = simplenet.register_forward_pre_hook(
                forward_pre_hook1)
            outs_pre_hook = simplenet(input, y)
            self.assertTrue(
                np.array_equal(outs_pre_hook.numpy(), outs_origin1.numpy()))

            # remove forward_pre_hook
            forward_pre_hook_handle1.remove()
            outs_pre_hook = simplenet(input, y)
            self.assertTrue(
                np.array_equal(outs_pre_hook.numpy(), outs_origin.numpy()))

            # register forward_hook
            forward_hook_handle1 = simplenet.register_forward_post_hook(
                forward_hook1)
            outs_forward_hook = simplenet(input, y)
            self.assertTrue(
                np.array_equal(outs_forward_hook.numpy(),
                               outs_origin.numpy() * 2))

            # remove forward_hook
            forward_hook_handle1.remove()
            outs_forward_hook = simplenet(input, y)
            self.assertTrue(
                np.array_equal(outs_forward_hook.numpy(),
                               outs_origin.numpy()))
def parse(self, db_value):
    x = to_variable(db_value)
    return {"x1": x}
def test_forward_hook(self):
    seed = 90

    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for place in places:
        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True

            global call_forward_hook
            global call_forward_pre_hook

            input_word = np.array(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7,
                 8]).reshape(6, 3).astype('int64')
            input_word = input_word.reshape((-1, 3, 1))
            y_data = np.array(
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8,
                 9]).reshape(6, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))

            input = base.to_variable(input_word)
            y = base.to_variable(y_data)

            simplenet = SimpleNet(hidden_size=20,
                                  vocab_size=32,
                                  num_steps=3,
                                  init_scale=0.1,
                                  is_sparse=False,
                                  dtype="float32")

            # origin, don't register any hook
            outs_origin = simplenet(input, y)
            self.assertFalse(call_forward_hook)
            self.assertFalse(call_forward_pre_hook)

            # register forward_hook and forward_pre_hook
            forward_hook_handle = simplenet.register_forward_post_hook(
                forward_hook)
            forward_pre_hook_handle = simplenet.register_forward_pre_hook(
                forward_pre_hook)
            outs_hook = simplenet(input, y)
            self.assertTrue(call_forward_hook)
            self.assertTrue(call_forward_pre_hook)

            outs_hook = simplenet(input, y)
            self.assertTrue(call_forward_hook)
            self.assertTrue(call_forward_pre_hook)

            # remove forward_hook
            forward_hook_handle.remove()
            call_forward_hook = False
            call_forward_pre_hook = False
            outs_remove_forward_hook = simplenet(input, y)
            self.assertFalse(call_forward_hook)
            self.assertTrue(call_forward_pre_hook)

            # remove forward_pre_hook
            forward_pre_hook_handle.remove()
            call_forward_hook = False
            call_forward_pre_hook = False
            outs_remove_hook = simplenet(input, y)
            self.assertFalse(call_forward_hook)
            self.assertFalse(call_forward_pre_hook)
def train_model():
    place = fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        # 1. init net and optimizer
        if args.model == "MobileNetV1":
            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
        elif args.model == "MobileNetV2":
            net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
        elif args.model == "ResNet50":
            net = ResNet()
        elif args.model == "ResNet101":
            net = ResNet(layers=101)
        else:
            print("wrong model name, please try model = ResNet50 or "
                  "MobileNetV1 or MobileNetV2")
            exit()

        optimizer = fluid.optimizer.AdamOptimizer(
            parameter_list=net.parameters())
        # for param in net.parameters():
        #     print(param.name, param.shape)

        input_fake = np.ones((args.batch_size, 3, 224, 224)).astype(np.float32)
        # use int64 labels (np.int is deprecated and cross_entropy expects int64)
        target_fake = np.ones((args.batch_size, 1)).astype(np.int64)
        global train_images
        batch_number = train_images / args.batch_size

        # 2. train loop
        for eop in range(args.num_epochs):
            net.train()
            img = to_variable(input_fake)
            label = to_variable(target_fake)

            print("\nBegin Training Epoch {}".format(eop + 1))
            epoch_start_time = time.time()
            batch_id = 0
            for i in range(int(batch_number)):
                t1 = time.time()
                # img = to_variable(input_fake)
                # label = to_variable(target_fake)
                out = net(img)
                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                loss = fluid.layers.cross_entropy(input=softmax_out,
                                                  label=label)
                avg_loss = fluid.layers.mean(x=loss)
                avg_loss.backward()
                optimizer.minimize(avg_loss)
                net.clear_gradients()
                t2 = time.time()
                train_batch_elapse = t2 - t1
                print("epoch id: %d, batch step: %d, forward_backward %2.4f" %
                      (eop, batch_id, train_batch_elapse))
                batch_id += 1
            epoch_end_time = time.time()
            print("\nAfter Training Epoch {} time is: {:.4f}".format(
                eop + 1, epoch_end_time - epoch_start_time))
def forward(self, xforward):
    """xforward, xreverse = B T C H W tensors."""
    xreverse = xforward[:, ::-1, :, :, :]
    y_out_fwd, _ = self.forward_net(xforward)
    y_out_rev, _ = self.reverse_net(xreverse)
    # outputs of the last CLSTM layer = B, T, C, H, W
    y_out_fwd = y_out_fwd[-1]
    y_out_rev = y_out_rev[-1]
    # flip the reversed branch back to the forward time order
    y_out_rev = y_out_rev[:, ::-1, :, :, :]
    # print(y_out_rev.shape)
    ycat = fluid.layers.concat([y_out_fwd, y_out_rev], axis=2)
    return ycat


if __name__ == '__main__':
    with fluid.dygraph.guard():
        input = np.random.randn(5, 20, 1280, 7, 7).astype('float32')
        x = to_variable(input)
        model = ConvBGRU(in_channels=1280,
                         hidden_channels=64,
                         kernel_size=(3, 3),
                         num_layers=2)
        out = model(x)
        print(out.shape)
def train(args):
    with fluid.dygraph.guard():
        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True
        ocr_attention = OCRAttention("ocr_attention")

        if Config.learning_rate_decay == "piecewise_decay":
            learning_rate = fluid.layers.piecewise_decay(
                [50000], [Config.LR, Config.LR * 0.01])
        else:
            learning_rate = Config.LR
        # use the learning rate computed above (it was previously hardcoded
        # to 0.001, which left the piecewise_decay branch unused)
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
        dy_param_init_value = {}
        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)

        train_reader = data_reader.train(
            Config.batch_size,
            max_length=Config.max_length,
            train_images_dir=args.train_images,
            train_list_file=args.train_list,
            cycle=args.total_step > 0,
            shuffle=True,
            model=args.model)
        infer_image = './data/data/test_images/'
        infer_files = './data/data/test.list'
        test_reader = data_reader.train(
            Config.batch_size,
            1000,
            train_images_dir=infer_image,
            train_list_file=infer_files,
            cycle=False,
            model=args.model)

        def eval():
            ocr_attention.eval()
            total_loss = 0.0
            total_step = 0.0
            equal_size = 0
            for data in test_reader():
                data_dict = get_attention_feeder_data(data)
                label_in = to_variable(data_dict["label_in"])
                label_out = to_variable(data_dict["label_out"])
                label_out._stop_gradient = True
                label_out.trainable = False
                img = to_variable(data_dict["pixel"])
                prediction = ocr_attention(img, label_in)
                prediction = fluid.layers.reshape(
                    prediction,
                    [label_out.shape[0] * label_out.shape[1], -1],
                    inplace=False)
                score, topk = layers.topk(prediction, 1)
                seq = topk.numpy()
                seq = seq.reshape((args.batch_size, -1))
                mask = data_dict['mask'].reshape((args.batch_size, -1))
                seq_len = np.sum(mask, -1)
                trans_ref = data_dict["label_out"].reshape((args.batch_size, -1))
                for i in range(args.batch_size):
                    length = int(seq_len[i] - 1)
                    trans = seq[i][:length - 1]
                    ref = trans_ref[i][:length - 1]
                    if np.array_equal(trans, ref):
                        equal_size += 1
                total_step += args.batch_size
            print("eval cost", equal_size / total_step)

        total_step = 0
        epoch_num = 20
        for epoch in range(epoch_num):
            batch_id = 0
            total_loss = 0.0
            for data in train_reader():
                total_step += 1
                data_dict = get_attention_feeder_data(data)
                label_in = to_variable(data_dict["label_in"])
                label_out = to_variable(data_dict["label_out"])
                label_out._stop_gradient = True
                label_out.trainable = False
                img = to_variable(data_dict["pixel"])
                prediction = ocr_attention(img, label_in)
                prediction = fluid.layers.reshape(
                    prediction,
                    [label_out.shape[0] * label_out.shape[1], -1],
                    inplace=False)
                label_out = fluid.layers.reshape(label_out, [-1, 1],
                                                 inplace=False)
                loss = fluid.layers.cross_entropy(input=prediction,
                                                  label=label_out)
                mask = to_variable(data_dict["mask"])
                loss = layers.elementwise_mul(loss, mask, axis=0)
                avg_loss = fluid.layers.reduce_sum(loss)
                total_loss += avg_loss.numpy()
                avg_loss.backward()
                optimizer.minimize(avg_loss, grad_clip=grad_clip)
                ocr_attention.clear_gradients()
                framework._dygraph_tracer()._clear_ops()

                if batch_id > 0 and batch_id % 1000 == 0:
                    print("epoch: {}, batch_id: {}, loss {}".format(
                        epoch, batch_id,
                        total_loss / args.batch_size / 1000))
                    total_loss = 0.0
                if total_step > 0 and total_step % 2000 == 0:
                    model_value = ocr_attention.state_dict()
                    np.savez("model/" + str(total_step), **model_value)
                    ocr_attention.eval()
                    eval()
                    ocr_attention.train()
                batch_id += 1
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None

for i in range(batch_num):
    x_data = np.arange(12).reshape(4, 3).astype('int64')
    y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
    x_data = x_data.reshape((-1, num_steps, 1))
    y_data = y_data.reshape((-1, 1))
    init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                dtype='float32')
    init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                              dtype='float32')
    x = to_variable(x_data)
    y = to_variable(y_data)
    init_hidden = to_variable(init_hidden_data)
    init_cell = to_variable(init_cell_data)
    outs = ptb_model(x, y, init_hidden, init_cell)
    dy_loss, last_hidden, last_cell = outs

    if i == 0:
        for param in ptb_model.parameters():
            dy_param_init[param.name] = param.numpy()

    dy_loss.backward()
    sgd.minimize(dy_loss)
    ptb_model.clear_gradients()

    if i == batch_num - 1:
        for param in ptb_model.parameters():
            dy_param_updated[param.name] = param.numpy()
def func_testSetNumpy(self):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200

    with fluid.dygraph.guard():
        paddle.seed(seed)
        paddle.framework.random._manual_program_seed(seed)
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        bd = []
        lr_arr = [1.0]
        # this is a fake lr decay strategy
        for i in range(1, 10):
            bd.append(100 * i)
            new_lr = 1.0
            lr_arr.append(new_lr)

        place = fluid.CPUPlace(
        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
        adam = Adam(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr),
                    parameter_list=ptb_model.parameters())

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            if i == 0:
                for param in ptb_model.parameters():
                    dy_param_init[param.name] = param.numpy()
            dy_loss.backward()
            adam.minimize(dy_loss)
            ptb_model.clear_gradients()
            if i == batch_num - 1:
                for param in ptb_model.parameters():
                    dy_param_updated[param.name] = param.numpy()

        # check optimizer
        opti_dict = adam.state_dict()
        np_opti_dict = {}
        # set to zero
        for k, v in opti_dict.items():
            if isinstance(v, (core.VarBase, core.eager.Tensor)):
                np_t = v.numpy()
                np_opti_dict[v.name] = np_t
                var = v.value().get_tensor()
                var.set(np.zeros_like(np_t), place)
                self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
            else:
                np_opti_dict[k] = v

        if isinstance(adam._learning_rate, LearningRateDecay):
            adam._learning_rate.step_num = 0

        adam.set_state_dict(np_opti_dict)
        opti_dict = adam.state_dict()
        for k, v in opti_dict.items():
            if isinstance(v, (core.VarBase, core.eager.Tensor)):
                self.assertTrue(
                    np.array_equal(v.numpy(), self.base_opti[v.name]))
            else:
                self.assertEqual(v, self.base_opti[k])

        # check parameters
        state_dict = ptb_model.state_dict()
        np_state_dict = {}
        for k, v in state_dict.items():
            np_t = v.numpy()
            np_state_dict[k] = np_t
            var = v.value().get_tensor()
            var.set(np.zeros_like(np_t), place)

        ptb_model.set_state_dict(np_state_dict)
        state_dict = ptb_model.state_dict()
        for k, v in state_dict.items():
            new_t = v.numpy()
            base_t = self.model_base[k]
            self.assertTrue(np.array_equal(new_t, base_t))
def train():
    place = fluid.CUDAPlace(0) if cfg.use_cuda else fluid.CPUPlace()
    if cfg.train_model == 'deepfm':
        with fluid.dygraph.guard(place):
            model = DeepFM()
    elif cfg.train_model == 'dnnplus':
        with fluid.dygraph.guard(place):
            model = DNNPlus()
    elif cfg.train_model == 'dnn':
        with fluid.dygraph.guard(place):
            model = DNN()
    elif cfg.train_model == 'drnn':
        with fluid.dygraph.guard(place):
            model = DRNN()

    with fluid.dygraph.guard(place):
        optimizer = fluid.optimizer.Adam(
            learning_rate=cfg.learning_rate,
            parameter_list=model.parameters(),
            regularization=fluid.regularizer.L2DecayRegularizer(cfg.reg))
        # optimizer = fluid.optimizer.SGD(learning_rate=cfg.learning_rate,
        #                                 parameter_list=model.parameters())

        file_list = [
            os.path.join(cfg.train_files_path, x)
            for x in os.listdir(cfg.train_files_path)
        ]
        train_reader = data_reader(cfg.batch_size,
                                   file_list,
                                   cfg.feat_dict,
                                   data_type="train")

        start_epoch = 0
        if cfg.checkpoint:
            model_dict, optimizer_dict = fluid.dygraph.load_dygraph(
                cfg.checkpoint)
            model.set_dict(model_dict)
            optimizer.set_dict(optimizer_dict)
            start_epoch = int(
                os.path.basename(cfg.checkpoint).split("_")[-1])  # get next train epoch
            logger.info("load model {} finished.".format(cfg.checkpoint))

        logger.info("Training Begin")
        for epoch in range(start_epoch, cfg.epoches):
            start_time = time.time()
            total_loss = 0.0
            total_auc = 0.0
            count = 0
            auc_metric = fluid.metrics.Auc('ROC')

            if not os.path.isdir(os.path.join(cfg.log_dir, model.name)):
                os.makedirs(os.path.join(cfg.log_dir, model.name))
            log_path = os.path.join(cfg.log_dir, model.name,
                                    str(epoch + 1) + '_train_result.log')
            f = open(log_path, 'w+')

            model.train()
            for batch_id, data in enumerate(train_reader()):
                raw_feat_idx, raw_feat_value, label = zip(*data)
                raw_feat_idx = np.array(raw_feat_idx, dtype=np.int64)
                raw_feat_value = np.array(raw_feat_value, dtype=np.float32)
                label = np.array(label, dtype=np.int64)
                raw_feat_idx, raw_feat_value, label = [
                    to_variable(i)
                    for i in [raw_feat_idx, raw_feat_value, label]
                ]

                predict = model(raw_feat_idx, raw_feat_value, label)
                loss = fluid.layers.log_loss(
                    input=predict,
                    label=fluid.layers.cast(label, dtype="float32"))
                batch_loss = fluid.layers.reduce_sum(loss)
                total_loss += batch_loss.numpy().item()

                batch_loss.backward()
                optimizer.minimize(batch_loss)
                model.clear_gradients()

                count += 1
                predict_2d = fluid.layers.concat([1 - predict, predict], 1)
                auc_metric.update(preds=predict_2d.numpy(),
                                  labels=label.numpy())
                if (batch_id + 1) % cfg.log_interval == 0:
                    logger.info(
                        "epoch: %d, batch_id: %d, loss: %.6f, auc: %.6f" %
                        (epoch + 1, batch_id + 1,
                         total_loss / count / cfg.batch_size,
                         auc_metric.eval()))
                if (batch_id + 1) % cfg.log_interval_2 == 0:
                    f.write('%d,%d,%.4f,%.4f\n' %
                            (epoch + 1, batch_id + 1,
                             total_loss / count / cfg.batch_size,
                             auc_metric.eval()))

            end_time = time.time()
            logger.info("epoch %d finished, use time = %ds \n" %
                        ((epoch + 1), end_time - start_time))

            if (epoch + 1) % cfg.save_interval == 0:
                model_path = os.path.join(
                    str(cfg.save_path), model.name,
                    model.name + "_epoch_" + str(epoch + 1))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                logger.info("saving model to %s \n" % (model_path))
                fluid.dygraph.save_dygraph(model.state_dict(), model_path)
                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
            f.close()
            evaluate(model, epoch + 1)
    logger.info("Done.")
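# One detail worth calling out in the loop above: fluid.metrics.Auc expects a
# two-column probability matrix, so the single sigmoid output `predict` is
# concatenated with its complement before auc_metric.update(). A small NumPy
# sketch of that transform (the values below are illustrative):
import numpy as np

predict = np.array([[0.9], [0.2], [0.6]], dtype='float32')   # P(label == 1)
predict_2d = np.concatenate([1 - predict, predict], axis=1)  # columns: P(0), P(1)
print(predict_2d)
# [[0.1 0.9]
#  [0.8 0.2]
#  [0.4 0.6]]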
def func_testSetVariableBeforeTrain(self):
    seed = 90
    hidden_size = 10
    vocab_size = 1000
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 200

    with fluid.dygraph.guard():
        # TODO: marsyang1993 Change seed to
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        place = fluid.CPUPlace(
        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
        adam = Adam(learning_rate=0.0,
                    beta1=0.8,
                    beta2=0.6,
                    parameter_list=ptb_model.parameters())
        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        adam.set_state_dict(self.opti_dict)
        ptb_model.set_state_dict(self.state_dict)

        for i in range(1):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            x = to_variable(x_data)
            y = to_variable(y_data)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            dy_loss.backward()
            adam.minimize(dy_loss)
            ptb_model.clear_gradients()

            opti_dict = adam.state_dict()
            for k, v in opti_dict.items():
                if k == "global_step":
                    self.assertTrue(
                        np.array_equal(v.numpy(),
                                       self.base_opti[v.name] + 1))
                if k.find("beta1_pow_acc_0") > 0:
                    self.assertTrue(
                        np.array_equal(v.numpy(),
                                       self.base_opti[v.name] * adam._beta1))
                if k.find("beta2_pow_acc_0") > 0:
                    self.assertTrue(
                        np.array_equal(v.numpy(),
                                       self.base_opti[v.name] * adam._beta2))

            state_dict = ptb_model.state_dict()
            for k, v in state_dict.items():
                new_t = v.numpy()
                base_t = self.model_base[k]
                self.assertTrue(np.array_equal(new_t, base_t))
def simple_net_float32(self, is_sparse, dtype):
    places = [fluid.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for place in places:
        seed = 90
        hidden_size = 10
        vocab_size = 1000
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 200

        for is_sort_sum_gradient in [True, False]:
            with fluid.dygraph.guard(place):
                paddle.seed(seed)
                paddle.framework.random._manual_program_seed(seed)

                simple_net = SimpleNet(hidden_size=hidden_size,
                                       vocab_size=vocab_size,
                                       num_steps=num_steps,
                                       init_scale=init_scale,
                                       is_sparse=is_sparse,
                                       dtype=dtype)

                sgd = SGDOptimizer(learning_rate=1e-3,
                                   parameter_list=simple_net.parameters())
                dy_param_updated = dict()
                dy_param_init = dict()
                dy_loss = None

                helper = DyGraphProgramDescTracerTestHelper(self)
                fluid.set_flags(
                    {'FLAGS_sort_sum_gradient': is_sort_sum_gradient})

                for i in range(batch_num):
                    x_data = np.arange(12).reshape(4, 3).astype('int64')
                    y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                    x_data = x_data.reshape((-1, num_steps))
                    y_data = y_data.reshape((-1, 1))

                    x = to_variable(x_data)
                    y = to_variable(y_data)
                    outs = simple_net(x, y)
                    dy_loss = outs
                    if i == 0:
                        for param in simple_net.parameters():
                            dy_param_init[param.name] = param.numpy()
                    dy_loss.backward()
                    sgd.minimize(dy_loss)
                    sgd.clear_gradients()
                    if i == batch_num - 1:
                        for param in simple_net.parameters():
                            dy_param_updated[param.name] = param.numpy()
                dy_loss_value = dy_loss.numpy()

            with new_program_scope():
                paddle.seed(seed)
                paddle.framework.random._manual_program_seed(seed)

                simple_net = SimpleNet(hidden_size=hidden_size,
                                       vocab_size=vocab_size,
                                       num_steps=num_steps,
                                       is_sparse=is_sparse,
                                       dtype=dtype)

                exe = fluid.Executor(place)
                sgd = SGDOptimizer(learning_rate=1e-3)
                x = fluid.layers.data(name="x",
                                      shape=[-1, num_steps],
                                      dtype='int64')
                y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)
                static_loss = simple_net(x, y)
                sgd.minimize(static_loss)
                static_param_updated = dict()
                static_param_init = dict()
                static_param_name_list = list()
                for param in simple_net.parameters():
                    static_param_name_list.append(param.name)

                out = exe.run(fluid.default_startup_program(),
                              fetch_list=static_param_name_list)
                for i in range(len(static_param_name_list)):
                    static_param_init[static_param_name_list[i]] = out[i]

                static_loss_value = None
                for i in range(batch_num):
                    x_data = np.arange(12).reshape(4, 3).astype('int64')
                    y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                    x_data = x_data.reshape((-1, num_steps))
                    y_data = y_data.reshape((-1, 1))
                    fetch_list = [static_loss]
                    fetch_list.extend(static_param_name_list)
                    out = exe.run(fluid.default_main_program(),
                                  feed={"x": x_data, "y": y_data},
                                  fetch_list=fetch_list)
                    static_loss_value = out[0]

                    if i == batch_num - 1:
                        for k in range(3, len(out)):
                            static_param_updated[
                                static_param_name_list[k - 1]] = out[k]

            self.assertTrue(
                np.allclose(static_loss_value, dy_loss_value, rtol=1e-3))
            for key, value in six.iteritems(static_param_init):
                self.assertTrue(np.array_equal(value, dy_param_init[key]))
            for key, value in six.iteritems(static_param_updated):
                self.assertTrue(np.array_equal(value, dy_param_updated[key]))
def test_gnn_float32(self):
    paddle.manual_seed(90)
    paddle.framework.random._manual_program_seed(90)
    startup = fluid.Program()
    main = fluid.Program()

    scope = fluid.core.Scope()
    with new_program_scope(main=main, startup=startup, scope=scope):
        features = fluid.layers.data(name='features',
                                     shape=[1, 100, 50],
                                     dtype='float32',
                                     append_batch_size=False)
        # Use selected rows when it's supported.
        adj = fluid.layers.data(name='adj',
                                shape=[1, 100, 100],
                                dtype='float32',
                                append_batch_size=False)
        labels = fluid.layers.data(name='labels',
                                   shape=[100, 1],
                                   dtype='int64',
                                   append_batch_size=False)

        model = GCN('test_gcn', 50)
        logits = model(features, adj)
        logits = fluid.layers.reshape(logits, logits.shape[1:])
        # In other example, it's nll with log_softmax. However, paddle's
        # log_loss only supports binary classification now.
        loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
        loss = fluid.layers.reduce_sum(loss)

        adam = AdamOptimizer(learning_rate=1e-3)
        adam.minimize(loss)
        exe = fluid.Executor(fluid.CPUPlace(
        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
        exe.run(startup)
        static_loss = exe.run(
            feed={
                'features': np.ones([1, 100, 50], dtype=np.float32),
                'adj': np.ones([1, 100, 100], dtype=np.float32),
                'labels': np.ones([100, 1], dtype=np.int64)
            },
            fetch_list=[loss])[0]

        static_weight = np.array(
            scope.find_var(model.gc.weight.name).get_tensor())

    with fluid.dygraph.guard():
        paddle.manual_seed(90)
        paddle.framework.random._manual_program_seed(90)
        features = np.ones([1, 100, 50], dtype=np.float32)
        # Use selected rows when it's supported.
        adj = np.ones([1, 100, 100], dtype=np.float32)
        labels = np.ones([100, 1], dtype=np.int64)

        model = GCN('test_gcn', 50)
        logits = model(to_variable(features), to_variable(adj))
        logits = fluid.layers.reshape(logits, logits.shape[1:])
        # In other example, it's nll with log_softmax. However, paddle's
        # log_loss only supports binary classification now.
        loss = fluid.layers.softmax_with_cross_entropy(logits,
                                                       to_variable(labels))
        loss = fluid.layers.reduce_sum(loss)
        loss.backward()
        adam = AdamOptimizer(learning_rate=1e-3,
                             parameter_list=model.parameters())

        adam.minimize(loss)
        model.clear_gradients()
        loss_value = loss.numpy()
        model_gc_weight_value = model.gc.weight.numpy()

    with fluid.dygraph.guard():
        paddle.manual_seed(90)
        paddle.framework.random._manual_program_seed(90)
        features2 = np.ones([1, 100, 50], dtype=np.float32)
        # Use selected rows when it's supported.
        adj2 = np.ones([1, 100, 100], dtype=np.float32)
        labels2 = np.ones([100, 1], dtype=np.int64)

        model2 = GCN('test_gcn', 50)
        logits2 = model2(to_variable(features2), to_variable(adj2))
        logits2 = fluid.layers.reshape(logits2, logits2.shape[1:])
        # In other example, it's nll with log_softmax. However, paddle's
        # log_loss only supports binary classification now.
        loss2 = fluid.layers.softmax_with_cross_entropy(
            logits2, to_variable(labels2))
        loss2 = fluid.layers.reduce_sum(loss2)
        loss2.backward()
        adam2 = AdamOptimizer(learning_rate=1e-3,
                              parameter_list=model2.parameters())

        adam2.minimize(loss2)
        model2.clear_gradients()
        loss2_value = loss2.numpy()
        model2_gc_weight_value = model2.gc.weight.numpy()

    self.assertEqual(static_loss, loss_value)
    self.assertTrue(np.allclose(static_weight, model_gc_weight_value))
    self.assertEqual(static_loss, loss2_value)
    self.assertTrue(np.allclose(static_weight, model2_gc_weight_value))
    sys.stderr.write('%s %s\n' % (static_loss, loss_value))
def evaluate():
    place = fluid.CUDAPlace(0) if cfg.use_cuda else fluid.CPUPlace()
    inference_scope = fluid.Scope()
    test_files = [
        os.path.join(cfg.evaluate_file_path, x)
        for x in os.listdir(cfg.evaluate_file_path)
    ]
    dataset = CriteoDataset()
    test_reader = paddle.batch(dataset.test(test_files),
                               batch_size=cfg.batch_size)

    with fluid.dygraph.guard(place):
        if cfg.train_model == 'drnn':
            model = DRNN()
        elif cfg.train_model == 'dnn':
            model = DNN()
        elif cfg.train_model == 'fcdnn':
            model = FCDNN()
        model_path = os.path.join(cfg.save_path, model.name,
                                  model.name + "_epoch_" + str(cfg.test_epoch))
        model_dict, optimizer_dict = fluid.dygraph.load_dygraph(model_path)
        model.set_dict(model_dict)
        logger.info("load model {} finished.".format(model_path))

        model.eval()
        logger.info('Begin evaluate model.')
        run_index = 0
        infer_auc = 0.0
        L = []
        for batch_id, data in enumerate(test_reader()):
            dense_feature, sparse_feature, label = zip(*data)
            sparse_feature = np.array(sparse_feature, dtype=np.int64)
            dense_feature = np.array(dense_feature, dtype=np.float32)
            label = np.array(label, dtype=np.int64)
            sparse_feature, dense_feature, label = [
                to_variable(i)
                for i in [sparse_feature, dense_feature, label]
            ]

            avg_cost, auc_var = model(dense_feature, sparse_feature, label)

            run_index += 1
            infer_auc += auc_var.numpy().item()
            L.append(avg_cost.numpy() / cfg.batch_size)

            if batch_id % cfg.log_interval == 0:
                logger.info("TEST --> batch: {} loss: {} auc: {}".format(
                    batch_id, avg_cost.numpy() / cfg.batch_size,
                    infer_auc / run_index))

        infer_loss = np.mean(L)
        infer_auc = infer_auc / run_index
        infer_result = {}
        infer_result['loss'] = infer_loss
        infer_result['auc'] = infer_auc

        if not os.path.isdir(cfg.log_dir):
            os.makedirs(cfg.log_dir)
        log_path = os.path.join(cfg.log_dir, model.name + '_infer_result.log')
        logger.info(str(infer_result))
        with open(log_path, 'w+') as f:
            f.write(str(infer_result))
        logger.info("Done.")
        return infer_result
# Output: output, (hn, cn)
# Input data format:
#   input(seq_len, batch, input_size)
#   h0(num_layers * num_directions, batch, hidden_size)
#   c0(num_layers * num_directions, batch, hidden_size)
# Output data format:
#   output(seq_len, batch, hidden_size * num_directions)
#   hn(num_layers * num_directions, batch, hidden_size)
#   cn(num_layers * num_directions, batch, hidden_size)

# x = torch.rand(7, 3, 151)
# net = nn.LSTM(input_size=151, hidden_size=128, num_layers=1, batch_first=True)
# y, _ = net(x)
# print(y.shape)

import paddle.fluid as fluid
import paddle.fluid.dygraph.base as base
import numpy

D = 151
T = 1  # sum(lod[0])

input = numpy.random.rand(T, 3 * D).astype('float32')
hidden_input = numpy.random.rand(T, D).astype('float32')

with fluid.dygraph.guard():
    gru = fluid.dygraph.GRUUnit(size=D * 3)
    h, r, g = gru(base.to_variable(input), base.to_variable(hidden_input))
    print(h.shape, r.shape, g.shape)
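# To make the 3 * D input layout above concrete: GRUUnit consumes the three
# gate projections of the input packed along the feature axis and combines
# them with the previous hidden state. Below is a minimal NumPy sketch of one
# step of a standard GRU cell. The packing order and gate formulas follow the
# common GRU formulation and are illustrative assumptions, not a line-for-line
# reimplementation of Paddle's kernel.
import numpy as np

def gru_step(x_packed, h_prev, U_u, U_r, U_c):
    """One standard GRU step. x_packed has shape [batch, 3*D]: the input
    projections for the update, reset, and candidate gates, concatenated."""
    D = h_prev.shape[1]
    x_u, x_r, x_c = x_packed[:, :D], x_packed[:, D:2 * D], x_packed[:, 2 * D:]
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    u = sigmoid(x_u + h_prev @ U_u)          # update gate
    r = sigmoid(x_r + h_prev @ U_r)          # reset gate
    c = np.tanh(x_c + (r * h_prev) @ U_c)    # candidate state
    return u * h_prev + (1.0 - u) * c        # new hidden state

D, batch = 4, 2
rng = np.random.default_rng(0)
h = gru_step(rng.standard_normal((batch, 3 * D)),
             rng.standard_normal((batch, D)),
             *(rng.standard_normal((D, D)) for _ in range(3)))
print(h.shape)  # (2, 4)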
def test_mnist_sort_gradient_float32(self):
    seed = 90
    epoch_num = 1

    with fluid.dygraph.guard():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True

        mnist2 = MNIST("mnist")
        sgd2 = SGDOptimizer(learning_rate=1e-3)
        train_reader2 = paddle.batch(paddle.dataset.mnist.train(),
                                     batch_size=128,
                                     drop_last=True)

        mnist2.train()
        dy_param_init_value2 = {}
        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_reader2()):
                dy_x_data2 = np.array([
                    x[0].reshape(1, 28, 28) for x in data
                ]).astype('float32')
                y_data2 = np.array([x[1] for x in data
                                    ]).astype('int64').reshape(128, 1)

                img2 = to_variable(dy_x_data2)
                label2 = to_variable(y_data2)
                label2.stop_gradient = True

                cost2 = mnist2(img2)
                loss2 = fluid.layers.cross_entropy(cost2, label2)
                avg_loss2 = fluid.layers.mean(loss2)
                dy_out2 = avg_loss2.numpy()

                if epoch == 0 and batch_id == 0:
                    for param in mnist2.parameters():
                        dy_param_init_value2[param.name] = param.numpy()

                avg_loss2.backward(backward_strategy)
                sgd2.minimize(avg_loss2)
                mnist2.clear_gradients()

                dy_param_value2 = {}
                for param in mnist2.parameters():
                    dy_param_value2[param.name] = param.numpy()
                if batch_id == 20:
                    break

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed

        exe = fluid.Executor(fluid.CPUPlace(
        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

        mnist = MNIST("mnist")
        sgd = SGDOptimizer(learning_rate=1e-3)
        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=128,
                                    drop_last=True)

        img = fluid.layers.data(name='pixel',
                                shape=[1, 28, 28],
                                dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        cost = mnist(img)
        loss = fluid.layers.cross_entropy(cost, label)
        avg_loss = fluid.layers.mean(loss)
        sgd.minimize(avg_loss)

        # initialize params and fetch them
        static_param_init_value = {}
        static_param_name_list = []
        for param in mnist.parameters():
            static_param_name_list.append(param.name)

        out = exe.run(fluid.default_startup_program(),
                      fetch_list=static_param_name_list)

        for i in range(len(static_param_name_list)):
            static_param_init_value[static_param_name_list[i]] = out[i]

        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_reader()):
                static_x_data = np.array([
                    x[0].reshape(1, 28, 28) for x in data
                ]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape([128, 1])

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={"pixel": static_x_data, "label": y_data},
                              fetch_list=fetch_list)

                static_param_value = {}
                static_out = out[0]
                for i in range(1, len(out)):
                    static_param_value[static_param_name_list[i - 1]] = out[i]
                if batch_id == 20:
                    break

    self.assertTrue(np.allclose(dy_x_data2.all(), static_x_data.all()))

    for key, value in six.iteritems(static_param_init_value):
        self.assertTrue(np.allclose(value, dy_param_init_value2[key]))

    self.assertTrue(np.allclose(static_out, dy_out2))

    for key, value in six.iteritems(static_param_value):
        self.assertTrue(np.allclose(value, dy_param_value2[key], atol=1e-5))
def forward(self, input):
    N, C, H, W = input.shape
    _, _, res4, res5 = self.resnet(input)
    feature = self.connect_conv(res5)
    feature = self.connect_bn(feature)
    feature = self.connect_relu(feature)
    gru_output = self.gru_module(feature)
    dropout = self.dropout(gru_output)
    logit = self.get_logit_conv(dropout)
    logit = F.common.interpolate(logit, size=[H, W], mode='BILINEAR')
    if 1:  # the auxiliary head is always enabled in this snippet
        aux_logit = self.auxhead(res4)
        aux_logit = F.common.interpolate(aux_logit, size=[H, W],
                                         mode='BILINEAR')
        return logit, aux_logit
    return logit


if __name__ == "__main__":
    from paddle.fluid.dygraph.base import to_variable
    import numpy as np

    with fluid.dygraph.guard():
        model = GruModule(input_channel=512, num_state=128, num_node=64)
        data = np.random.uniform(-1, 1, [2, 512, 96, 96]).astype('float32')
        data = to_variable(data)
        y = model(data)
        print(y.shape)
def train_ptb_lm():
    args = parse_args()

    # check whether use_gpu=True was set on a CPU-only paddlepaddle build
    model_check.check_cuda(args.use_gpu)

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    # check whether the installed paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type
    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1

        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                raise ValueError("The pretrained params at %s do not exist." %
                                 args.init_from_pretrain_model)
            # load and apply the pretrained parameters (the original discarded
            # the loaded state dict)
            model_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(model_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished loading data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr),
                           parameter_list=ptb_model.parameters(),
                           grad_clip=grad_clip)

        def reader_decorator(reader):
            def __reader__():
                for item in reader:
                    x_data = item[0].reshape((-1, num_steps, 1))
                    y_data = item[1].reshape((-1, num_steps, 1))
                    yield x_data, y_data

            return __reader__

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            model.eval()
            eval_data_iter = reader_decorator(
                reader.get_data_iter(data, batch_size, num_steps))
            eval_data_loader = fluid.io.DataLoader.from_generator(capacity=200)
            eval_data_loader.set_batch_generator(eval_data_iter, places=place)

            for batch_id, batch in enumerate(eval_data_loader):
                x, y = batch
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                # NOTE: the original called ptb_model here; using the `model`
                # argument keeps eval() reusable for any model passed in
                dy_loss, last_hidden, last_cell = model(x, y, init_hidden,
                                                        init_cell)

                out_loss = dy_loss.numpy()
                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])

        ce_time = []
        ce_ppl = []
        total_batch_num = 0  # this is for benchmark
        for epoch_id in range(max_epoch):
            epoch_start = time.time()

            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader_decorator(
                reader.get_data_iter(train_data, batch_size, num_steps))
            train_data_loader = fluid.io.DataLoader.from_generator(
                capacity=200)
            train_data_loader.set_batch_generator(train_data_iter,
                                                  places=place)

            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)

            batch_cost_avg = TimeCostAverage()
            reader_cost_avg = TimeCostAverage()
            batch_start = time.time()
            for batch_id, batch in enumerate(train_data_loader):
                if args.max_iter and total_batch_num == args.max_iter:
                    return
                train_reader_cost = time.time() - batch_start
                reader_cost_avg.record(train_reader_cost)

                x, y = batch
                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                # detach so gradients do not flow across step boundaries
                init_hidden = last_hidden.detach()
                init_cell = last_cell.detach()
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()

                global_lr = sgd._global_learning_rate().numpy()
                total_loss += out_loss
                iters += num_steps
                total_batch_num = total_batch_num + 1  # this is for benchmark

                train_batch_cost = time.time() - batch_start
                batch_cost_avg.record(train_batch_cost)

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, "
                        "loss: %.5f, batch_cost: %.5f sec, reader_cost: %.5f "
                        "sec, ips: %.5f words/sec"
                        % (epoch_id, batch_id, ppl[0], global_lr, out_loss,
                           batch_cost_avg.get_average(),
                           reader_cost_avg.get_average(),
                           batch_size / batch_cost_avg.get_average()))
                    batch_cost_avg.reset()
                    reader_cost_avg.reset()

                batch_start = time.time()

            ppl = np.exp(total_loss / iters)
            train_epoch_cost = time.time() - epoch_start
            print("-- Epoch:[%d]; ppl: %.5f, epoch_cost: %.5f s" %
                  (epoch_id, ppl[0], train_epoch_cost))
            ce_time.append(train_epoch_cost)
            ce_ppl.append(ppl[0])

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # with a bad initialization the loss stays above 1000 after
                # the first epoch; there is no point in continuing
                print(
                    "Parameters are randomly initialized and not good this "
                    "time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            eval(ptb_model, valid_data)

        if args.ce:
            _ppl = 0
            _time = 0
            try:
                _time = ce_time[-1]
                _ppl = ce_ppl[-1]
            except IndexError:
                print("ce info error")
            print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time))
            print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl))

        eval(ptb_model, test_data)
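# How the piecewise schedule above unfolds for the "small" config
# (base_learning_rate=1.0, lr_decay=0.5, epoch_start_decay=4): the learning
# rate stays at 1.0 for the first four epochs and is then halved every epoch.
# total_batch_size=100 is an assumed value for illustration only; in the
# function above it is derived from the data.
base_learning_rate, lr_decay, epoch_start_decay, max_epoch = 1.0, 0.5, 4, 13
total_batch_size = 100
bd, lr_arr = [], [1.0]
for i in range(1, max_epoch):
    bd.append(total_batch_size * i)  # boundary at the start of epoch i
    lr_arr.append(base_learning_rate *
                  lr_decay**max(i + 1 - epoch_start_decay, 0.0))
print(bd[:5])      # [100, 200, 300, 400, 500]
print(lr_arr[:6])  # [1.0, 1.0, 1.0, 1.0, 0.5, 0.25]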
    if model_depth == 10:
        model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs)
    elif model_depth == 18:
        model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs)
    elif model_depth == 34:
        model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs)
    elif model_depth == 50:
        model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs)
    elif model_depth == 101:
        model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs)
    elif model_depth == 152:
        model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs)
    elif model_depth == 200:
        model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs)
    return model


if __name__ == "__main__":
    with fluid.dygraph.guard():
        """
        Input: the input Tensor has shape [N, C_in, D_in, H_in, W_in]
        """
        x = np.random.randn(10, 3, 8, 224, 224).astype('float32')
        x = to_variable(x)
        net = generate_model(10, conv1_t_size=8)
        # net = FrameSubNet(3, 3)
        out = net(x)
        print(out.shape)
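# The depth dispatch above can also be written as a lookup table; an
# unsupported depth then raises an explicit error instead of falling through
# to `return model` with `model` unbound. Block names are kept as strings so
# this sketch runs standalone; in the real module they would be the
# BasicBlock/Bottleneck classes themselves.
_DEPTH_CONFIGS = {
    10: ('BasicBlock', [1, 1, 1, 1]),
    18: ('BasicBlock', [2, 2, 2, 2]),
    34: ('BasicBlock', [3, 4, 6, 3]),
    50: ('Bottleneck', [3, 4, 6, 3]),
    101: ('Bottleneck', [3, 4, 23, 3]),
    152: ('Bottleneck', [3, 8, 36, 3]),
    200: ('Bottleneck', [3, 24, 36, 3]),
}


def resolve_depth(model_depth):
    try:
        return _DEPTH_CONFIGS[model_depth]
    except KeyError:
        raise ValueError('unsupported model depth: %r' % model_depth)


print(resolve_depth(50))  # ('Bottleneck', [3, 4, 6, 3])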
    simple_net = fluid.dygraph.parallel.DataParallel(simple_net, strategy)

    train_reader = paddle.batch(ptb_train_reader(),
                                batch_size=batch_size,
                                drop_last=True)
    train_reader = fluid.contrib.reader.distributed_batch_reader(train_reader)

    sgd = fluid.optimizer.SGD(learning_rate=1e-3,
                              parameter_list=simple_net.parameters())

    dy_loss = None
    for i, data in enumerate(train_reader()):
        x_data = np.array([x[0].reshape(3) for x in data]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in data]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = to_variable(x_data)
        y = to_variable(y_data)

        dy_loss = simple_net(x, y)

        dy_loss = simple_net.scale_loss(dy_loss)
        dy_loss.backward()
        simple_net.apply_collective_grads()

        sgd.minimize(dy_loss)
        simple_net.clear_gradients()

        dy_loss_value = dy_loss.numpy()
        print("- dygraph loss: %.6f" % dy_loss_value[0])
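# Why scale_loss() comes before backward(): apply_collective_grads()
# all-reduces (sums) gradients across trainers, so dividing each trainer's
# loss by the trainer count first makes the summed gradient equal to the
# average gradient, matching single-card training. A plain-numpy
# illustration with two assumed trainers:
import numpy as np

nranks = 2
per_trainer_grads = [np.array([0.2, 0.4]), np.array([0.6, 0.8])]
all_reduced = sum(g / nranks for g in per_trainer_grads)  # scale, then sum
print(all_reduced)                         # [0.4 0.6]
print(np.mean(per_trainer_grads, axis=0))  # [0.4 0.6] -- identical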
    learning_rate=0.001,
    regularization=fluid.regularizer.L2Decay(0.0005))
# NOTE: the opening of the optimizer call above is truncated in the source;
# only these two arguments survive.

train_loader = multithread_loader(TRAINDIR, batch_size=10, mode='train')
valid_loader = multithread_loader(VALIDDIR, batch_size=10, mode='valid')

MAX_EPOCH = 300  # tuning point: the number of training epochs can be changed
for epoch in range(MAX_EPOCH):
    for i, data in enumerate(train_loader()):
        img, gt_boxes, gt_labels, img_scale = data
        # every ground-truth box gets an equal score of 1.0
        gt_scores = np.ones(gt_labels.shape).astype('float32')
        gt_scores = to_variable(gt_scores)
        img = to_variable(img)
        gt_boxes = to_variable(gt_boxes)
        gt_labels = to_variable(gt_labels)
        outputs = model(img)
        loss = model.get_loss(outputs,
                              gt_boxes,
                              gt_labels,
                              gtscore=gt_scores,
                              anchors=ANCHORS,
                              anchor_masks=ANCHOR_MASKS,
                              ignore_thresh=IGNORE_THRESH,
                              use_label_smooth=False)
        loss.backward()
        opt.minimize(loss)
        # assumed addition: dygraph accumulates gradients, so they must be
        # cleared each step; the (truncated) source does not show this call
        model.clear_gradients()
    def test_while_op(self):
        seed = 90
        epoch_num = 1
        if core.is_compiled_with_cuda():
            batch_num = 3
        else:
            batch_num = 2
        # the original assigned to np.random.seed instead of calling it
        np.random.seed(seed)

        image_np = np.random.randn(Config.batch_size, Config.DATA_SHAPE[0],
                                   Config.DATA_SHAPE[1],
                                   Config.DATA_SHAPE[2]).astype('float32')
        label_in_np = np.arange(
            0, Config.max_length,
            dtype='int64').reshape([1, Config.max_length])
        for i in range(2, Config.batch_size + 1):
            label_in_np = np.vstack((label_in_np, np.arange(
                (i - 1) * Config.max_length,
                i * Config.max_length,
                dtype='int64').reshape([1, Config.max_length])))
        label_out_np = np.arange(
            0, Config.max_length,
            dtype='int64').reshape([1, Config.max_length])
        for i in range(2, Config.batch_size + 1):
            label_out_np = np.vstack((label_out_np, np.arange(
                (i - 1) * Config.max_length,
                i * Config.max_length,
                dtype='int64').reshape([1, Config.max_length])))

        with fluid.dygraph.guard():
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = fluid.layers.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01])
            else:
                learning_rate = Config.LR
            # NOTE: `learning_rate` above is computed but unused; 0.001 is
            # hard-coded here (kept as in the source)
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.001,
                parameter_list=ocr_attention.parameters())

            dy_param_init_value = {}
            for param in ocr_attention.parameters():
                dy_param_init_value[param.name] = param.numpy()

            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    label_in = to_variable(label_in_np)
                    label_out = to_variable(label_out_np)
                    label_out.stop_gradient = True
                    img = to_variable(image_np)
                    dy_prediction = ocr_attention(img, label_in)
                    label_out = fluid.layers.reshape(label_out, [-1, 1],
                                                     inplace=False)
                    dy_prediction = fluid.layers.reshape(
                        dy_prediction, [label_out.shape[0], -1],
                        inplace=False)
                    loss = fluid.layers.cross_entropy(input=dy_prediction,
                                                      label=label_out)
                    avg_loss = fluid.layers.reduce_sum(loss)

                    dy_out = avg_loss.numpy()

                    if epoch == 0 and batch_id == 0:
                        for param in ocr_attention.parameters():
                            if param.name not in dy_param_init_value:
                                dy_param_init_value[param.name] = param.numpy()
                    avg_loss.backward()
                    dy_grad_value = {}
                    for param in ocr_attention.parameters():
                        if param.trainable:
                            np_array = np.array(
                                param._grad_ivar().value().get_tensor())
                            dy_grad_value[param.name +
                                          core.grad_var_suffix()] = np_array

                    optimizer.minimize(avg_loss)
                    ocr_attention.clear_gradients()
                    dy_param_value = {}
                    for param in ocr_attention.parameters():
                        dy_param_value[param.name] = param.numpy()

        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = fluid.layers.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01])
            else:
                learning_rate = Config.LR

            optimizer = fluid.optimizer.SGD(learning_rate=0.001)

            images = fluid.layers.data(name='pixel',
                                       shape=Config.DATA_SHAPE,
                                       dtype='float32')
            static_label_in = fluid.layers.data(name='label_in',
                                                shape=[1],
                                                dtype='int64',
                                                lod_level=0)
            static_label_out = fluid.layers.data(name='label_out',
                                                 shape=[1],
                                                 dtype='int64',
                                                 lod_level=0)
            static_label_out.stop_gradient = True
            static_label_out.trainable = False

            static_prediction = ocr_attention(images, static_label_in)
            static_prediction = fluid.layers.reshape(
                static_prediction, shape=[-1, Config.num_classes + 2])

            cost = fluid.layers.cross_entropy(input=static_prediction,
                                              label=static_label_out)
            static_avg_loss = fluid.layers.reduce_sum(cost)
            # param_grad_list = fluid.backward.append_backward(static_avg_loss)
            optimizer.minimize(static_avg_loss)

            static_param_init_value = {}
            static_param_name_list = []
            static_grad_name_list = []
            for param in ocr_attention.parameters():
                static_param_name_list.append(param.name)
                if param.trainable:
                    static_grad_name_list.append(param.name +
                                                 core.grad_var_suffix())

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            fetch_list = [static_avg_loss.name]
            fetch_list.extend(static_param_name_list)
            fetch_list.extend(static_grad_name_list)

            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    static_label_in = label_in_np
                    static_label_out = label_out_np
                    static_label_out = static_label_out.reshape((-1, 1))
                    out = exe.run(fluid.default_main_program(),
                                  feed={
                                      "pixel": image_np,
                                      "label_in": static_label_in,
                                      "label_out": static_label_out
                                  },
                                  fetch_list=fetch_list)

                    static_param_value = {}
                    static_grad_value = {}
                    static_out = out[0]
                    for i in range(1, len(static_param_name_list) + 1):
                        static_param_value[static_param_name_list[
                            i - 1]] = out[i]

                    grad_start_pos = len(static_param_name_list) + 1
                    for i in range(grad_start_pos,
                                   len(static_grad_name_list) +
                                   grad_start_pos):
                        static_grad_value[static_grad_name_list[
                            i - grad_start_pos]] = out[i]

        self.assertTrue(np.allclose(static_out, dy_out))

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.array_equal(value, dy_param_init_value[key]))

        for key, value in six.iteritems(static_param_value):
            self.assertTrue(
                np.allclose(value, dy_param_value[key], rtol=1e-05))
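# The fetch layout used above is positional: out[0] is the loss, the next
# len(static_param_name_list) entries are parameter values, and the rest are
# gradients (suffixed with '@GRAD' by core.grad_var_suffix()). A standalone
# illustration with dummy values standing in for exe.run() results:
param_names = ['w', 'b']
grad_names = ['w@GRAD', 'b@GRAD']
out = [0.5, 'w_val', 'b_val', 'w_grad', 'b_grad']  # dummy fetch results
n_params = len(param_names)
loss_val = out[0]
param_vals = dict(zip(param_names, out[1:1 + n_params]))
grad_vals = dict(zip(grad_names, out[1 + n_params:]))
print(loss_val, param_vals, grad_vals)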
def train(place):
    num_layers = 1
    batch_size = 4
    hidden_size = 10
    num_steps = 3
    init_scale = 0.1
    max_epoch = 1
    dropout = 0.0
    vocab_size = 1000
    batch_num = 200

    with fluid.dygraph.guard(place):
        paddle.seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)
        ptb_model = PtbModel(hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        sgd = SGDOptimizer(learning_rate=1e-3,
                           parameter_list=ptb_model.parameters())

        for epoch_id in range(max_epoch):
            total_loss = 0.0
            iters = 0.0
            total_sample = 0

            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            for step_id in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                y_data = y_data.reshape((-1, 1))

                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)

                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()

                total_loss += out_loss
                iters += num_steps
                total_sample += 1

                if step_id % PRINT_STEP == 0:
                    if step_id == 0:
                        logging.info("epoch %d | step %d, loss %0.3f" %
                                     (epoch_id, step_id,
                                      total_loss / total_sample))
                        avg_batch_time = time.time()
                    else:
                        speed = PRINT_STEP / (time.time() - avg_batch_time)
                        logging.info(
                            "epoch %d | step %d, loss %0.3f, speed %.3f steps/s"
                            % (epoch_id, step_id, total_loss / total_sample,
                               speed))
                        avg_batch_time = time.time()

        return out_loss, last_hidden.numpy(), last_cell.numpy()
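# The "speed" figure logged above is steps per second measured over the last
# PRINT_STEP batches (the timer is reset at each log point). A standalone
# illustration, with a sleep standing in for one training step:
import time

PRINT_STEP = 10
t0 = time.time()
for _ in range(PRINT_STEP):
    time.sleep(0.01)  # stand-in for one training step
print('%.1f steps/s' % (PRINT_STEP / (time.time() - t0)))  # ~100 steps/s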
    def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
        seed = 90
        hidden_size = 10
        vocab_size = 1000
        num_layers = 1
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 200

        with fluid.dygraph.guard():
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            paddle.manual_seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=ptb_model.parameters())
            dy_param_updated = dict()
            dy_param_init = dict()
            dy_loss = None
            last_hidden = None
            last_cell = None

            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                if i == 0:
                    for param in ptb_model.parameters():
                        dy_param_init[param.name] = param.numpy()
                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()
                if i == batch_num - 1:
                    for param in ptb_model.parameters():
                        dy_param_updated[param.name] = param.numpy()

            dy_loss_value = dy_loss.numpy()
            dy_last_cell_value = last_cell.numpy()
            dy_last_hidden_value = last_hidden.numpy()

        with new_program_scope():
            paddle.manual_seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
            sgd = SGDOptimizer(learning_rate=1e-3)
            x = fluid.layers.data(name="x",
                                  shape=[-1, num_steps, 1],
                                  dtype='int64')
            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
            init_hidden = fluid.layers.data(name="init_hidden",
                                            shape=[1],
                                            dtype='float32')
            init_cell = fluid.layers.data(name="init_cell",
                                          shape=[1],
                                          dtype='float32')

            static_loss, static_last_hidden, static_last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            sgd.minimize(static_loss)
            static_param_updated = dict()
            static_param_init = dict()
            static_param_name_list = list()
            for param in ptb_model.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(framework.default_startup_program(),
                          fetch_list=static_param_name_list)
            for i in range(len(static_param_name_list)):
                static_param_init[static_param_name_list[i]] = out[i]

            static_loss_value = None
            static_last_cell_value = None
            static_last_hidden_value = None
            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                fetch_list = [
                    static_loss, static_last_hidden, static_last_cell
                ]
                fetch_list.extend(static_param_name_list)

                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "x": x_data,
                                  "y": y_data,
                                  "init_hidden": init_hidden_data,
                                  "init_cell": init_cell_data
                              },
                              fetch_list=fetch_list)
                static_loss_value = out[0]
                static_last_hidden_value = out[1]
                static_last_cell_value = out[2]

                if i == batch_num - 1:
                    for k in range(3, len(out)):
                        static_param_updated[static_param_name_list[
                            k - 3]] = out[k]

        self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
        self.assertTrue(
            np.array_equal(static_last_cell_value, dy_last_cell_value))
        self.assertTrue(
            np.array_equal(static_last_hidden_value, dy_last_hidden_value))
        for key, value in six.iteritems(static_param_init):
            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):
            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
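# The parity checks above follow a single pattern: exact equality
# (np.array_equal) where dygraph and static mode should match bit-for-bit,
# and np.allclose with a relative tolerance where floating-point arithmetic
# may differ in the last bits. A minimal standalone helper showing the
# tolerant variant (names are illustrative, not from the test):
import numpy as np


def assert_params_close(static_params, dy_params, rtol=1e-05):
    # both dicts are assumed to map parameter names to numpy arrays
    for name, value in static_params.items():
        assert np.allclose(value, dy_params[name], rtol=rtol), name


assert_params_close({'w': np.ones(3)},
                    {'w': np.ones(3) * (1.0 + 1e-07)})  # passes under rtol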