def __init__(self, data_loader, epochs, save_epoch, model_path, numTransform, numRef):
    """Set up a GAN trainer: build G/D networks, SGD optimizers, losses,
    and VisualDL scalar loggers under ``model_path``.

    Args:
        data_loader: iterable yielding training batches.
        epochs: total number of training epochs.
        save_epoch: checkpoint interval (in epochs).
        model_path: directory for weights and VisualDL logs.
        numTransform, numRef: channel/config counts forwarded to
            Generator and Discriminator.
    """
    self.data_loader = data_loader
    self.epochs = epochs
    self.model_path = model_path
    self.save_epoch = save_epoch
    self.numTransform = numTransform
    self.numRef = numRef
    # Build the two networks and one SGD optimizer per network.
    self.G = Generator(numTransform, numRef)
    self.D = Discriminator(numTransform, numRef)
    self.G_optim = optim.SGD(self.G.parameters(), lr=1e-3, momentum=0.9)
    self.D_optim = optim.SGD(self.D.parameters(), lr=1e-3, momentum=0.9)
    # NOTE(review): self.gpu_mode is read here but never assigned in this
    # __init__ — presumably a class attribute or set before this runs;
    # confirm, otherwise this raises AttributeError.
    if self.gpu_mode:
        self.G.cuda()
        self.D.cuda()
        self.BCE_loss = nn.BCELoss().cuda()
        self.L1_Loss = nn.L1Loss().cuda()
    else:
        self.BCE_loss = nn.BCELoss()
        self.L1_Loss = nn.L1Loss()
    # Checkpoint filename template; %d is filled with the epoch number.
    self.save_path = model_path + '/model_%d.weights'
    # VisualDL logging: one scalar component per tracked loss.
    logdir = model_path + "/tmp"
    logger = LogWriter(logdir, sync_cycle=10000)
    with logger.mode("train"):
        self.log_D_real_loss = logger.scalar("D/real_loss")
        self.log_D_fake_loss = logger.scalar("D/fake_loss")
        self.log_D_total_loss = logger.scalar("D/total_loss")
        self.log_G_D_loss = logger.scalar("G/D_Loss")
        self.log_G_L1_loss = logger.scalar("G/L1_Loss")
        self.log_G_total_loss = logger.scalar("G/total_Loss")
    with logger.mode("test"):
        self.log_test_loss = logger.scalar("test/loss")
class MyLog():
    '''Adapter that lets PaddleHub use VisualDL logging inside AI Studio.

    Exposes the tb-style ``add_scalar`` interface PaddleHub expects and
    forwards records to a VisualDL ``LogWriter``.

    Usage:
        # create the LogWriter-compatible object
        log_writer = MyLog(mode="role2")
        seq_label_task._tb_writer=log_writer
    '''

    def __init__(self, mode="train", logDir="../log"):
        """``mode``: VisualDL component mode; ``logDir``: log directory."""
        self.mode = mode
        # Scalar components created lazily, keyed by tag.
        self.varDic = {}
        self.log_writer = LogWriter(logDir, sync_cycle=10)

    def add_scalar(self, tag, scalar_value, global_step):
        """Record ``scalar_value`` at ``global_step`` under ``tag``."""
        # Fix: use the idiomatic ``not in`` (was ``if not tag in ...``);
        # create the scalar component on first use of this tag.
        if tag not in self.varDic:
            with self.log_writer.mode(self.mode) as writer:
                self.varDic[tag] = writer.scalar(tag)
        self.varDic[tag].add_record(global_step, scalar_value)
def main():
    """Train a PaddleDetection model described by FLAGS.config.

    Builds train (and optional eval) programs, restores/loads weights,
    runs the iteration loop with periodic logging (console, tb-paddle,
    VisualDL), snapshotting and best-model tracking.
    """
    env = os.environ
    # Distributed mode is inferred from the PaddlePaddle launcher env vars.
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Different seed per trainer so data shuffling diverges across workers.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)
    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    # Only trainer 0 prints config (trainer_id is defined only when dist;
    # the short-circuit on ``not FLAGS.dist`` keeps this safe).
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Late-bind max_iters into a cosine-warmup scheduler if it was left unset.
    scheduler = cfg.LearningRate['schedulers'][0]
    if isinstance(scheduler, CosineDecayWithWarmup) and scheduler.max_iters is None:
        scheduler.max_iters = cfg.max_iters

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                # fp16: scale the loss before backward, unscale after minimize.
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches; extra keys depend on the metric protocol.
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    # fuse_bn only valid when the backbone uses affine_channel norm and we are
    # loading full pretrain weights (no excluded params).
    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe, train_prog, cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5)
        with vdl_writer.mode("train"):
            # One scalar component per train fetch key, plus one for mAP.
            scalars = [
                vdl_writer.scalar(loss_name) for loss_name in train_keys
            ]
            mAP_scalar = vdl_writer.scalar("mAP")
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        # Smoothed per-iter timing → ETA estimate.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # Last fetch is lr; the rest pair up with train_keys.
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, scalar in zip(train_keys, scalars):
                    loss_value = stats[loss_name]
                    scalar.add_record(vdl_loss_step, loss_value)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # Snapshot (and optionally evaluate) on trainer 0 only.
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_loader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(
                    results, cfg.metric, cfg.num_classes, resolution,
                    is_bbox_normalized, FLAGS.output_eval, map_type,
                    cfg['EvalReader']['dataset'])

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                if FLAGS.use_vdl:
                    mAP_scalar.add_record(vdl_mAP_step, box_ap_stats[0])
                    vdl_mAP_step += 1

                # Track and persist the best model seen so far.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
def train_loop(self,
               num_epochs,
               train_reader,
               train_batch_size,
               eval_reader=None,
               save_interval_epochs=1,
               log_interval_steps=10,
               save_dir='output',
               use_vdl=False):
    """Run the epoch/step training loop with periodic logging, evaluation,
    checkpointing and best-model tracking.

    Args:
        num_epochs: total epochs to train.
        train_reader: reader with ``transforms`` and ``num_samples``.
        train_batch_size: global batch size across devices.
        eval_reader: optional reader for per-interval evaluation.
        save_interval_epochs: evaluate/save every N epochs.
        log_interval_steps: console/VisualDL logging interval in steps.
        save_dir: output directory for checkpoints and VisualDL logs.
        use_vdl: enable VisualDL scalar logging.
    """
    if not osp.isdir(save_dir):
        # If a plain file occupies the target path, remove it first.
        if osp.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)
    if use_vdl:
        from visualdl import LogWriter
        vdl_logdir = osp.join(save_dir, 'vdl_log')
    # Append the "arrange" op to the train transforms.
    self.arrange_transforms(transforms=train_reader.transforms, mode='train')
    # Build the training data loader.
    self.build_train_data_loader(reader=train_reader,
                                 batch_size=train_batch_size)

    if eval_reader is not None:
        self.eval_transforms = eval_reader.transforms
        self.test_transforms = copy.deepcopy(eval_reader.transforms)

    # Fetch the live learning rate when it is a scheduled Variable.
    lr = self.optimizer._learning_rate
    if isinstance(lr, fluid.framework.Variable):
        self.train_outputs['lr'] = lr

    # Multi-card training: compile a data-parallel program once.
    if self.parallel_train_prog is None:
        build_strategy = fluid.compiler.BuildStrategy()
        build_strategy.fuse_all_optimizer_ops = False
        # NOTE(review): ``__init__.env_info`` presumably refers to the
        # package-level ``__init__`` module of this project — confirm.
        if __init__.env_info['place'] != 'cpu' and len(self.places) > 1:
            build_strategy.sync_batch_norm = self.sync_bn
        exec_strategy = fluid.ExecutionStrategy()
        # Drop local scopes every iteration to save memory.
        exec_strategy.num_iteration_per_drop_scope = 1
        self.parallel_train_prog = fluid.CompiledProgram(
            self.train_prog).with_data_parallel(
                loss_name=self.train_outputs['loss'].name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)

    total_num_steps = math.floor(train_reader.num_samples / train_batch_size)
    num_steps = 0
    # Rolling window (size 20) of per-step wall time for the ETA estimate.
    time_stat = list()

    if use_vdl:
        # VisualDL component
        log_writer = LogWriter(vdl_logdir, sync_cycle=20)
        train_step_component = OrderedDict()
        eval_component = OrderedDict()

    best_accuracy_key = ""
    best_accuracy = -1.0
    best_model_epoch = 1
    for i in range(num_epochs):
        records = list()
        step_start_time = time.time()
        for step, data in enumerate(self.train_data_loader()):
            outputs = self.exe.run(self.parallel_train_prog,
                                   feed=data,
                                   fetch_list=list(
                                       self.train_outputs.values()))
            # Average each fetched output across devices.
            outputs_avg = np.mean(np.array(outputs), axis=1)
            records.append(outputs_avg)

            # Estimate remaining training time.
            current_time = time.time()
            step_cost_time = current_time - step_start_time
            step_start_time = current_time
            if len(time_stat) < 20:
                time_stat.append(step_cost_time)
            else:
                time_stat[num_steps % 20] = step_cost_time
            eta = ((num_epochs - i) * total_num_steps - step - 1
                   ) * np.mean(time_stat)
            eta_h = math.floor(eta / 3600)
            eta_m = math.floor((eta - eta_h * 3600) / 60)
            eta_s = int(eta - eta_h * 3600 - eta_m * 60)
            eta_str = "{}:{}:{}".format(eta_h, eta_m, eta_s)

            # Emit loss info every log_interval_steps.
            num_steps += 1
            if num_steps % log_interval_steps == 0:
                step_metrics = OrderedDict(
                    zip(list(self.train_outputs.keys()), outputs_avg))
                if use_vdl:
                    for k, v in step_metrics.items():
                        # Create each scalar component lazily on first use.
                        if k not in train_step_component.keys():
                            with log_writer.mode('Each_Step_while_Training'
                                                 ) as step_logger:
                                train_step_component[
                                    k] = step_logger.scalar(
                                        'Training: {}'.format(k))
                        train_step_component[k].add_record(num_steps, v)
                logging.info(
                    "[TRAIN] Epoch={}/{}, Step={}/{}, {}, eta={}".format(
                        i + 1, num_epochs, step + 1, total_num_steps,
                        dict2str(step_metrics), eta_str))

        # Epoch summary: mean of every per-step record.
        train_metrics = OrderedDict(
            zip(list(self.train_outputs.keys()), np.mean(records, axis=0)))
        logging.info('[TRAIN] Epoch {} finished, {} .'.format(
            i + 1, dict2str(train_metrics)))

        # Evaluate and save a checkpoint every save_interval_epochs.
        if (i + 1) % save_interval_epochs == 0 or i == num_epochs - 1:
            current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1))
            if not osp.isdir(current_save_dir):
                os.makedirs(current_save_dir)
            if eval_reader is not None:
                # Detection currently only supports single-card evaluation;
                # the eval batch size reuses the training batch size.
                eval_batch_size = train_batch_size
                self.eval_metrics, self.eval_details = self.evaluate(
                    eval_reader=eval_reader,
                    batch_size=eval_batch_size,
                    verbose=True,
                    epoch_id=i + 1,
                    return_details=True)
                logging.info('[EVAL] Finished, Epoch={}, {} .'.format(
                    i + 1, dict2str(self.eval_metrics)))
                # Keep the best model, judged by the FIRST eval metric.
                best_accuracy_key = list(self.eval_metrics.keys())[0]
                current_accuracy = self.eval_metrics[best_accuracy_key]
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    best_model_epoch = i + 1
                    best_model_dir = osp.join(save_dir, "best_model")
                    self.save_model(save_dir=best_model_dir)
                if use_vdl:
                    for k, v in self.eval_metrics.items():
                        # Only scalar-valued metrics are loggable.
                        if isinstance(v, list):
                            continue
                        if isinstance(v, np.ndarray):
                            if v.size > 1:
                                continue
                        if k not in eval_component:
                            with log_writer.mode('Each_Epoch_on_Eval_Data'
                                                 ) as eval_logger:
                                eval_component[k] = eval_logger.scalar(
                                    'Evaluation: {}'.format(k))
                        eval_component[k].add_record(i + 1, v)
            self.save_model(save_dir=current_save_dir)
    logging.info(
        'Current evaluated best model in eval_reader is epoch_{}, {}={}'
        .format(best_model_epoch, best_accuracy_key, best_accuracy))
from visualdl import LogWriter # Download MNIST data mnist = mx.test_utils.get_mnist() batch_size = 100 # Provide a folder to store data for log, model, image, etc. VisualDL's visualization will be # based on this folder. logdir = "./tmp" # Initialize a logger instance. Parameter 'sync_cycle' means write a log every 10 operations on # memory. logger = LogWriter(logdir, sync_cycle=10) # mark the components with 'train' label. with logger.mode("train"): # scalar0 is used to record scalar metrics while MXNet is training. We will record accuracy. # In the visualization, we can see the accuracy is increasing as more training steps happen. scalar0 = logger.scalar("scalars/scalar0") image0 = logger.image("images/image0", 1) histogram0 = logger.histogram("histogram/histogram0", num_buckets=100) # Record training steps cnt_step = 0 # MXNet provides many callback interface. Here we define our own callback method and it is called # after every batch. # https://mxnet.incubator.apache.org/api/python/callback/callback.html def add_scalar(): def _callback(param):
def train():
    """Train LeNet-5 on MNIST (CPU) while logging loss/accuracy scalars,
    sampled input images and a parameter histogram to VisualDL."""
    img = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    avg_cost, acc = lenet_5(img, label)
    # get the mnist dataset
    train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=64)
    # define the loss
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_cost)
    # running on cpu
    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
    exe = fluid.Executor(place)

    # VisualDL components: scalars, an image sampler and a histogram.
    log_writter = LogWriter("./vdl_log", sync_cycle=10)
    with log_writter.mode("train") as logger:
        scalar_loss = logger.scalar(tag="loss")
        scalar_accuracy = logger.scalar(tag="accuracy")
        num_samples = 10
        image_input = logger.image(tag="input", num_samples=num_samples)
        histogram = logger.histogram(tag="histogram", num_buckets=50)

    # init all param
    exe.run(fluid.default_startup_program())
    step = 0
    sample_num = 0
    epochs = 5
    # Track the first parameter of the network for the histogram.
    param_name = fluid.default_startup_program().global_block().all_parameters(
    )[0].name

    # start to train
    for i in range(epochs):
        for batch in train_reader():
            # NOTE: ``input`` shadows the builtin; kept as-is.
            cost, accuracy, input, param = exe.run(
                feed=feeder.feed(batch),
                fetch_list=[avg_cost.name, acc.name, img.name, param_name])
            step += 1
            # record the loss and accuracy
            scalar_loss.add_record(step, cost)
            scalar_accuracy.add_record(step, accuracy)
            # Start a new image-sampling pass every num_samples samples.
            if sample_num % num_samples == 0:
                image_input.start_sampling()
            idx = image_input.is_sample_taken()
            if idx != -1:
                # the first image in the batch data
                image_data = input[0]
                # the image shape recorded in VDL is H * W * C
                image_data = image_data.reshape([28, 28, 1])
                # NOTE(review): pixel values are scaled by 100 before being
                # stored — presumably to map normalized inputs into a visible
                # range; confirm against the VDL image component's expectation.
                image_input.set_sample(idx, image_data.shape,
                                       100 * image_data.flatten())
                sample_num += 1
                if sample_num % num_samples == 0:
                    image_input.finish_sampling()
                    sample_num = 0
            # record the parameter trend
            histogram.add_record(step, param.flatten())
import paddle as paddle
import paddle.dataset.cifar as cifar
import paddle.fluid as fluid
import mobilenet_v2
from visualdl import LogWriter

# Create the VisualDL recorder; every training metric recorded through it
# is stored under the log/ directory.
log_writer = LogWriter(dir='log/', sync_cycle=10)

# Scalar/histogram components for the train and test phases.
with log_writer.mode('train') as writer:
    train_cost_writer = writer.scalar('cost')
    train_acc_writer = writer.scalar('accuracy')
    histogram = writer.histogram('histogram', num_buckets=50)
with log_writer.mode('test') as writer:
    test_cost_writer = writer.scalar('cost')
    test_acc_writer = writer.scalar('accuracy')

# Define the input layer, get the MobileNet V2 classifier,
# clone the test program and define the optimizer.
# Input layer: CIFAR-10 images (3x32x32) and integer class labels.
image = fluid.layers.data(name='image', shape=[3, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Classifier: MobileNet V2 with 10 output classes.
model = mobilenet_v2.net(image, 10)
# Loss and accuracy functions follow (continued below).
class SolverWrapper:
    """Wrap a Caffe SGD solver: run training, periodically evaluate on the
    validation set, and stream accuracy / learning-rate curves to VisualDL."""

    def __init__(self, solver_prototxt, log_dir, pretrained_model=None):
        """Build the solver, optionally load pretrained weights, and set up
        VisualDL scalar components for train/val accuracy and lr."""
        self.solver = caffe.SGDSolver(solver_prototxt)
        if pretrained_model is not None:
            print('Loading pretrained model weights from {:s}'.format(
                pretrained_model))
            self.solver.net.copy_from(pretrained_model)
        # Re-parse the prototxt so hyper-parameters (base_lr, stepsize,
        # gamma, ...) are available as a SolverParameter message.
        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)
        self.cur_epoch = 0
        self.test_interval = 500  # used instead of solver_param.test_interval
        self.logw = LogWriter(log_dir, sync_cycle=100)
        with self.logw.mode('train') as logger:
            self.sc_train_acc = logger.scalar("Accuracy")
            self.sc_train_lr = logger.scalar("learning_rate")
        with self.logw.mode('val') as logger:
            self.sc_val_acc = logger.scalar("Accuracy")
            self.sc_val_lr = logger.scalar("learning_rate")

    def train_model(self):
        """Run the whole training flow, interleaved with validation."""
        cur_iter = 0
        test_batch_size, num_classes = self.solver.test_nets[0].blobs[
            'prob'].shape
        num_test_images_tot = test_batch_size * self.solver_param.test_iter[0]
        lr_policy = self.solver_param.lr_policy
        memo_t = 25  # log train metrics every 25 iters (one "epoch" here)
        while cur_iter < self.solver_param.max_iter:
            # Step one iteration at a time so we can log inside the interval.
            for i in range(self.test_interval):
                self.solver.step(1)
                cur_iter += 1
                if (cur_iter == 1 or cur_iter % memo_t == 0):
                    acc = float(self.solver.net.blobs['accuracy'].data)
                    step = cur_iter
                    lr = self.get_lr(lr_policy, cur_iter)
                    self.sc_train_acc.add_record(step, acc)
                    self.sc_train_lr.add_record(step, lr)
            self.eval_on_val(num_classes, num_test_images_tot, test_batch_size)

    def eval_on_val(self, num_classes, num_test_images_tot, test_batch_size):
        """Run inference and evaluation over the whole validation set."""
        self.solver.test_nets[0].share_with(self.solver.net)
        self.cur_epoch += 1
        scores = np.zeros((num_classes, num_test_images_tot), dtype=np.float32)
        gt_labels = np.zeros((1, num_test_images_tot),
                             dtype=np.float32).squeeze()
        for t in range(self.solver_param.test_iter[0]):
            output = self.solver.test_nets[0].forward()
            probs = output['prob']
            labels = self.solver.test_nets[0].blobs['label'].data
            gt_labels[t * test_batch_size:(t + 1) * test_batch_size] = \
                labels.T.astype(np.float32)
            scores[:, t * test_batch_size:(t + 1) * test_batch_size] = probs.T
        # TODO: handle a final batch with fewer samples than the batch size
        ap, acc = perfeval.cls_eval(scores, gt_labels)
        print('====================================================================\n')
        print('\tDo validation after the {:d}-th training epoch\n'.format(
            self.cur_epoch))
        print('>>>>', end='\t')  # marker that makes the log easy to parse
        for i in range(num_classes):
            print('AP[{:d}]={:.4f}'.format(i, ap[i]), end=', ')
        mAP = np.average(ap)
        print('mAP={:.4f}, Accuracy={:.4f}'.format(mAP, acc))
        print('\n====================================================================\n')
        step = self.solver.iter
        lr_policy = self.solver_param.lr_policy
        lr = self.get_lr(lr_policy, step)
        self.sc_val_acc.add_record(step, acc)
        self.sc_val_lr.add_record(step, lr)

    def get_lr(self, lr_policy, cur_iter):
        """Return the learning rate at ``cur_iter`` for ``lr_policy``.

        Supports Caffe's "fixed", "step" and "exp" policies plus the cyclical
        "triangular" policy.  Raises ValueError for an unknown policy (the
        original fell through and crashed with UnboundLocalError).
        """
        if lr_policy == "fixed":
            rate = self.solver_param.base_lr
        elif lr_policy == "step":
            # Caffe semantics: base_lr * gamma ^ floor(iter / stepsize).
            # Fix: ``//`` keeps the exponent integral under Python 3; the
            # original ``/`` produced a continuous decay instead of steps.
            cur_step = cur_iter // self.solver_param.stepsize
            rate = self.solver_param.base_lr * math.pow(
                self.solver_param.gamma, cur_step)
        elif lr_policy == "exp":
            rate = self.solver_param.base_lr * math.pow(
                self.solver_param.gamma, cur_iter)
        elif lr_policy == "triangular":
            # Cyclical LR: the cycle index must be an integer.  Fix: with
            # Python 3 float division the original expression algebraically
            # collapsed to base_lr for every iteration.
            cycle = cur_iter // (2 * self.solver_param.stepsize)
            x = float(cur_iter - (2 * cycle + 1) * self.solver_param.stepsize)
            x = x / self.solver_param.stepsize
            rate = self.solver_param.base_lr + (
                self.solver_param.max_lr -
                self.solver_param.base_lr) * max(0, 1 - abs(x))
        else:
            raise ValueError("unsupported lr_policy: {}".format(lr_policy))
        return rate
class StorageTest(unittest.TestCase):
    """Unit tests for VisualDL's storage layer: scalar, image and mode
    (component namespace) round-trips through LogWriter/LogReader."""

    def setUp(self):
        # Every test writes into the same temp dir under the "train" mode.
        self.dir = "./tmp/storage_test"
        self.writer = LogWriter(self.dir, sync_cycle=1).as_mode("train")

    def test_scalar(self):
        """Write 10 scalar records, read them back and compare."""
        print('test write')
        scalar = self.writer.scalar("model/scalar/min")
        # scalar.set_caption("model/scalar/min")
        for i in range(10):
            scalar.add_record(i, float(i))
        print('test read')
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/min")
            self.assertEqual(scalar.caption(), "train")
            records = scalar.records()
            ids = scalar.ids()
            # NOTE(review): records are compared against 9 values while ids
            # are compared against all 10 — presumably the last record may
            # not be flushed yet; confirm against the storage semantics.
            self.assertTrue(
                np.equal(records, [float(i) for i in range(10 - 1)]).all())
            self.assertTrue(
                np.equal(ids, [float(i) for i in range(10)]).all())
            print('records', records)
            print('ids', ids)

    def test_image(self):
        """Write 10 sampling passes of random images and verify metadata."""
        tag = "layer1/layer2/image0"
        image_writer = self.writer.image(tag, 10, 1)
        num_passes = 10
        num_samples = 100
        shape = [10, 10, 3]
        for pass_ in range(num_passes):
            image_writer.start_sampling()
            for ins in range(num_samples):
                data = np.random.random(shape) * 256
                data = np.ndarray.flatten(data)
                image_writer.add_sample(shape, list(data))
            image_writer.finish_sampling()
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            image_reader = reader.image(tag)
            self.assertEqual(image_reader.caption(), tag)
            self.assertEqual(image_reader.num_records(), num_passes)
            image_record = image_reader.record(0, 1)
            self.assertTrue(np.equal(image_record.shape(), shape).all())
            data = image_record.data()
            self.assertEqual(len(data), np.prod(shape))
            image_tags = reader.tags("image")
            self.assertTrue(image_tags)
            self.assertEqual(len(image_tags), 1)

    def test_check_image(self):
        '''
        check whether the storage will keep image data consistent
        '''
        print('check image')
        tag = "layer1/check/image1"
        image_writer = self.writer.image(tag, 10)
        # Round-trip a real JPEG through the image storage.
        image = Image.open("./dog.jpg")
        shape = [image.size[1], image.size[0], 3]
        origin_data = np.array(image.getdata()).flatten()
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            image_writer.start_sampling()
            image_writer.add_sample(shape, list(origin_data))
            image_writer.finish_sampling()
            # read and check whether the original image will be displayed
            image_reader = reader.image(tag)
            image_record = image_reader.record(0, 0)
            data = image_record.data()
            shape = image_record.shape()
            PIL_image_shape = (shape[0] * shape[1], shape[2])
            data = np.array(data, dtype='uint8').reshape(PIL_image_shape)
            print('origin', origin_data.flatten())
            print('data', data.flatten())
            image = Image.fromarray(data.reshape(shape))
            # manually check the image and found that nothing wrong with the image storage.
            # image.show()

    def test_with_syntax(self):
        """The with-statement form of mode() should behave like as_mode()."""
        with self.writer.mode("train") as writer:
            scalar = writer.scalar("model/scalar/average")
            for i in range(10):
                scalar.add_record(i, float(i))
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/average")
            self.assertEqual(scalar.caption(), "train")

    def test_modes(self):
        """Components created under different modes stay independently
        writable after the mode context exits."""
        store = LogWriter(self.dir, sync_cycle=1)
        scalars = []
        for i in range(10):
            with store.mode("mode-%d" % i) as writer:
                scalar = writer.scalar("add/scalar0")
                scalars.append(scalar)
        for scalar in scalars[:-1]:
            for i in range(10):
                scalar.add_record(i, float(i))
def train():
    """Train a DCGAN on MNIST with Paddle dygraph, alternating generator and
    discriminator updates, and log per-epoch mean losses to VisualDL."""
    log_writter = LogWriter('./vdl_log', sync_cycle=10)
    with log_writter.mode("train") as logger:
        log_g_loss = logger.scalar(tag="g_loss")
        log_d_loss = logger.scalar(tag="d_loss")
    place = fluid.CUDAPlace(1)
    with fluid.dygraph.guard(place):
        # Fixed noise vector reused every 10 epochs to visualize progress.
        random_vector_data = np.random.standard_normal(
            (num_examples_to_generate, noise_dim)).astype('float32')
        random_vector_for_generation = to_variable(random_vector_data)
        mnist_dcgan = dcgan('mnist_dcgan')
        discriminator_optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
        generator_optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
        train_data = paddle.dataset.mnist.train()
        for epoch in range(num_epochs):
            train_reader = paddle.batch(
                paddle.reader.shuffle(train_data, buf_size=buffer_size),
                batch_size=batch_size,
                drop_last=True)
            print("Epoch id: ", epoch)
            total_loss_gen = []
            total_loss_disc = []
            for batch_id, data in enumerate(train_reader()):
                # Fresh noise per batch for the generator update.
                noise_data = np.random.standard_normal(
                    (batch_size, noise_dim)).astype('float32')
                noise = to_variable(noise_data)
                img_data = np.array(
                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
                img = to_variable(img_data)
                # Generator step: only parameters under the generator name
                # scope are updated (filtered by name prefix below).
                gen_loss, generated_images = mnist_dcgan(
                    noise, img, None, True)
                gen_loss = fluid.layers.reduce_mean(gen_loss)
                gen_loss.backward()
                vars_G = []
                for parm in mnist_dcgan.parameters():
                    if parm.name[:31] == 'mnist_dcgan/dcgan_0/generator_0':
                        vars_G.append(parm)
                generator_optimizer.minimize(gen_loss, parameter_list=vars_G)
                mnist_dcgan.clear_gradients()
                # Discriminator step on the same noise/images; only
                # discriminator-scoped parameters are updated.
                disc_loss = mnist_dcgan(noise, img, generated_images, False)
                disc_loss = fluid.layers.reduce_mean(disc_loss)
                disc_loss.backward()
                vars_D = []
                for parm in mnist_dcgan.parameters():
                    if parm.name[:35] == 'mnist_dcgan/dcgan_0/discriminator_0':
                        vars_D.append(parm)
                discriminator_optimizer.minimize(disc_loss,
                                                 parameter_list=vars_D)
                mnist_dcgan.clear_gradients()
                total_loss_gen.append(gen_loss.numpy()[0])
                total_loss_disc.append(disc_loss.numpy()[0])
            # Dump sample images every 10 epochs using the fixed noise.
            if epoch % 10 == 0:
                generate_and_save_images(epoch, mnist_dcgan,
                                         random_vector_for_generation)
            # Per-epoch mean losses, printed and logged to VisualDL.
            print("Generator loss: ",
                  np.mean(np.array(total_loss_gen).astype('float32')))
            print("Discriminator loss: ",
                  np.mean(np.array(total_loss_disc).astype('float32')))
            log_g_loss.add_record(
                epoch, np.mean(np.array(total_loss_gen).astype('float32')))
            log_d_loss.add_record(
                epoch, np.mean(np.array(total_loss_disc).astype('float32')))
import random
from visualdl import LogWriter
import ca

# Minimal VisualDL demo: record 1000 random scalar values under the
# "train" mode so they show up as one curve in the dashboard.
logdir = './temp'
logger = LogWriter(logdir, sync_cycle=10)
with logger.mode('train'):
    scalar0 = logger.scalar('scalar0')
for step in range(0, 1000):
    scalar0.add_record(step, random.random())
def train():
    """Train LeNet-5 on MNIST, logging smoothed train metrics and per-epoch
    test metrics to VisualDL, then save the trained/inference models.

    Fix: the saved inference model referenced the undefined name
    ``predition`` — the fetch target is ``prediction`` (third return value
    of ``lenet_5``); the original raised NameError at save time.
    """
    img = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    avg_cost, acc, prediction = lenet_5(img, label)
    # get the mnist dataset
    train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=64)
    test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=64)
    # Clone the program BEFORE adding optimizer ops so the test program
    # contains only the forward pass.
    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_cost)
    # running on cpu
    place = fluid.CPUPlace()
    #place = fluid.CUDAPlace(0)
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
    exe = fluid.Executor(place)
    # init all param
    exe.run(fluid.default_startup_program())
    step = 0
    sample_num = 0
    epochs = 6
    # VisualDL scalars: one loss/acc pair per mode.
    log_writter = LogWriter("./vdl_log", sync_cycle=100000)
    with log_writter.mode("train") as logger:
        trn_scalar_loss = logger.scalar("loss")
        trn_scalar_acc = logger.scalar("acc")
    with log_writter.mode('test') as logger:
        tst_scalar_loss = logger.scalar("loss")
        tst_scalar_acc = logger.scalar("acc")
    # start to train
    # ``off`` accumulates steps across epochs so records share one x-axis.
    off = 0
    for i in range(epochs):
        train_acc, train_cost = [], []
        for step, batch in enumerate(train_reader()):
            res_cost, res_acc = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[avg_cost.name, acc.name])
            train_cost.append(res_cost)
            train_acc.append(res_acc)
            # Log the mean of the last window every 50 steps (plus the very
            # first step of training).
            if (step % 50 == 0 and step != 0) or (step == 0 and i == 0):
                # record the loss and accuracy
                st = step + off
                mloss = np.mean(np.array(train_cost))
                macc = np.mean(np.array(train_acc))
                trn_scalar_loss.add_record(st, mloss)
                trn_scalar_acc.add_record(st, macc)
                train_acc, train_cost = [], []
                print("Epoc:{}, Iter:{}, loss:{}, acc{}".format(
                    i, step, mloss, macc))
        # Full pass over the test set at the end of each epoch.
        test_acc, test_cost = [], []
        for data in test_reader():
            res_cost, res_acc = exe.run(
                test_program,
                feed=feeder.feed(data),
                fetch_list=[avg_cost.name, acc.name])
            test_cost.append(res_cost)
            test_acc.append(res_acc)
        mloss = np.mean(np.array(test_cost))
        macc = np.mean(np.array(test_acc))
        # Test metrics reuse the last train step ``st`` as their x value.
        tst_scalar_loss.add_record(st, mloss)
        tst_scalar_acc.add_record(st, macc)
        test_acc, test_cost = [], []
        print("Test Epoc:{}, loss:{}, acc{}".format(i, mloss, macc))
        off = off + step
    fluid.io.save_persistables(exe, "mnist_model")
    # Bug fix: was ``[predition]`` (undefined name) — use ``prediction``.
    fluid.io.save_inference_model("mnist_save_model", ['img'], [prediction],
                                  exe,
                                  model_filename='model',
                                  params_filename='params')
# Print the run configuration banner.
print("setting".center(50, "="))
print("lr = {}, rc = {}, epochs = {}, batch_size = {}".format(args.lr, args.rc, args.epochs, args.batch_size))
print("Experiment ID: {}".format(args.exp_id).center(50, "="))
print("training in GPU: {}".format(args.gpu_id).center(50, "="))
d_name = args.d_name

# get data
# NOTE(review): eval() on a CLI argument executes arbitrary code — only
# acceptable for trusted local runs; prefer a bool/int parser.
g, label, train_idx, valid_idx, test_idx, evaluator = get_graph_data(
    d_name=d_name, mini_data=eval(args.mini_data))

# create log writer
log_writer = LogWriter(args.log_path, sync_cycle=10)
with log_writer.mode("train") as logger:
    log_train_loss_epoch = logger.scalar("loss")
    log_train_rocauc_epoch = logger.scalar("rocauc")
with log_writer.mode("valid") as logger:
    log_valid_loss_epoch = logger.scalar("loss")
    log_valid_rocauc_epoch = logger.scalar("rocauc")
log_text = log_writer.text("text")
log_time = log_writer.scalar("time")
log_test_loss = log_writer.scalar("test_loss")
log_test_rocauc = log_writer.scalar("test_rocauc")

# training
samples = [25, 10]  # 2-hop sample size
batch_size = args.batch_size
sample_workers = 1
def main():
    """Build, train and periodically evaluate a PaddleDetection model.

    Reads the run configuration from FLAGS.config (+ FLAGS.opt overrides),
    constructs the static train/eval programs, restores or fine-tunes from
    checkpoints, then runs the iteration loop with logging, optional
    VisualDL tracking, snapshotting and best-model selection.
    """
    # configuration
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")
    check_gpu(cfg.use_gpu)
    check_version()

    # executor
    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # model: build the training program inside its own name scope
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg.TrainReader['inputs_def']
            feed_vars, train_loader = model.build_inputs(**inputs_def)
            train_fetches = model.train(feed_vars)
            loss = train_fetches['loss']
            lr = lr_builder()
            optimizer = optim_builder(lr)
            optimizer.minimize(loss)

    train_keys, train_values, _ = parse_fetches(train_fetches)
    # lr is fetched last so outs[-1] below is the current learning rate
    train_values.append(lr)

    if FLAGS.eval:
        # separate eval program sharing the same startup program / parameters
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg.EvalReader['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)
        extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        eval_keys, eval_values, _ = parse_fetches(fetches, eval_prog,
                                                  extra_keys)
        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

    ##### run ####
    exe.run(startup_prog)

    ## restore / fine-tune
    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step() + 1
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights,
            ignore_params=ignore_params)

    ## data iterator
    train_reader = create_reader(cfg.TrainReader,
                                 cfg.max_iters - start_iter, cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    ## training loop
    train_loader.start()

    # progress tracking
    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    start_time = time.time()
    end_time = time.time()
    time_stat = deque(maxlen=cfg.log_smooth_window)
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    best_box_ap_list = [0.0, 0]  # [best mAP, iteration it was reached]

    if FLAGS.use_vdl:
        log_writter = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5)
        with log_writter.mode("train") as vdl_logger:
            train_scalar_loss = vdl_logger.scalar(tag="loss")
        with log_writter.mode("val") as vdl_logger:
            val_scalar_map = vdl_logger.scalar(tag="map")

    for it in range(start_iter, cfg.max_iters):
        # run the program
        outs = exe.run(train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # logging & visualization window; smoothed per-iter wall time + ETA
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0:
            # log
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)
            # vdl — x-axis is the log-window index, not the raw iteration
            if FLAGS.use_vdl:
                train_scalar_loss.add_record(it//cfg.log_iter, stats['loss'])

        # snapshot & evaluation window
        if (it > 0 and it % cfg.snapshot_iter == 0
                or it == cfg.max_iters - 1):
            # save model
            save_name = str(it) if it != cfg.max_iters - 1 else "final"
            checkpoint.save(exe, train_prog,
                            os.path.join(save_dir, save_name))
            ## model evaluation
            if FLAGS.eval:
                # snapshot index used as the VisualDL x-coordinate
                current_step = it//cfg.snapshot_iter if it % cfg.snapshot_iter == 0 \
                    else it//cfg.snapshot_iter+1
                ## validation-set evaluation
                results = eval_run(exe, eval_prog, eval_loader, eval_keys,
                                   eval_values)
                box_ap_stats = eval_results(results, cfg.num_classes)
                logger.info("eval box op: {}, in iter: {}".format(
                    box_ap_stats, it))
                if FLAGS.use_vdl:
                    val_scalar_map.add_record(current_step, box_ap_stats)
                ## keep the best model seen so far
                if box_ap_stats > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                # log
                logger.info("Best eval box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))
    train_loader.reset()
# coding=utf-8 from visualdl import LogWriter # 创建 LogWriter 对象 log_writter = LogWriter("./log", sync_cycle=10) # 创建 text 组件,模式为 train, 标签为 test with log_writter.mode("train") as logger: vdl_text_comp = logger.text(tag="test") # 使用 add_record() 函数添加数据 for i in range(1, 6): vdl_text_comp.add_record(i, "这是第 %d 个 Step 的数据。" % i) vdl_text_comp.add_record(i, "This is data %d ." % i)
class Trainer(object):
    """CIFAR image-classification trainer (PaddlePaddle static graph).

    Builds one of several CNNs (ResNet20/32/110 or VGG), trains it on
    CIFAR-10/100 and optionally logs metrics to VisualDL.
    """

    @classmethod
    def add_cmdline_argument(cls, parser):
        """ Add the cmdline arguments of trainer. """
        group = parser.add_argument_group("Trainer")
        group.add_argument(
            '--infer_network',
            type=str,
            default='ResNet32',
            help=
            # BUGFIX: help used to advertise ResNet10, which
            # inference_network() does not support — it supports ResNet20.
            "Set inference network. Default is ResNet32. [ResNet20, ResNet32, ResNet110, VGG]"
        )
        group.add_argument(
            '--dataset',
            type=str,
            default='cifar-10',
            help='The dataset name. Default is cifar-10. [cifar-10, cifar-100]'
        )
        group.add_argument('--num_epochs',
                           type=int,
                           default=1,
                           help='Number of epoch. Default is 1.')
        group.add_argument('--batch_size',
                           type=int,
                           default=128,
                           help="Batch size. Default is 128.")
        group.add_argument(
            '-c',
            '--enable_ce',
            action='store_true',
            help='If set, run the task with continuous evaluation logs.')
        group.add_argument(
            '--logger',
            type=str,
            default='',
            help='Path to log data generated in deep learning tasks.')
        group.add_argument(
            '--cpu_num',
            type=int,
            default=1,
            help='Specify the number of the logic core. Default is 1.')
        # NOTE(review): type=list on a string argument splits it into
        # characters; the default is an int. Kept as-is to preserve the
        # public CLI, but this flag likely never worked as intended.
        group.add_argument(
            '--cuda_devices',
            type=list,
            default=1,
            help='Specify the number of the CUDA devices. Default is 1.')
        group.add_argument(
            '-m',
            '--multi_card',
            action='store_true',
            help=
            'In the mode of multi graphics card training, all graphics card will be occupied.'
            + 'If --use_cuda is false, the model will be run in CPU. In this situation, the multi-threads'
            + 'are used to run the model, and the number of threads is equal to the number of logic cores.'
            + 'You can configure --cpu_num to change the number of threads that are being used.'
        )
        return group

    def __init__(self, hparams):
        """Copy hyper-parameters and, if a log path was given, set up the
        VisualDL scalar components for train/test metrics."""
        self.infer_network = hparams.infer_network
        self.dataset = hparams.dataset
        self.num_epochs = hparams.num_epochs
        self.batch_size = hparams.batch_size
        self.enable_ce = hparams.enable_ce
        self.logger = hparams.logger
        self.cpu_num = hparams.cpu_num
        self.cuda_devices = hparams.cuda_devices
        self.multi_card = hparams.multi_card
        self.num_class = 10  # default is cifar-10

        if self.logger:
            from visualdl import LogWriter
            self.log_writer = LogWriter(self.logger, sync_cycle=20)
            # Create two ScalarWriter instances, whose mode is set to be "train"
            with self.log_writer.mode("train") as logger:
                self.train_cost = logger.scalar("cost")
                self.train_acc = logger.scalar("acc")
            # Create a ScalarWriter instance, whose mode is set to be "test"
            with self.log_writer.mode("test") as logger:
                self.test_loss = logger.scalar("loss")
                self.test_acc = logger.scalar("acc")

        # BUGFIX: was `self.dataset is "cifar-100"` — identity comparison
        # with a string literal is implementation-dependent and generally
        # False, so num_class was never switched to 100.
        if self.dataset == "cifar-100":
            self.num_class = 100

    def inference_network(self):
        """Build the forward network selected by --infer_network and
        return its softmax prediction variable."""
        # The image is 32 * 32 with RGB representation.
        data_shape = [None, 3, 32, 32]
        images = fluid.data(name='pixel', shape=data_shape, dtype='float32')

        if self.infer_network == 'ResNet20':
            predict = resnet_cifar10(images, 20, self.num_class)
        elif self.infer_network == 'ResNet32':
            predict = resnet_cifar10(images, 32, self.num_class)
        elif self.infer_network == 'ResNet110':
            predict = resnet_cifar10(images, 110, self.num_class)
        elif self.infer_network == 'VGG':
            predict = vgg_bn_drop(images, self.num_class)
        else:
            logging.error(
                'The following inference network is not supported! Choose on of: resnet, vgg.'
            )
            sys.exit(1)
        return predict

    def train_network(self, predict):
        """Attach cross-entropy loss and accuracy to `predict`;
        return [avg_cost, accuracy]."""
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(cost)
        accuracy = fluid.layers.accuracy(input=predict, label=label)
        return [avg_cost, accuracy]

    def optimizer_program(self):
        """Return the optimizer used for training."""
        return fluid.optimizer.Adam(learning_rate=0.001)

    def train(self, use_cuda, params_dirname):
        """Run the full training loop; evaluate on the test set after each
        pass and save an inference model to `params_dirname` (if given)."""
        train_start = datetime.utcnow()
        if use_cuda:
            # NOTE: for multi process mode: one process per GPU device.
            # For example: CUDA_VISIBLE_DEVICES="0,1,2,3".
            # os.environ['CUDA_VISIBLE_DEVICES'] = self.cuda_devices
            # print("CUDA_VISIBLE_DEVICES:" + str(os.getenv("CUDA_VISIBLE_DEVICES")))
            pass
        else:
            # NOTE: If you use CPU to run the program, you need to specify
            # the CPU_NUM, otherwise fluid will use all logic cores as
            # CPU_NUM; in that case the input batch size should be greater
            # than CPU_NUM or the process fails with an exception.
            # (The original redundant `if not use_cuda:` guard inside this
            # else-branch was removed — use_cuda is already False here.)
            os.environ['CPU_NUM'] = str(self.cpu_num)
            print("CPU_NUM:" + str(os.getenv("CPU_NUM")))

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        # NOTE(review): cifar.train10/test10 are always used, even when
        # --dataset=cifar-100 — TODO confirm whether cifar-100 support was
        # ever wired up.
        if self.enable_ce:
            train_reader = paddle.batch(paddle.dataset.cifar.train10(),
                                        batch_size=self.batch_size)
            test_reader = paddle.batch(paddle.dataset.cifar.test10(),
                                       batch_size=self.batch_size)
        else:
            test_reader = paddle.batch(paddle.dataset.cifar.test10(),
                                       batch_size=self.batch_size)
            train_reader = paddle.batch(paddle.reader.shuffle(
                paddle.dataset.cifar.train10(), buf_size=128 * 100),
                                        batch_size=self.batch_size)

        feed_order = ['pixel', 'label']
        main_program = fluid.default_main_program()
        start_program = fluid.default_startup_program()

        if self.enable_ce:
            # fixed seeds for continuous-evaluation reproducibility
            main_program.random_seed = 90
            start_program.random_seed = 90

        predict = self.inference_network()
        avg_cost, acc = self.train_network(predict)

        # Test program: clone before minimize() so it carries no optimizer ops
        test_program = main_program.clone(for_test=True)
        optimizer = self.optimizer_program()
        optimizer.minimize(avg_cost)

        exe = fluid.Executor(place)
        EPOCH_NUM = self.num_epochs

        # For training test cost
        def train_test(program, reader):
            """Average [cost, acc] of `program` over all batches of `reader`."""
            count = 0
            feed_var_list = [
                program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder_test = fluid.DataFeeder(feed_list=feed_var_list,
                                           place=place)
            test_exe = fluid.Executor(place)
            accumulated = len([avg_cost, acc]) * [0]
            for tid, test_data in enumerate(reader()):
                # NOTE: the original built an unused CompiledProgram per
                # batch in multi-card mode and then ran `program` anyway;
                # both branches were identical, so they are collapsed here.
                avg_cost_np = test_exe.run(
                    program=program,
                    feed=feeder_test.feed(test_data),
                    fetch_list=[avg_cost, acc])
                accumulated = [
                    x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
                ]
                count += 1
            return [x / count for x in accumulated]

        # main train loop.
        def train_loop():
            feed_var_list_loop = [
                main_program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder = fluid.DataFeeder(feed_list=feed_var_list_loop,
                                      place=place)
            exe.run(start_program)

            # Compile once, outside the batch loop (the original rebuilt a
            # CompiledProgram for every batch in multi-card mode).
            compiled_prog = (fluid.compiler.CompiledProgram(main_program)
                             if self.multi_card else None)

            print('Train started at {}'.format(
                train_start.strftime('%Y-%m-%d %H:%M:%S.%f')))
            step = 0
            for pass_id in range(EPOCH_NUM):
                for step_id, data_train in enumerate(train_reader()):
                    if self.multi_card:
                        avg_loss_value = exe.run(compiled_prog,
                                                 feed=feeder.feed(data_train),
                                                 fetch_list=[avg_cost, acc])
                    else:
                        avg_loss_value = exe.run(main_program,
                                                 feed=feeder.feed(data_train),
                                                 fetch_list=[avg_cost, acc])
                    if step_id % 100 == 0:
                        # BUGFIX: `is not ''` is an identity check — use !=
                        if self.logger != '':
                            # NOTE(review): records are keyed by pass_id, so
                            # several records per pass share one x-value.
                            self.train_cost.add_record(pass_id,
                                                       avg_loss_value[0])
                            self.train_acc.add_record(pass_id,
                                                      avg_loss_value[1])
                        # BUGFIX: arguments were swapped — "Pass" printed
                        # step_id and "Batch" printed pass_id.
                        print("\nPass %d, Batch %d, Cost %f, Acc %f" %
                              (pass_id, step_id, avg_loss_value[0],
                               avg_loss_value[1]))
                    else:
                        sys.stdout.write('.')
                        sys.stdout.flush()
                    step += 1

                avg_cost_test, accuracy_test = train_test(test_program,
                                                          reader=test_reader)
                train_end = datetime.utcnow()
                elapsed_time = train_end - train_start
                # BUGFIX: `is not ''` → `!= ''` (see above)
                if self.logger != '':
                    self.test_loss.add_record(pass_id, avg_cost_test)
                    self.test_acc.add_record(pass_id, accuracy_test)
                print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(
                    pass_id, avg_cost_test, accuracy_test))

                if params_dirname is not None:
                    fluid.io.save_inference_model(params_dirname, ["pixel"],
                                                  [predict], exe)
                if pass_id == EPOCH_NUM - 1:
                    print('Train ended at {}'.format(
                        train_end.strftime('%Y-%m-%d %H:%M:%S.%f')))
                    print('Elapsed time for training is {}'.format(
                        elapsed_time))
                if self.enable_ce and pass_id == EPOCH_NUM - 1:
                    print("kpis\ttrain_cost\t%f" % avg_loss_value[0])
                    print("kpis\ttrain_acc\t%f" % avg_loss_value[1])
                    print("kpis\ttest_cost\t%f" % avg_cost_test)
                    print("kpis\ttest_acc\t%f" % accuracy_test)

        train_loop()
class Trainer(object):
    """Dialogue-model trainer (PLATO-style): epoch training with metric
    tracking, periodic validation, generation-based inference, optional
    data-parallel training and optional VisualDL summaries."""

    @classmethod
    def add_cmdline_argument(cls, parser):
        """ Add the cmdline arguments of trainer. """
        group = parser.add_argument_group("Trainer")
        group.add_argument(
            "--use_data_distributed",
            type=str2bool,
            default=False,
            help="Whether to use data distributed for parallel training.")
        group.add_argument(
            "--valid_metric_name",
            type=str,
            default="-loss",
            help=
            "The validation metric determining which checkpoint is the best.")
        group.add_argument("--num_epochs",
                           type=int,
                           default=10,
                           help="Total number of training epochs to perform.")
        group.add_argument(
            "--save_dir",
            type=str,
            required=True,
            help="The output directory where the model will be saved.")
        group.add_argument(
            "--batch_size",
            type=int,
            default=8,
            help="Total batch size for training/evaluation/inference.")
        group.add_argument(
            "--log_steps",
            type=int,
            default=100,
            help="The number of training steps to output current metrics "
            "on past training dataset.")
        group.add_argument(
            "--valid_steps",
            type=int,
            default=2000,
            help="The number of training steps to perform a evaluation "
            "on validation datasets.")
        group.add_argument(
            "--save_checkpoint",
            type=str2bool,
            default=True,
            help="Whether to save one checkpoints for each training epoch.")
        group.add_argument(
            "--save_summary",
            type=str2bool,
            default=False,
            help="Whether to save metrics summary for visualDL module.")
        DataLoader.add_cmdline_argument(group)
        return group

    def __init__(self, model, to_tensor, hparams, logger=None):
        """Wrap `model` (optionally in DataParallel), copy hyper-parameters
        and set up logging, metric trackers and summary writers."""
        # Use data distributed
        if hparams.use_data_distributed:
            strategy = parallel.prepare_context()
            if strategy is not None:
                parallel_model = parallel.DataParallel(model, strategy)
                model.before_backward_fn = parallel_model.scale_loss
                model.after_backward_fn = parallel_model.apply_collective_grads
                model = parallel_model

        self.model = model
        self.to_tensor = to_tensor

        # A leading "-" means the metric should decrease (e.g. "-loss").
        self.is_decreased_valid_metric = hparams.valid_metric_name[0] == "-"
        self.valid_metric_name = hparams.valid_metric_name[1:]
        self.num_epochs = hparams.num_epochs
        self.save_dir = hparams.save_dir
        self.log_steps = hparams.log_steps
        self.valid_steps = hparams.valid_steps
        self.save_checkpoint = hparams.save_checkpoint
        self.save_summary = hparams.save_summary

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.logger = logger or get_logger(
            os.path.join(self.save_dir, "trainer.log"), "trainer")

        if self.save_summary:
            from visualdl import LogWriter
            self.summary_logger = LogWriter(os.path.join(
                self.save_dir, "summary"), sync_cycle=10000)
            # tag -> scalar component caches, filled lazily
            self.train_summary = {}
            self.valid_summary = {}

        self.batch_metrics_tracker = MetricsTracker()
        self.token_metrics_tracker = MetricsTracker()

        # Best metric starts at +/-inf depending on optimization direction.
        self.best_valid_metric = float(
            "inf" if self.is_decreased_valid_metric else "-inf")
        self.epoch = 0
        self.batch_num = 0

    def train_epoch(self,
                    train_iter,
                    valid_iter,
                    infer_iter=None,
                    infer_parse_dict=None):
        """
        Train an epoch.

        @param train_iter
        @type : DataLoader

        @param valid_iter
        @type : DataLoader

        @param infer_iter
        @type : DataLoader

        @param infer_parse_dict
        @type : dict of function
        """
        self.epoch += 1
        num_batches = len(train_iter)
        self.batch_metrics_tracker.clear()
        self.token_metrics_tracker.clear()
        times = []
        for batch_id, (batch, batch_size) in enumerate(train_iter, 1):
            # Convert every field of the batch to a tensor.
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            batch["epoch"] = self.epoch
            batch["num_steps"] = self.batch_num

            # Do a training iteration
            start_time = time.time()
            metrics = self.model(batch, is_training=True)
            token_num = metrics.pop("token_num", None)
            elapsed = time.time() - start_time
            times.append(elapsed)

            # Metrics containing "token" in their name are averaged per
            # token; the rest per batch element.
            batch_metrics = {
                k: v
                for k, v in metrics.items() if "token" not in k
            }
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            self.batch_metrics_tracker.update(batch_metrics, batch_size)
            self.token_metrics_tracker.update(token_metrics, token_num)
            self.batch_num += 1

            if self.log_steps and batch_id % self.log_steps == 0:
                batch_metrics_message = self.batch_metrics_tracker.value()
                token_metrics_message = self.token_metrics_tracker.value()
                message_prefix = f"[Train][{self.epoch}][{batch_id}/{num_batches}]"
                avg_time = f"AVG_Time-{sum(times[-self.log_steps:]) / self.log_steps:.3f}"
                message = " ".join([
                    message_prefix, batch_metrics_message,
                    token_metrics_message, avg_time
                ])
                self.logger.info(message)

                if self.save_summary:
                    # Lazily create one scalar component per metric tag.
                    with self.summary_logger.mode("train"):
                        for k, v in self.batch_metrics_tracker.items():
                            if k not in self.train_summary:
                                self.train_summary[k] = self.summary_logger.scalar(
                                    k)
                            scalar = self.train_summary[k]
                            scalar.add_record(self.batch_num, v)
                        for k, v in self.token_metrics_tracker.items():
                            if k not in self.train_summary:
                                self.train_summary[k] = self.summary_logger.scalar(
                                    k)
                            scalar = self.train_summary[k]
                            scalar.add_record(self.batch_num, v)

            if self.valid_steps and valid_iter is not None and \
                    batch_id % self.valid_steps == 0:
                self.evaluate(valid_iter)

        # End-of-epoch validation and (optional) inference.
        if valid_iter is not None:
            self.evaluate(valid_iter)

        if infer_iter is not None and infer_parse_dict is not None:
            self.infer(infer_iter, infer_parse_dict)

        return

    def infer(self, data_iter, parse_dict, num_batches=None):
        """
        Inference interface.

        @param : data_iter
        @type : DataLoader

        @param : parse_dict
        @type : dict of function

        @param : num_batches : the number of batch to infer
        @type : int/None
        """
        self.logger.info("Generation starts ...")
        infer_save_file = os.path.join(self.save_dir,
                                       f"infer_{self.epoch}.result.json")

        # Inference
        infer_results = []
        batch_cnt = 0
        begin_time = time.time()
        for batch, batch_size in tqdm(data_iter, total=num_batches):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            result = self.model.infer(inputs=batch)
            batch_result = {}

            def to_list(batch):
                """ Parse list. """
                return batch.tolist()

            # parse: apply the per-key parser from parse_dict, defaulting
            # to .tolist() for raw arrays.
            for k in result:
                if k in parse_dict:
                    parse_fn = parse_dict[k]
                else:
                    parse_fn = to_list
                if result[k] is not None:
                    batch_result[k] = parse_fn(result[k])

            # Re-group column-wise results into one dict per example.
            for vs in zip(*batch_result.values()):
                infer_result = {}
                for k, v in zip(batch_result.keys(), vs):
                    infer_result[k] = v
                infer_results.append(infer_result)

            batch_cnt += 1
            if batch_cnt == num_batches:
                break

        self.logger.info(f"Saved inference results to {infer_save_file}")
        with open(infer_save_file, "w") as fp:
            json.dump(infer_results, fp, indent=2)
        infer_metrics_tracker = evaluate_generation_result(infer_results)
        metrics_message = infer_metrics_tracker.summary()
        message_prefix = f"[Infer][{self.epoch}]"
        time_cost = f"TIME-{time.time() - begin_time:.3f}"
        message = " ".join([message_prefix, metrics_message, time_cost])
        self.logger.info(message)
        return

    def evaluate(self, data_iter, need_save=True):
        """
        Evaluation interface

        @param : data_iter
        @type : DataLoader

        @param : need_save
        @type : bool
        """
        # Only the rank-0 worker saves in data-parallel mode.
        if isinstance(self.model, parallel.DataParallel):
            need_save = need_save and parallel.Env().local_rank == 0

        # Evaluation — uses fresh local trackers, separate from the
        # training trackers on self.
        begin_time = time.time()
        batch_metrics_tracker = MetricsTracker()
        token_metrics_tracker = MetricsTracker()
        for batch, batch_size in data_iter:
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            metrics = self.model(batch, is_training=False)
            token_num = int(metrics.pop("token_num"))
            batch_metrics = {
                k: v
                for k, v in metrics.items() if "token" not in k
            }
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            batch_metrics_tracker.update(batch_metrics, batch_size)
            token_metrics_tracker.update(token_metrics, token_num)
        batch_metrics_message = batch_metrics_tracker.summary()
        token_metrics_message = token_metrics_tracker.summary()
        message_prefix = f"[Valid][{self.epoch}]"
        time_cost = f"TIME-{time.time() - begin_time:.3f}"
        message = " ".join([
            message_prefix, batch_metrics_message, token_metrics_message,
            time_cost
        ])
        self.logger.info(message)

        if need_save:
            # Check valid metric
            cur_valid_metric = batch_metrics_tracker.get(
                self.valid_metric_name)
            if self.is_decreased_valid_metric:
                is_best = cur_valid_metric < self.best_valid_metric
            else:
                is_best = cur_valid_metric > self.best_valid_metric
            if is_best:
                # Save current best model
                self.best_valid_metric = cur_valid_metric
                best_model_path = os.path.join(self.save_dir, "best.model")
                save(self.model, best_model_path)
                self.logger.info(
                    f"Saved best model to '{best_model_path}' with new best valid metric "
                    f"{self.valid_metric_name.upper()}-{self.best_valid_metric:.3f}"
                )

            # Save checkpoint
            if self.save_checkpoint:
                model_file = os.path.join(self.save_dir,
                                          f"epoch_{self.epoch}.model")
                save(self.model, model_file)

            if self.save_summary:
                # NOTE(review): this iterates self.batch_metrics_tracker /
                # self.token_metrics_tracker (the *training* trackers), not
                # the local validation trackers built above — so "valid"
                # summaries appear to record training metrics. Looks like a
                # bug; confirm before relying on these curves.
                with self.summary_logger.mode("valid"):
                    for k, v in self.batch_metrics_tracker.items():
                        if k not in self.valid_summary:
                            self.valid_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.valid_summary[k]
                        scalar.add_record(self.batch_num, v)
                    for k, v in self.token_metrics_tracker.items():
                        if k not in self.valid_summary:
                            self.valid_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.valid_summary[k]
                        scalar.add_record(self.batch_num, v)

        return
class SolverWrapper:
    """Wrap a Caffe SGDSolver: drive training step-by-step, log train
    loss/accuracy to VisualDL, and periodically evaluate on the full
    validation set."""

    def __init__(self, solver_prototxt, log_dir, pretrained_model=None):
        """Create the solver from `solver_prototxt`, optionally load
        pretrained weights, parse the solver parameters and set up the
        VisualDL scalar components."""
        self.solver = caffe.SGDSolver(solver_prototxt)
        if pretrained_model is not None:
            print('Loading pretrained model weights from {:s}'.format(pretrained_model))
            self.solver.net.copy_from(pretrained_model)
        # Parse the prototxt again to access solver parameters directly.
        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)
        self.cur_epoch = 0
        self.test_interval = 30  # used in place of self.solver_param.test_interval
        self.logw = LogWriter(log_dir, sync_cycle=10)
        with self.logw.mode('train') as logger:
            self.sc_train_loss = logger.scalar("loss")
            self.sc_train_acc = logger.scalar("Accuracy")
        with self.logw.mode('val') as logger:
            self.sc_val_acc = logger.scalar("Accuracy(acc)")
            self.sc_val_auc = logger.scalar("Area Under Roc Curve(auc)")
            self.sc_val_ap = logger.scalar("Average Precision(ap)")
            self.sc_val_se = logger.scalar("Sensitivity(se)")
            self.sc_val_sp = logger.scalar("Specificity(sp)")

    def train_model(self):
        """Run the whole training flow, interleaved with validation.

        Steps the solver one iteration at a time so that per-step loss and
        accuracy can be recorded; validates every `test_interval` steps.
        """
        cur_iter = 0
        test_batch_size, num_classes = self.solver.test_nets[0].blobs['prob'].shape
        num_test_images_tot = test_batch_size * self.solver_param.test_iter[0]
        while cur_iter < self.solver_param.max_iter:
            #self.solver.step(self.test_interval)
            for i in range(self.test_interval):
                self.solver.step(1)
                loss = self.solver.net.blobs['loss'].data
                acc = self.solver.net.blobs['accuracy'].data
                step = self.solver.iter
                self.sc_train_loss.add_record(step, loss)
                self.sc_train_acc.add_record(step, acc)
            self.eval_on_val(num_classes, num_test_images_tot, test_batch_size)
            cur_iter += self.test_interval

    def eval_on_val(self, num_classes, num_test_images_tot, test_batch_size):
        """Run inference and evaluation over the whole validation set and
        record acc/auc/ap/se/sp to VisualDL."""
        # Share the current training weights with the test net.
        self.solver.test_nets[0].share_with(self.solver.net)
        self.cur_epoch += 1
        scores = np.zeros((num_classes, num_test_images_tot), dtype=float)
        gt_labels = np.zeros((1, num_test_images_tot), dtype=float).squeeze()
        for t in range(self.solver_param.test_iter[0]):
            output = self.solver.test_nets[0].forward()
            probs = output['prob']
            labels = self.solver.test_nets[0].blobs['label'].data
            gt_labels[t*test_batch_size:(t+1)*test_batch_size] = labels.T.astype(float)
            scores[:,t*test_batch_size:(t+1)*test_batch_size] = probs.T
        # TODO: handle the case where the last batch holds fewer than
        # num_test_images_per_batch samples
        acc, auc, ap, se, sp = perfeval.isic_cls_eval(scores, gt_labels)
        print('====================================================================\n')
        print('\tDo validation after the {:d}-th training epoch\n'.format(self.cur_epoch))
        print('>>>>', end='\t')  # marker that makes these metrics easy to grep out of logs
        print('acc={:.3f}, auc={:.3f}, ap={:.3f}, se={:.3f}, sp={:.3f}\n'.format(acc, auc, ap, se, sp))
        print('\n====================================================================\n')
        step = self.solver.iter
        self.sc_val_acc.add_record(step, acc)
        self.sc_val_auc.add_record(step, auc)
        self.sc_val_ap.add_record(step, ap)
        self.sc_val_se.add_record(step, se)
        self.sc_val_sp.add_record(step, sp)
def train(args):
    """OCR training.

    Builds the CRNN-CTC network, optionally restores an initial model,
    then trains for args.total_step iterations (or indefinitely), logging
    metrics to VisualDL, periodically evaluating on the test set, saving
    checkpoints, and printing benchmark statistics at the end.
    """
    if args.model == "crnn_ctc":
        train_net = ctc_train_net
        get_feeder_data = get_ctc_feeder_data

    num_classes = None
    train_images = args.train_images
    train_list = args.train_list
    test_images = args.test_images
    test_list = args.test_list
    num_classes = data_reader.num_classes() if num_classes is None else num_classes
    data_shape = data_reader.data_shape()

    # define network
    sum_cost, error_evaluator, inference_program, model_average = train_net(
        args, data_shape, num_classes)

    # VisualDL scalar components for train/val curves
    logger = LogWriter('./log', sync_cycle=10)
    with logger.mode("train") as train_logger:
        train_acc = train_logger.scalar("train_acc")
        train_loss = train_logger.scalar("train_loss")
        val_loss = train_logger.scalar("val_loss")
        val_acc = train_logger.scalar("val_acc")

    # data reader
    train_reader = data_reader.train(
        args.batch_size,
        train_images_dir=train_images,
        train_list_file=train_list,
        cycle=args.total_step > 0,
        model=args.model)
    test_reader = data_reader.test(
        test_images_dir=test_images,
        test_list_file=test_list,
        model=args.model)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    if 'ce_mode' in os.environ:
        fluid.default_startup_program().random_seed = 90
    exe.run(fluid.default_startup_program())

    # load init model
    print("Initing Model:****************")
    if args.init_model is not None:
        model_dir = args.init_model
        model_file_name = None
        if not os.path.isdir(args.init_model):
            model_dir = os.path.dirname(args.init_model)
            # BUGFIX: this assignment was duplicated in the original.
            model_file_name = os.path.basename(args.init_model)
        # NOTE(review): loads a hard-coded filename from args.init_model
        # and ignores the model_dir/model_file_name computed above —
        # presumably a leftover from debugging; confirm before reuse.
        fluid.io.load_params(exe,
                             dirname=args.init_model,
                             filename="model_369000")
        print("Init model from: %s." % args.init_model)

    train_exe = exe
    error_evaluator.reset(exe)
    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=True if args.use_gpu else False,
            loss_name=sum_cost.name)

    fetch_vars = [sum_cost] + error_evaluator.metrics

    def train_one_batch(data):
        """Run one training step; return [loss, ...evaluator metrics]."""
        var_names = [var.name for var in fetch_vars]
        if args.parallel:
            results = train_exe.run(var_names,
                                    feed=get_feeder_data(data, place))
            # ParallelExecutor returns per-device values; sum them.
            results = [np.array(result).sum() for result in results]
        else:
            results = train_exe.run(feed=get_feeder_data(data, place),
                                    fetch_list=fetch_vars)
            results = [result[0] for result in results]
        return results

    def test(iter_num):
        """Evaluate on the whole test set; record val loss/acc at iter_num."""
        error_evaluator.reset(exe)
        res = 0
        i = 0
        for data in test_reader():
            cost = exe.run(inference_program,
                           feed=get_feeder_data(data, place),
                           fetch_list=[sum_cost])
            res += cost[0][0]
            i += 1
        val_loss.add_record(iter_num, res / i)
        _, test_seq_error = error_evaluator.eval(exe)
        print("\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
            time.time(), iter_num, str(test_seq_error[0])))
        val_acc.add_record(iter_num, 1 - test_seq_error[0])
        #Note: The following logs are special for CE monitoring.
        #Other situations do not need to care about these logs.
        print("kpis test_acc %f" % (1 - test_seq_error[0]))

    def save_model(args, exe, iter_num):
        """Persist current parameters as model_<iter_num> in save_model_dir."""
        filename = "model_%05d" % iter_num
        fluid.io.save_params(
            exe, dirname=args.save_model_dir, filename=filename)
        print("Saved model to: %s/%s." % (args.save_model_dir, filename))

    iter_num = 0
    stop = False
    start_time = time.time()
    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
        batch_times = []
        # train a pass
        for data in train_reader():
            if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num:
                stop = True
                break
            if iter_num < args.skip_batch_num:
                print("Warm-up iteration")
            if iter_num == args.skip_batch_num:
                profiler.reset_profiler()
            start = time.time()
            results = train_one_batch(data)
            batch_time = time.time() - start
            batch_times.append(batch_time)
            total_loss += results[0]
            total_seq_error += results[2]

            iter_num += 1
            # training log
            if iter_num % args.log_period == 0:
                avg_loss = total_loss / (args.log_period)
                avg_err = total_seq_error / (args.log_period * args.batch_size)
                print("\nTime: %s; Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f" % (
                    time.time(), iter_num, avg_loss, avg_err))
                print("kpis train_cost %f" % (avg_loss))
                print("kpis train_acc %f" % (1 - avg_err))
                train_loss.add_record(iter_num, avg_loss)
                train_acc.add_record(iter_num, 1 - avg_err)
                total_loss = 0.0
                total_seq_error = 0.0

            # evaluate (under model averaging when available)
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        test(iter_num)
                else:
                    test(iter_num)

            # save model
            if iter_num % args.save_model_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)

    end_time = time.time()
    print("kpis train_duration %f" % (end_time - start_time))

    # Postprocess benchmark data (warm-up iterations excluded)
    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    fpses = np.divide(args.batch_size, latencies)
    fps_avg = np.average(fpses)
    # 1st-percentile fps corresponds to the 99th-percentile latency
    fps_pc99 = np.percentile(fpses, 1)

    # Benchmark output
    print('\nTotal examples (incl. warm-up): %d' %
          (iter_num * args.batch_size))
    print('average latency: %.5f s, 99pc latency: %.5f s' %
          (latency_avg, latency_pc99))
    print('average fps: %.5f, fps for 99pc latency: %.5f' %
          (fps_avg, fps_pc99))
def train(model, args):
    """Train a few-shot relation network with episode-based training.

    Logs sliding-window train loss/acc and parameter histograms to
    VisualDL, validates every args.val_steps episodes (saving the best
    inference model), then runs a final test pass.
    """
    # 1. Create VisualDL logger
    logwriter = LogWriter(os.path.join(args.logdir, "visualdl_log"),
                          sync_cycle=10)
    with logwriter.mode("Train") as writer:
        train_loss_scalar = writer.scalar("loss")
        train_acc_scalar = writer.scalar("acc")
        histogram1 = writer.histogram("Relation-BiLinear-W", 100)
        histogram2 = writer.histogram("Relation-BiLinear-b", 10)
        histogram3 = writer.histogram("Relation-FC-W", 100)
    with logwriter.mode("Val") as writer:
        val_acc_scalar = writer.scalar("acc")

    # 2. Setup program
    train_prog = fluid.default_main_program()
    train_startup = fluid.default_startup_program()
    train_reader = model.train_reader
    val_reader = model.val_reader
    test_reader = model.test_reader
    loss = model.loss
    mean_acc = model.mean_acc
    # Clone for val / test (before minimize so no optimizer ops leak in)
    val_prog = train_prog.clone(for_test=True)
    test_prog = train_prog.clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
    optimizer.minimize(loss)

    # 3. Setup executor
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(train_startup)

    # 4. Get Relation Module params for VisualDL
    # print(fluid.io.get_program_parameter(train_startup))
    relation_BL_w = train_startup.global_block().var("Relation-BiLinear.w_0")
    relation_BL_b = train_startup.global_block().var("Relation-BiLinear.b_0")
    relation_FC_w = train_startup.global_block().var("Relation-FC.w_0")

    # 5. Compile
    print("Compilling...")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name)
    compiled_val_prog = fluid.CompiledProgram(val_prog).with_data_parallel(
        share_vars_from=compiled_train_prog)
    compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel(
        share_vars_from=compiled_train_prog)

    # 6. Setup data source
    token2idx_dict, unk_idx, pad_idx = get_token2id_dict(args.emb_path)
    print("Setup dataloader...")
    places = fluid.cuda_places() if args.use_cuda else fluid.cpu_places()
    train_reader.set_sample_generator(train_loader(args.train_data_path,
                                                   args.N, args.K, args.Q,
                                                   token2idx_dict, unk_idx,
                                                   pad_idx, args.max_length),
                                      batch_size=args.batch_size,
                                      places=places)
    val_reader.set_sample_generator(val_test_loader(args.val_data_path,
                                                    args.N, args.K, args.Q,
                                                    token2idx_dict, unk_idx,
                                                    pad_idx, args.max_length,
                                                    data_type="val"),
                                    batch_size=1,
                                    places=places)
    test_reader.set_sample_generator(val_test_loader(args.test_data_path,
                                                     args.N, args.K, args.Q,
                                                     token2idx_dict, unk_idx,
                                                     pad_idx, args.max_length,
                                                     data_type="test"),
                                     batch_size=1,
                                     places=places)

    # 7. Train loop
    # Record the best model
    best_val_acc = 0
    # Record the train loss/acc by sliding window
    loss_record, acc_record = [], []
    loss_window = acc_window = 0  # Sum of sliding window
    window = 50  # The size of sliding window
    for epi, train_data in zip(range(1, args.train_episodes + 1),
                               train_reader()):
        # 7.1 Run
        (train_cur_loss, train_cur_acc, relation_BL_w_value,
         relation_BL_b_value, relation_FC_w_value) = exe.run(
             program=compiled_train_prog,
             feed=train_data,
             fetch_list=[
                 loss.name, mean_acc.name, relation_BL_w.name,
                 relation_BL_b.name, relation_FC_w.name
             ])
        # print(train_cur_loss[0], train_cur_acc[0])
        loss_record.append(train_cur_loss[0])
        acc_record.append(train_cur_acc[0])
        # sliding window: + right - left
        loss_window += train_cur_loss[0]
        acc_window += train_cur_acc[0]
        if epi - window - 1 >= 0:  # Ensure that the left side is in the sliding window
            loss_window -= loss_record[epi - window - 1]
            acc_window -= acc_record[epi - window - 1]
        if epi % window == 0:
            print(
                "{} [Train episode: {:5d}/{:5d}] ==> Loss: {:2.6f} Mean acc: {:2.4f}"
                .format(
                    str(datetime.datetime.now())[:-7], epi,
                    args.train_episodes, loss_window / window,
                    100 * acc_window / window))
            # 7.2 Add metrics/params to VisualDL
            train_loss_scalar.add_record(epi, loss_window / window)
            train_acc_scalar.add_record(epi, acc_window / window)
            histogram1.add_record(epi, relation_BL_w_value.flatten())
            histogram2.add_record(epi, relation_BL_b_value.flatten())
            histogram3.add_record(epi, relation_FC_w_value.flatten())

        # 7.3 Validation
        if args.val_data_path and epi % args.val_steps == 0:
            # 7.3.1 Run val once
            # NOTE(review): `eval` here shadows the builtin — presumably a
            # project evaluation helper defined elsewhere in this file.
            val_acc_mean = eval(exe, compiled_val_prog, val_reader,
                                [mean_acc.name], run_type="Val")
            print("{} [Val result: {:5d}/{:5d}] ==> Mean acc: {:2.4f}".format(
                str(datetime.datetime.now())[:-7], epi, args.train_episodes,
                100 * val_acc_mean))
            # Add val acc to VisualDL
            val_acc_scalar.add_record(epi, val_acc_mean)
            # 7.3.2 Save best model
            if val_acc_mean > best_val_acc:
                best_val_acc = val_acc_mean
                fluid.io.save_inference_model(
                    os.path.join(args.logdir, "infer_model"),
                    ["totalQ", "support", "support_len", "query", "query_len"],
                    [model.prediction],
                    exe,
                    main_program=train_prog,
                    params_filename="__params__")
                print(
                    "{} [Save model of val mean acc: {:2.4f}] ==> {}".format(
                        str(datetime.datetime.now())[:-7], 100 * best_val_acc,
                        os.path.join(args.logdir, "infer_model")))

    # 8. Test
    if args.test_data_path:
        test_acc_mean = eval(exe, compiled_test_prog, test_reader,
                             [mean_acc.name], run_type="Test")
        print("{} [Test result] ==> Mean acc: {:2.4f}".format(
            str(datetime.datetime.now())[:-7], 100 * test_acc_mean))
import os import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.v2 as paddle from paddle.fluid.initializer import NormalInitializer from paddle.fluid.param_attr import ParamAttr from visualdl import LogWriter from dataset import Dataset from net_fluid import simplenet # 创建VisualDL,并指定当前该项目的VisualDL的路径 logdir = "./logs" logwriter = LogWriter(logdir, sync_cycle=10) # 创建loss的趋势图 with logwriter.mode("train") as writer: loss_scalar = writer.scalar("loss") # 创建acc的趋势图 with logwriter.mode("train") as writer: acc_scalar = writer.scalar("acc") # 定义输出频率 num_samples = 4 # 创建卷积层和输出图像的图形化展示 with logwriter.mode("train") as writer: conv_image = writer.image("conv_image", num_samples, 1) input_image = writer.image("input_image", num_samples, 1) # 创建可视化的训练模型结构 with logwriter.mode("train") as writer:
def train():
    """Train BaseModel on generated movie sequences and log metrics to VisualDL.

    Relies on module-level configuration (batch_size, n_frames, n_samples,
    max_epoch, validation_split, log_path, params_path) and module-level
    helpers (reader, split, BaseModel, Executor). Saves an inference model
    to `params_path` when training finishes.
    """
    model = BaseModel(batch_size=batch_size, maxlen=n_frames)
    loss, acc, output, no_grad_set = model.build_graph()

    main_program = fluid.default_main_program()
    # Clone for inference BEFORE minimize() so the test program carries
    # no backward / optimizer ops.
    inference_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adadelta(0.001)
    optimizer.minimize(loss, no_grad_set=no_grad_set)

    place = fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    # VisualDL scalar curves for train / validation metrics.
    log_writter = LogWriter(log_path, sync_cycle=10)
    with log_writter.mode("train") as logger:
        log_train_loss = logger.scalar(tag="train_loss")
        log_train_acc = logger.scalar(tag="train_acc")
    with log_writter.mode("validation") as logger:
        log_valid_loss = logger.scalar(tag="validation_loss")
        log_valid_acc = logger.scalar(tag="validation_acc")

    def prepare_input(batch):
        """Convert an (x, y, x_seqlen) batch into a fluid feed dict."""
        x, y, x_seqlen = batch
        return {
            'input': np.array(x).astype("float32"),
            'input_seqlen': np.array(x_seqlen).astype("int64"),
            'label': np.array(y).astype("float32"),
        }

    # (samples, seq, width, height, pixel)
    noisy_movies, shifted_movies = reader.generate_movies(n_samples, n_frames)
    data = noisy_movies[:1000], shifted_movies[:1000]
    train_data, validation_data = split(data, validation_split)

    step_id = 0
    for epoch_id in range(max_epoch):
        print("epoch id", epoch_id)
        valid_data_iter = reader.get_data_iter(validation_data, batch_size)
        train_data_iter = reader.get_data_iter(train_data, batch_size)

        # ---- train ----
        total_loss = 0
        batch_id = 0
        for batch in train_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=main_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)
            cost_train = np.array(fetch_outs[0])
            acc_train = fetch_outs[1]
            total_loss += cost_train
            # Log/print every 5 batches; `total_loss` is the running sum
            # over that window (kept as in the original logging scheme).
            if batch_id > 0 and batch_id % 5 == 0:
                log_train_loss.add_record(step_id, total_loss)
                log_train_acc.add_record(step_id, acc_train)
                step_id += 1
                print("current loss: %.7f, for batch %d" %
                      (total_loss, batch_id))
                total_loss = 0.0
            batch_id += 1

        # ---- validate ----
        total_loss = 0
        total_acc = 0
        batch_id = 0
        for batch in valid_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=inference_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)
            total_loss += np.array(fetch_outs[0])
            # BUG FIX: accuracy was fetched but never accumulated, so the
            # validation accuracy curve was always recorded as 0.
            total_acc += fetch_outs[1]
            batch_id += 1
        log_valid_loss.add_record(epoch_id, total_loss)
        if batch_id:  # guard against an empty validation set
            log_valid_acc.add_record(epoch_id, total_acc / batch_id)
        print("validation loss: %.7f" % (total_loss))

    fluid.io.save_inference_model(
        dirname=params_path,
        feeded_var_names=['input', 'input_seqlen'],
        target_vars=[loss, acc],
        executor=exe)
# Step 3. Run the forward pass, getting log probabilities over next # words log_probs = model(context_idxs) # Step 4. Compute your loss function. (Again, Torch wants the target # word wrapped in a variable) loss = loss_function( log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long)) # Step 5. Do the backward pass and update the gradient loss.backward() optimizer.step() # Get the Python number from a 1-element Tensor by calling tensor.item() total_loss += loss.item() losses.append(total_loss) print(losses) # The loss decreased every iteration over the training data! # VisualDL setup logw = LogWriter("./embedding_log", sync_cycle=10000) with logw.mode('train') as logger: embedding = logger.embedding() embeddings_list = model.embeddings.weight.data.numpy( ) # convert to numpy array # VisualDL embedding log writer takes two parameters # The first parameter is embedding list. The type is list[list[float]] # The second parameter is word_dict. The type is dictionary<string, int>. embedding.add_embeddings_with_word_dict(embeddings_list, word_to_ix)
# coding=utf-8
from visualdl import LogWriter

# Demo: emit three synthetic scalar curves that VisualDL can display.
log_writer = LogWriter(".", sync_cycle=20)

# Scalar components under the "train" mode.
with log_writer.mode("train") as logger:
    train_acc = logger.scalar("acc")
    train_loss = logger.scalar("loss")

# Scalar component under the "test" mode, reusing the "acc" tag.
with log_writer.mode("test") as logger:
    test_acc = logger.scalar("acc")

# Synthetic signal rising linearly from 0.0 towards 1.0.
samples = [i / 1000.0 for i in range(1000)]
for step, v in enumerate(samples):
    train_acc.add_record(step, v)              # linear ramp up
    train_loss.add_record(step, 1 / (v + 1))   # hyperbolic decay
    test_acc.add_record(step, 1 - v)           # linear ramp down
def get_result(test_for):
    """Pull benchmark logs for *test_for* from the DB and emit VisualDL logs.

    Writes protobuf scalar logs under
    ``<ROOT_PATH>/visualdl_logs/<paddle_version>/<test_for>`` and then
    mirrors that directory to ``<ROOT_PATH>/visualdl_logs/latest/<test_for>``.
    Returns None; prints a notice and bails out when no rows match.
    """
    result_logs = bm.ViewVisualDLLog.objects.filter(test_for=test_for)
    if not result_logs:
        print("no {} results in latest paddle version".format(test_for))
        return
    # The early return above guarantees at least one row here.
    paddle_version = result_logs[0].paddle_version
    version_path = os.path.join(conf.ROOT_PATH, 'visualdl_logs',
                                paddle_version)
    # NOTE(review): paths are interpolated into shell commands unquoted;
    # they come from config/DB, but shlex.quote would be safer.
    cmd = "if [ ! -d %s ]; then mkdir %s; fi" % (version_path, version_path)
    os.system(cmd)
    logdir = os.path.join(version_path, test_for)
    logdir_des = os.path.join(conf.ROOT_PATH, 'visualdl_logs', 'latest',
                              test_for)
    cmd = "if [ -e %s ]; then rm -rf %s; fi; mkdir %s" % (logdir, logdir,
                                                          logdir)
    os.system(cmd)
    log_writer = LogWriter(logdir, sync_cycle=1)

    def sample_log(result_log_dict, model, run_machine_type):
        """Downsample DB logs depending on model and run_machine_type.

        Currently unused — the call in the loop below is disabled.
        """
        # Default ratio so models other than 'ocr' cannot hit a NameError
        # (the original left sample_ratio unset on that path).
        sample_ratio = 1
        if run_machine_type.startswith("MULTI_MACHINE_MULTI"):
            sample_ratio = 62
        elif run_machine_type.startswith(
                ("MULTI_MACHINE_ONE", "ONE", "FOUR", "MULTI_GPU")):
            sample_ratio = 15
        for k, v in result_log_dict.items():
            sampled = v[::sample_ratio]  # keep every sample_ratio-th point
            result_log_dict[k] = [[i + 1, sampled[i][1]]
                                  for i in range(len(sampled))]
        return result_log_dict

    for log in result_logs:
        model = log.model
        test_for = log.test_for
        run_rpc_type = log.run_rpc_type.lower()
        run_machine_type = log.run_machine_type.lower()
        tag = "%s_%s_%s" % (test_for.split('_')[0], run_machine_type,
                            run_rpc_type)
        result_log_dict = json.loads(log.result_log)
        # Sampling is currently disabled:
        # result_log_dict = sample_log(result_log_dict, model, run_machine_type)
        print("visualdl_paint cur is: %s_%s_%s" % (model, tag,
                                                   log.cloud_job_id))
        for indicant, values in result_log_dict.items():
            # BUG FIX: the original wrote `with logge.mode(...) as logge:`,
            # rebinding the LogWriter name to the mode writer on the first
            # iteration, so later iterations called .mode() on the wrong
            # object. Use a distinct name for the mode writer.
            with log_writer.mode(indicant) as mode_logger:
                val_tag = mode_logger.scalar("%s/%s" % (model, tag))
            for step, value in values:
                # DB stores missing points as the literal string 'NaN'.
                if value != 'NaN':
                    val_tag.add_record(int(step), float(value))
    cmd = "rm -rf %s && cp -r %s %s" % (logdir_des, logdir, logdir_des)
    os.system(cmd)