def __init__(self, data_loader, epochs, save_epoch, model_path, numTransform, numRef):
    self.data_loader = data_loader
    self.epochs = epochs
    self.model_path = model_path
    self.save_epoch = save_epoch
    self.numTransform = numTransform
    self.numRef = numRef

    self.G = Generator(numTransform, numRef)
    self.D = Discriminator(numTransform, numRef)
    self.G_optim = optim.SGD(self.G.parameters(), lr=1e-3, momentum=0.9)
    self.D_optim = optim.SGD(self.D.parameters(), lr=1e-3, momentum=0.9)

    # NOTE: self.gpu_mode is referenced here but never assigned in this snippet;
    # it is presumably set elsewhere (e.g. from a command-line flag).
    if self.gpu_mode:
        self.G.cuda()
        self.D.cuda()
        self.BCE_loss = nn.BCELoss().cuda()
        self.L1_Loss = nn.L1Loss().cuda()
    else:
        self.BCE_loss = nn.BCELoss()
        self.L1_Loss = nn.L1Loss()

    self.save_path = model_path + '/model_%d.weights'

    logdir = model_path + "/tmp"
    logger = LogWriter(logdir, sync_cycle=10000)
    with logger.mode("train"):
        self.log_D_real_loss = logger.scalar("D/real_loss")
        self.log_D_fake_loss = logger.scalar("D/fake_loss")
        self.log_D_total_loss = logger.scalar("D/total_loss")
        self.log_G_D_loss = logger.scalar("G/D_Loss")
        self.log_G_L1_loss = logger.scalar("G/L1_Loss")
        self.log_G_total_loss = logger.scalar("G/total_Loss")
    with logger.mode("test"):
        self.log_test_loss = logger.scalar("test/loss")
mnist = mx.test_utils.get_mnist()
batch_size = 100

# Provide a folder to store data for log, model, image, etc. VisualDL's visualization will be
# based on this folder.
logdir = "./tmp"

# Initialize a logger instance. The 'sync_cycle' parameter means a log is written to disk
# after every 10 operations in memory.
logger = LogWriter(logdir, sync_cycle=10)

# Mark the components with the 'train' label.
with logger.mode("train"):
    # scalar0 is used to record scalar metrics while MXNet is training. We will record accuracy.
    # In the visualization, we can see the accuracy increasing as more training steps happen.
    scalar0 = logger.scalar("scalars/scalar0")
    image0 = logger.image("images/image0", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=100)

# Record training steps
cnt_step = 0


# MXNet provides many callback interfaces. Here we define our own callback method, which is
# called after every batch.
# https://mxnet.incubator.apache.org/api/python/callback/callback.html
def add_scalar():
    def _callback(param):
        with logger.mode("train"):
            global cnt_step
            # Here the value is the accuracy we want to record
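# The callback above is cut off before it records anything. Below is a minimal sketch of how
# it could be completed; this is an assumption, not the original code. It relies on MXNet
# passing a BatchEndParam whose eval_metric exposes the accuracy, and on the scalar0 and
# cnt_step objects defined above.
def add_scalar_sketch():
    def _callback(param):
        global cnt_step
        if param.eval_metric is not None:
            # get_name_value() returns a list of (metric_name, value) pairs
            name_value = dict(param.eval_metric.get_name_value())
            # Record the running accuracy against the global step counter
            scalar0.add_record(cnt_step, name_value.get('accuracy', 0.0))
        cnt_step += 1
    return _callback

# Hypothetical usage: pass it to Module.fit as a batch-end callback, e.g.
# model.fit(..., batch_end_callback=add_scalar_sketch())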
def main():
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)

    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    scheduler = cfg.LearningRate['schedulers'][0]
    if isinstance(scheduler, CosineDecayWithWarmup) and scheduler.max_iters is None:
        scheduler.max_iters = cfg.max_iters

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num, cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5)
        with vdl_writer.mode("train"):
            scalars = [
                vdl_writer.scalar(loss_name) for loss_name in train_keys
            ]
            mAP_scalar = vdl_writer.scalar("mAP")
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, scalar in zip(train_keys, scalars):
                    loss_value = stats[loss_name]
                    scalar.add_record(vdl_loss_step, loss_value)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_loader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(
                    results, cfg.metric, cfg.num_classes, resolution,
                    is_bbox_normalized, FLAGS.output_eval, map_type,
                    cfg['EvalReader']['dataset'])

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                if FLAGS.use_vdl:
                    mAP_scalar.add_record(vdl_mAP_step, box_ap_stats[0])
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
class Trainer(object):

    @classmethod
    def add_cmdline_argument(cls, parser):
        """ Add the cmdline arguments of trainer. """
        group = parser.add_argument_group("Trainer")
        group.add_argument(
            "--use_data_distributed",
            type=str2bool,
            default=False,
            help="Whether to use data distributed for parallel training.")
        group.add_argument(
            "--valid_metric_name",
            type=str,
            default="-loss",
            help="The validation metric determining which checkpoint is the best.")
        group.add_argument(
            "--num_epochs",
            type=int,
            default=10,
            help="Total number of training epochs to perform.")
        group.add_argument(
            "--save_dir",
            type=str,
            required=True,
            help="The output directory where the model will be saved.")
        group.add_argument(
            "--batch_size",
            type=int,
            default=8,
            help="Total batch size for training/evaluation/inference.")
        group.add_argument(
            "--log_steps",
            type=int,
            default=100,
            help="The number of training steps to output current metrics "
            "on past training dataset.")
        group.add_argument(
            "--valid_steps",
            type=int,
            default=2000,
            help="The number of training steps to perform an evaluation "
            "on validation datasets.")
        group.add_argument(
            "--save_checkpoint",
            type=str2bool,
            default=True,
            help="Whether to save one checkpoint for each training epoch.")
        group.add_argument(
            "--save_summary",
            type=str2bool,
            default=False,
            help="Whether to save a metrics summary for the VisualDL module.")
        DataLoader.add_cmdline_argument(group)
        return group

    def __init__(self, model, to_tensor, hparams, logger=None):
        # Use data distributed
        if hparams.use_data_distributed:
            strategy = parallel.prepare_context()
            if strategy is not None:
                parallel_model = parallel.DataParallel(model, strategy)
                model.before_backward_fn = parallel_model.scale_loss
                model.after_backward_fn = parallel_model.apply_collective_grads
                model = parallel_model

        self.model = model
        self.to_tensor = to_tensor

        self.is_decreased_valid_metric = hparams.valid_metric_name[0] == "-"
        self.valid_metric_name = hparams.valid_metric_name[1:]
        self.num_epochs = hparams.num_epochs
        self.save_dir = hparams.save_dir
        self.log_steps = hparams.log_steps
        self.valid_steps = hparams.valid_steps
        self.save_checkpoint = hparams.save_checkpoint
        self.save_summary = hparams.save_summary

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.logger = logger or get_logger(
            os.path.join(self.save_dir, "trainer.log"), "trainer")

        if self.save_summary:
            from visualdl import LogWriter
            self.summary_logger = LogWriter(
                os.path.join(self.save_dir, "summary"), sync_cycle=10000)
            self.train_summary = {}
            self.valid_summary = {}

        self.batch_metrics_tracker = MetricsTracker()
        self.token_metrics_tracker = MetricsTracker()

        self.best_valid_metric = float(
            "inf" if self.is_decreased_valid_metric else "-inf")
        self.epoch = 0
        self.batch_num = 0

    def train_epoch(self,
                    train_iter,
                    valid_iter,
                    infer_iter=None,
                    infer_parse_dict=None):
        """
        Train an epoch.
        @param train_iter
        @type : DataLoader

        @param valid_iter
        @type : DataLoader

        @param infer_iter
        @type : DataLoader

        @param infer_parse_dict
        @type : dict of function
        """
        self.epoch += 1
        num_batches = len(train_iter)
        self.batch_metrics_tracker.clear()
        self.token_metrics_tracker.clear()

        times = []
        for batch_id, (batch, batch_size) in enumerate(train_iter, 1):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            batch["epoch"] = self.epoch
            batch["num_steps"] = self.batch_num

            # Do a training iteration
            start_time = time.time()
            metrics = self.model(batch, is_training=True)
            token_num = metrics.pop("token_num", None)
            elapsed = time.time() - start_time
            times.append(elapsed)

            batch_metrics = {k: v for k, v in metrics.items() if "token" not in k}
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            self.batch_metrics_tracker.update(batch_metrics, batch_size)
            self.token_metrics_tracker.update(token_metrics, token_num)
            self.batch_num += 1

            if self.log_steps and batch_id % self.log_steps == 0:
                batch_metrics_message = self.batch_metrics_tracker.value()
                token_metrics_message = self.token_metrics_tracker.value()
                message_prefix = f"[Train][{self.epoch}][{batch_id}/{num_batches}]"
                avg_time = f"AVG_Time-{sum(times[-self.log_steps:]) / self.log_steps:.3f}"
                message = " ".join([
                    message_prefix, batch_metrics_message,
                    token_metrics_message, avg_time
                ])
                self.logger.info(message)

            if self.save_summary:
                with self.summary_logger.mode("train"):
                    for k, v in self.batch_metrics_tracker.items():
                        if k not in self.train_summary:
                            self.train_summary[k] = self.summary_logger.scalar(k)
                        scalar = self.train_summary[k]
                        scalar.add_record(self.batch_num, v)
                    for k, v in self.token_metrics_tracker.items():
                        if k not in self.train_summary:
                            self.train_summary[k] = self.summary_logger.scalar(k)
                        scalar = self.train_summary[k]
                        scalar.add_record(self.batch_num, v)

            if self.valid_steps and valid_iter is not None and \
                    batch_id % self.valid_steps == 0:
                self.evaluate(valid_iter)

        if valid_iter is not None:
            self.evaluate(valid_iter)

        if infer_iter is not None and infer_parse_dict is not None:
            self.infer(infer_iter, infer_parse_dict)

        return

    def infer(self, data_iter, parse_dict, num_batches=None):
        """
        Inference interface.

        @param : data_iter
        @type : DataLoader

        @param : parse_dict
        @type : dict of function

        @param : num_batches : the number of batches to infer
        @type : int/None
        """
        self.logger.info("Generation starts ...")
        infer_save_file = os.path.join(self.save_dir,
                                       f"infer_{self.epoch}.result.json")

        # Inference
        infer_results = []
        batch_cnt = 0
        begin_time = time.time()
        for batch, batch_size in tqdm(data_iter, total=num_batches):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            result = self.model.infer(inputs=batch)
            batch_result = {}

            def to_list(batch):
                """
                Parse list.
""" return batch.tolist() # parse for k in result: if k in parse_dict: parse_fn = parse_dict[k] else: parse_fn = to_list if result[k] is not None: batch_result[k] = parse_fn(result[k]) for vs in zip(*batch_result.values()): infer_result = {} for k, v in zip(batch_result.keys(), vs): infer_result[k] = v infer_results.append(infer_result) batch_cnt += 1 if batch_cnt == num_batches: break self.logger.info(f"Saved inference results to {infer_save_file}") with open(infer_save_file, "w") as fp: json.dump(infer_results, fp, indent=2) infer_metrics_tracker = evaluate_generation_result(infer_results) metrics_message = infer_metrics_tracker.summary() message_prefix = f"[Infer][{self.epoch}]" time_cost = f"TIME-{time.time() - begin_time:.3f}" message = " ".join([message_prefix, metrics_message, time_cost]) self.logger.info(message) return def evaluate(self, data_iter, need_save=True): """ Evaluation interface @param : data_iter @type : DataLoader @param : need_save @type : bool """ if isinstance(self.model, parallel.DataParallel): need_save = need_save and parallel.Env().local_rank == 0 # Evaluation begin_time = time.time() batch_metrics_tracker = MetricsTracker() token_metrics_tracker = MetricsTracker() for batch, batch_size in data_iter: batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])), batch.items())) metrics = self.model(batch, is_training=False) token_num = int(metrics.pop("token_num")) batch_metrics = { k: v for k, v in metrics.items() if "token" not in k } token_metrics = {k: v for k, v in metrics.items() if "token" in k} batch_metrics_tracker.update(batch_metrics, batch_size) token_metrics_tracker.update(token_metrics, token_num) batch_metrics_message = batch_metrics_tracker.summary() token_metrics_message = token_metrics_tracker.summary() message_prefix = f"[Valid][{self.epoch}]" time_cost = f"TIME-{time.time() - begin_time:.3f}" message = " ".join([ message_prefix, batch_metrics_message, token_metrics_message, time_cost ]) self.logger.info(message) if need_save: # Check valid metric cur_valid_metric = batch_metrics_tracker.get( self.valid_metric_name) if self.is_decreased_valid_metric: is_best = cur_valid_metric < self.best_valid_metric else: is_best = cur_valid_metric > self.best_valid_metric if is_best: # Save current best model self.best_valid_metric = cur_valid_metric best_model_path = os.path.join(self.save_dir, "best.model") save(self.model, best_model_path) self.logger.info( f"Saved best model to '{best_model_path}' with new best valid metric " f"{self.valid_metric_name.upper()}-{self.best_valid_metric:.3f}" ) # Save checkpoint if self.save_checkpoint: model_file = os.path.join(self.save_dir, f"epoch_{self.epoch}.model") save(self.model, model_file) if self.save_summary: with self.summary_logger.mode("valid"): for k, v in self.batch_metrics_tracker.items(): if k not in self.valid_summary: self.valid_summary[k] = self.summary_logger.scalar( k) scalar = self.valid_summary[k] scalar.add_record(self.batch_num, v) for k, v in self.token_metrics_tracker.items(): if k not in self.valid_summary: self.valid_summary[k] = self.summary_logger.scalar( k) scalar = self.valid_summary[k] scalar.add_record(self.batch_num, v) return
import random

from visualdl import LogWriter

logdir = './temp'
logger = LogWriter(logdir, sync_cycle=10)

with logger.mode('train'):
    scalar0 = logger.scalar('scalar0')

for step in range(0, 1000):
    scalar0.add_record(step, random.random())
class StorageTest(unittest.TestCase):
    def setUp(self):
        self.dir = "./tmp/storage_test"
        self.writer = LogWriter(self.dir, sync_cycle=1).as_mode("train")

    def test_scalar(self):
        print('test write')
        scalar = self.writer.scalar("model/scalar/min")
        # scalar.set_caption("model/scalar/min")
        for i in range(10):
            scalar.add_record(i, float(i))

        print('test read')
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/min")
            self.assertEqual(scalar.caption(), "train")
            records = scalar.records()
            ids = scalar.ids()
            self.assertTrue(
                np.equal(records, [float(i) for i in range(10 - 1)]).all())
            self.assertTrue(np.equal(ids, [float(i) for i in range(10)]).all())
            print('records', records)
            print('ids', ids)

    def test_image(self):
        tag = "layer1/layer2/image0"
        image_writer = self.writer.image(tag, 10, 1)
        num_passes = 10
        num_samples = 100
        shape = [10, 10, 3]

        for pass_ in range(num_passes):
            image_writer.start_sampling()
            for ins in range(num_samples):
                data = np.random.random(shape) * 256
                data = np.ndarray.flatten(data)
                image_writer.add_sample(shape, list(data))
            image_writer.finish_sampling()

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            image_reader = reader.image(tag)
            self.assertEqual(image_reader.caption(), tag)
            self.assertEqual(image_reader.num_records(), num_passes)

            image_record = image_reader.record(0, 1)
            self.assertTrue(np.equal(image_record.shape(), shape).all())
            data = image_record.data()
            self.assertEqual(len(data), np.prod(shape))

            image_tags = reader.tags("image")
            self.assertTrue(image_tags)
            self.assertEqual(len(image_tags), 1)

    def test_check_image(self):
        '''
        check whether the storage keeps image data consistent
        '''
        print('check image')
        tag = "layer1/check/image1"
        image_writer = self.writer.image(tag, 10)

        image = Image.open("./dog.jpg")
        shape = [image.size[1], image.size[0], 3]
        origin_data = np.array(image.getdata()).flatten()

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            image_writer.start_sampling()
            image_writer.add_sample(shape, list(origin_data))
            image_writer.finish_sampling()

            # read and check whether the original image will be displayed
            image_reader = reader.image(tag)
            image_record = image_reader.record(0, 0)
            data = image_record.data()
            shape = image_record.shape()

            PIL_image_shape = (shape[0] * shape[1], shape[2])
            data = np.array(data, dtype='uint8').reshape(PIL_image_shape)
            print('origin', origin_data.flatten())
            print('data', data.flatten())
            image = Image.fromarray(data.reshape(shape))
            # manually checked the image and found nothing wrong with the image storage.
            # image.show()

    def test_with_syntax(self):
        with self.writer.mode("train") as writer:
            scalar = writer.scalar("model/scalar/average")
            for i in range(10):
                scalar.add_record(i, float(i))

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/average")
            self.assertEqual(scalar.caption(), "train")

    def test_modes(self):
        store = LogWriter(self.dir, sync_cycle=1)

        scalars = []
        for i in range(10):
            with store.mode("mode-%d" % i) as writer:
                scalar = writer.scalar("add/scalar0")
                scalars.append(scalar)

        for scalar in scalars[:-1]:
            for i in range(10):
                scalar.add_record(i, float(i))
    model,
    base_learning_rate=0.1,
    policy="step",
    stepsize=1,
    gamma=0.999, )

# create VisualDL logger
logdir = "/workspace"
logger = LogWriter(logdir, sync_cycle=100)

# mark the components with 'train' label.
with logger.mode("train"):
    # create a scalar component called 'scalars/'
    scalar_caffe2_mnist_train_loss = logger.scalar(
        "scalars/scalar_caffe2_mnist_train_loss")
    scalar_caffe2_mnist_train_accuracy = logger.scalar(
        "scalars/scalar_caffe2_mnist_train_accuracy")
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=50)
    histogram1 = logger.histogram("histogram/histogram1", num_buckets=50)

# Specify the data will be input in NCHW order
# (i.e. [batch_size, num_channels, height, width])
arg_scope = {"order": "NCHW"}
# Create the model helper for the train model
train_model = model_helper.ModelHelper(name="mnist_train", arg_scope=arg_scope)
# Specify the input is from the train lmdb
data, label = AddInput(
    train_model,
    batch_size=64,
    db=os.path.join(data_folder, 'mnist-train-nchw-lmdb'),
    db_type='lmdb')
# get data
g, label, train_idx, valid_idx, test_idx, evaluator = get_graph_data(
    d_name=d_name, mini_data=eval(args.mini_data))

# create log writer
log_writer = LogWriter(args.log_path, sync_cycle=10)
with log_writer.mode("train") as logger:
    log_train_loss_epoch = logger.scalar("loss")
    log_train_rocauc_epoch = logger.scalar("rocauc")
with log_writer.mode("valid") as logger:
    log_valid_loss_epoch = logger.scalar("loss")
    log_valid_rocauc_epoch = logger.scalar("rocauc")
log_text = log_writer.text("text")
log_time = log_writer.scalar("time")
log_test_loss = log_writer.scalar("test_loss")
log_test_rocauc = log_writer.scalar("test_rocauc")

# training
samples = [25, 10]  # 2-hop sample size
batch_size = args.batch_size
sample_workers = 1

place = fluid.CUDAPlace(args.gpu_id) if args.use_gpu else fluid.CPUPlace()
train_program = fluid.Program()
startup_program = fluid.Program()

with fluid.program_guard(train_program, startup_program):
    gw = pgl.graph_wrapper.GraphWrapper(
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    fig, ax = plt.subplots()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    # we can either show the image or save it locally
    # plt.show()
    fig.savefig('out' + str(np.random.randint(0, 10000)) + '.pdf')


logdir = "./workspace"
logger = LogWriter(logdir, sync_cycle=100)

# mark the components with 'train' label.
with logger.mode("train"):
    # create a scalar component called 'scalars/'
    scalar_pytorch_train_loss = logger.scalar(
        "scalars/scalar_pytorch_train_loss")
    image1 = logger.image("images/image1", 1)
    image2 = logger.image("images/image2", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=100)

# get some random training images
dataiter = iter(trainloader)
images, labels = dataiter.next()

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

# Define a Convolution Neural Network
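# The components declared above (scalar_pytorch_train_loss, histogram0) are only created here.
# Below is a minimal sketch, an assumption rather than the original code, of recording into
# them during training. It assumes `net`, `criterion` and `optimizer` are the network, loss
# and optimizer from the standard CIFAR-10 tutorial this snippet follows, and that the
# network has a `conv1` layer.
train_step = 0
for epoch in range(2):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 500 == 499:
            # scalar: smoothed training loss over the last 500 mini-batches
            scalar_pytorch_train_loss.add_record(train_step, running_loss / 500)
            # histogram: weights of the first conv layer, flattened to a plain list
            weights = net.conv1.weight.detach().cpu().numpy().flatten().tolist()
            histogram0.add_record(train_step, weights)
            train_step += 1
            running_loss = 0.0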
def get_result(test_for):
    """
    get log from db and produce protobuf logs
    :return:
    """
    result_logs = bm.ViewVisualDLLog.objects.filter(test_for=test_for)
    if not result_logs:
        print("no {} results in latest paddle version".format(test_for))
        return
    paddle_version = result_logs[0].paddle_version if result_logs else ''
    version_path = os.path.join(conf.ROOT_PATH, 'visualdl_logs', paddle_version)
    cmd = "if [ ! -d %s ]; then mkdir %s; fi" % (version_path, version_path)
    os.system(cmd)
    logdir = os.path.join(version_path, test_for)
    #logdir_des = conf.ROOT_PATH + '/visualdl_logs/latest'
    logdir_des = os.path.join(conf.ROOT_PATH, 'visualdl_logs', 'latest', test_for)
    cmd = "if [ -e %s ]; then rm -rf %s; fi; mkdir %s" % (logdir, logdir, logdir)
    os.system(cmd)
    logge = LogWriter(logdir, sync_cycle=1)

    def sample_log(result_log_dict, model, run_machine_type):
        """sample log from db log depending on model and run_machine_type"""
        if model == 'ocr':
            sample_ratio = 1
            if run_machine_type.startswith("MULTI_MACHINE_MULTI"):
                sample_ratio = 62
            elif run_machine_type.startswith("MULTI_MACHINE_ONE"):
                sample_ratio = 15
            elif run_machine_type.startswith("ONE"):
                sample_ratio = 15
            elif run_machine_type.startswith("FOUR"):
                sample_ratio = 15
            elif run_machine_type.startswith("MULTI_GPU"):
                sample_ratio = 15
            for k, v in result_log_dict.items():
                sample_list = [
                    v[index] for index in range(len(v))
                    if index % sample_ratio == 0
                ]
                result_log_dict[k] = [[index + 1, sample_list[index][1]]
                                      for index in range(len(sample_list))]
        return result_log_dict

    for log in result_logs:
        model = log.model
        test_for = log.test_for
        #code_from = log.code_from
        run_rpc_type = log.run_rpc_type.lower()
        run_machine_type = log.run_machine_type.lower()
        tag = "%s_%s_%s" % (test_for.split('_')[0], run_machine_type,
                            run_rpc_type)
        result_log_dict = json.loads(log.result_log)
        #sample_log_dict = sample_log(result_log_dict, model, run_machine_type)
        print("visualdl_paint cur is: %s_%s_%s" % (model, tag, log.cloud_job_id))
        for indicant, values in result_log_dict.items():
            with logge.mode(indicant) as logge:
                val_tag = logge.scalar("%s/%s" % (model, tag))
                for step, value in values:
                    if value != 'NaN':
                        val_tag.add_record(int(step), float(value))

    cmd = "rm -rf %s && cp -r %s %s" % (logdir_des, logdir, logdir_des)
    os.system(cmd)
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

# create VisualDL logger
logdir = "/workspace"
logger = LogWriter(logdir, sync_cycle=100)

# mark the components with 'train' label.
with logger.mode("train"):
    # create a scalar component called 'scalars/'
    scalar_keras_train_loss = logger.scalar(
        "scalars/scalar_keras_mnist_train_loss")
    image_input = logger.image("images/input", 1)
    image0 = logger.image("images/image0", 1)
    image1 = logger.image("images/image1", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=50)
    histogram1 = logger.histogram("histogram/histogram1", num_buckets=50)

train_step = 0


class LossHistory(keras.callbacks.Callback):
    def on_batch_end(self, batch, logs={}):
        global train_step

        # Scalar
        scalar_keras_train_loss.add_record(train_step, logs.get('loss'))
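# The callback above is truncated after recording the loss scalar. One way it could
# continue is sketched below as comments; this is an assumption, not the original code.
# It records a weight histogram for the first layer via the histogram0 component
# declared above, then advances the global step.
#
#         # Histogram: flattened weights of the first layer
#         weights = self.model.layers[0].get_weights()[0]
#         histogram0.add_record(train_step, weights.flatten().tolist())
#
#         train_step += 1
#
# The callback would then be attached during training, e.g.
# model.fit(x_train, y_train, batch_size=batch_size, callbacks=[LossHistory()]).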