def train_loop_pyreader():
    """Train via the started py_reader until cfg.max_iter or reader exhaustion.

    Logs smoothed stats every iteration and snapshots the model every
    ``cfg.TRAIN.snapshot_iter`` iterations.  On reader exhaustion the
    EOF exception is caught and the reader is reset.
    """
    py_reader.start()
    train_stats = TrainingStats(cfg.log_window, keys)
    try:
        start_time = time.time()
        # BUG FIX: the original computed `end_time - first_start_time`
        # below but never defined `first_start_time`, raising NameError
        # once the loop completed.  Capture the loop start here.
        first_start_time = start_time
        for iter_id in range(cfg.max_iter):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
            # outs[:-1] pair up with `keys`; outs[-1] is the learning rate.
            stats = {
                k: np.array(v).mean()
                for k, v in zip(keys, outs[:-1])
            }
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
        end_time = time.time()
        # Total wall-clock time for the whole loop, not just the last step.
        total_time = end_time - first_start_time
        last_loss = np.array(outs[0]).mean()
    except (StopIteration, fluid.core.EOFException):
        # Reader ran dry before max_iter; reset so it can be restarted.
        py_reader.reset()
def train_loop():
    """Feeder-driven training loop up to cfg.max_iter iterations.

    Logs smoothed stats each iteration, snapshots every
    ``cfg.TRAIN.snapshot_iter`` iterations, and (when ``cfg.enable_ce``)
    emits CE KPI lines.  Returns the final mean loss.
    """
    start_time = time.time()
    prev_start_time = start_time
    # Loop start timestamp; used below for the total wall-clock time.
    start = start_time
    train_stats = TrainingStats(cfg.log_window, keys)
    for iter_id, data in enumerate(train_reader()):
        prev_start_time = start_time
        start_time = time.time()
        outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                             feed=feeder.feed(data))
        # outs[:-1] pair up with `keys`; outs[-1] is the learning rate.
        stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
        train_stats.update(stats)
        logs = train_stats.log()
        strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
            now_time(), iter_id, np.mean(outs[-1]), logs,
            start_time - prev_start_time)
        print(strs)
        sys.stdout.flush()
        if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
            save_model("model_iter{}".format(iter_id))
        if (iter_id + 1) == cfg.max_iter:
            break
    end_time = time.time()
    # BUG FIX: originally `end_time - start_time`, which is only the LAST
    # iteration's duration, yet it was divided by the iteration count for
    # the per-pass KPI.  Use the saved loop start instead.
    total_time = end_time - start
    last_loss = np.array(outs[0]).mean()
    # only for ce
    if cfg.enable_ce:
        gpu_num = devices_num
        epoch_idx = iter_id + 1
        loss = last_loss
        print("kpis\teach_pass_duration_card%s\t%s" %
              (gpu_num, total_time / epoch_idx))
        print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
    # BUG FIX: the original returned np.mean(every_pass_loss) but
    # `every_pass_loss` was never defined (NameError); return the
    # final mean loss instead.
    return last_loss
def train_loop():
    """EAST-style training loop fed directly from ``data_generator``.

    Pulls a batch per iteration, runs the compiled program, logs every
    10 iterations, snapshots every ``cfg.TRAIN.snapshot_iter``
    iterations, and stops at ``cfg.max_iter`` (hard-capped at 100000).
    """
    start_time = time.time()
    prev_start_time = start_time
    # Loop start timestamp; used below for the total wall-clock time.
    start = start_time
    train_stats = TrainingStats(cfg.log_window, keys)
    for iter_id in range(100000):
        data = next(data_generator)
        prev_start_time = start_time
        start_time = time.time()
        # data[1] is intentionally not fed here; only indices 0, 2, 3, 4
        # map to the program's input placeholders.
        outs = exe.run(compiled_train_prog,
                       fetch_list=[v.name for v in fetch_list],
                       feed={"input_images": data[0],
                             "input_score_maps": data[2],
                             "input_geo_maps": data[3],
                             "input_training_masks": data[4]})
        # outs[:-1] pair up with `keys`; outs[-1] is the learning rate.
        stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
        train_stats.update(stats)
        logs = train_stats.log()
        strs = '{}, batch: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
            now_time(), iter_id, np.mean(outs[-1]), logs,
            start_time - prev_start_time)
        if iter_id % 10 == 0:
            print(strs)
            sys.stdout.flush()
        if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
            save_model(exe, "model_iter{}".format(iter_id), train_prog)
        if (iter_id + 1) == cfg.max_iter:
            break
    end_time = time.time()
    # BUG FIX: originally `end_time - start_time`, which measured only the
    # last iteration; use the saved loop start so total_time spans the run.
    total_time = end_time - start
    last_loss = np.array(outs[0]).mean()
def train_loop_pyreader():
    """Train via the started py_reader until cfg.max_iter or reader exhaustion.

    Supports a benchmark profiling window (iterations 10-15 when
    ``args.is_profiler`` is set, after which the function returns early)
    and emits CE KPI lines when ``cfg.enable_ce`` is set.
    """
    py_reader.start()
    train_stats = TrainingStats(cfg.log_window, keys)
    try:
        start_time = time.time()
        prev_start_time = start_time
        # BUG FIX: total_time below was `end_time - start_time`, i.e. only
        # the LAST iteration's duration, yet divided by the iteration count
        # for the per-pass KPI.  Remember the loop start instead.
        loop_start_time = start_time
        for iter_id in range(cfg.max_iter):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
            # outs[:-1] pair up with `keys`; outs[-1] is the learning rate.
            stats = {
                k: np.array(v).mean()
                for k, v in zip(keys, outs[:-1])
            }
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            # profiler tools, used for benchmark
            if args.is_profiler and iter_id == 10:
                profiler.start_profiler("All")
            elif args.is_profiler and iter_id == 15:
                profiler.stop_profiler("total", args.profiler_path)
                # Deliberate early exit: profiling runs skip CE reporting.
                return
        end_time = time.time()
        total_time = end_time - loop_start_time
        last_loss = np.array(outs[0]).mean()
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            loss = last_loss
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
    except (StopIteration, fluid.core.EOFException):
        # Reader ran dry before max_iter; reset so it can be restarted.
        py_reader.reset()
def train_loop():
    """DataLoader-driven training loop with periodic checkpointing.

    Logs every 10 iterations, checkpoints every
    ``cfg.TRAIN.snapshot_iter`` iterations (skipping iteration 0), and
    saves "model_final" when the last iteration is reached.  On loader
    exhaustion the EOF exception is caught and the loader is reset.
    """
    data_loader.start()
    train_stats = TrainingStats(cfg.log_window, keys)
    try:
        start_time = time.time()
        prev_start_time = start_time
        for iter_id in range(cfg.max_iter):
            prev_start_time = start_time
            start_time = time.time()
            outs = exe.run(compiled_train_prog,
                           fetch_list=[v.name for v in fetch_list])
            # outs[:-1] pair up with `keys`; outs[-1] is the learning rate.
            stats = {
                k: np.array(v).mean()
                for k, v in zip(keys, outs[:-1])
            }
            train_stats.update(stats)
            logs = train_stats.log()
            if iter_id % 10 == 0:
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
            if (iter_id) % cfg.TRAIN.snapshot_iter == 0 and iter_id != 0:
                save_name = "{}".format(iter_id)
                checkpoint.save(
                    exe, train_prog,
                    os.path.join(cfg.model_save_dir, save_name))
            # BUG FIX: the original tested `iter_id == cfg.max_iter`, which
            # can never hold inside range(cfg.max_iter) (max value is
            # cfg.max_iter - 1), so "model_final" was never saved.
            if (iter_id + 1) == cfg.max_iter:
                checkpoint.save(
                    exe, train_prog,
                    os.path.join(cfg.model_save_dir, "model_final"))
                break
        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()
    except (StopIteration, fluid.core.EOFException):
        # Loader ran dry before max_iter; reset so it can be restarted.
        data_loader.reset()
def train_loop():
    """Feeder-driven training loop.

    Runs until ``cfg.max_iter`` iterations (or the reader runs dry),
    logging smoothed stats each step and snapshotting every
    ``cfg.TRAIN.snapshot_iter`` iterations.
    """
    tracker = TrainingStats(cfg.log_window, keys)
    tic = time.time()
    for iter_id, batch in enumerate(train_reader()):
        prev_tic, tic = tic, time.time()
        fetched = train_exe.run(
            fetch_list=[v.name for v in fetch_list],
            feed=feeder.feed(batch))
        # All fetched values except the last map onto `keys`;
        # the last entry is the learning rate.
        tracker.update({name: np.array(val).mean()
                        for name, val in zip(keys, fetched[:-1])})
        message = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
            now_time(), iter_id, np.mean(fetched[-1]),
            tracker.log(), tic - prev_tic)
        print(message)
        sys.stdout.flush()
        if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
            save_model("model_iter{}".format(iter_id))
        if (iter_id + 1) == cfg.max_iter:
            break
    toc = time.time()
    total_time = toc - tic
    last_loss = np.array(fetched[0]).mean()
def train_loop():
    """Dygraph training loop for RetinaNet.

    For each batch: zero-pads the per-image ground-truth arrays to the
    batch's largest box count, runs a forward/backward pass, steps the
    optimizer, logs smoothed stats, snapshots every
    ``cfg.TRAIN.snapshot_iter`` iterations, and stops at ``cfg.max_iter``.
    When ``cfg.enable_ce`` is set, extra gradient checksums are printed
    for continuous-evaluation comparisons.
    """
    keys = ['loss', 'loss_cls', 'loss_bbox']
    train_stats = TrainingStats(cfg.log_window, keys)
    # Put the model in training mode (enables dropout/BN updates etc.).
    retinanet.train()
    for iter_id, data in enumerate(train_reader()):
        start_time = time.time()
        # Largest ground-truth box count in this batch; all per-image gt
        # arrays are padded to this length below.
        gt_max_num = 0
        batch_size = len(data)
        # NOTE(review): dead assignment — `x` is immediately rebound by
        # the loop on the next line.
        x = data[0]
        for x in data:
            #print(x[1].shape[0])
            if x[1].shape[0] > gt_max_num:
                gt_max_num = x[1].shape[0]
        # Per-sample tuple layout inferred from the uses below:
        # x[0]=image, x[1]=gt boxes, x[2]=gt labels, x[3]=is_crowd,
        # x[4]=im_info, x[5]=im_id — TODO confirm against the reader.
        image_data = np.array(
            [x[0] for x in data]).astype('float32')
        if cfg.enable_ce:
            print('image: {} {}'.format(abs(image_data).sum(),
                                        image_data.shape))
        # Pad boxes/labels with zeros; padded is_crowd slots default to 1
        # (presumably so padded entries are ignored as crowd — verify).
        gt_box_data = np.zeros([batch_size, gt_max_num, 4])
        gt_label_data = np.zeros([batch_size, gt_max_num])
        is_crowd_data = np.ones([batch_size, gt_max_num])
        for batch_id, x in enumerate(data):
            gt_num = x[1].shape[0]
            gt_box_data[batch_id, 0:gt_num, :] = x[1]
            gt_label_data[batch_id, 0:gt_num] = x[2]
            is_crowd_data[batch_id, 0:gt_num] = x[3]
        gt_box_data = gt_box_data.astype('float32')
        gt_label_data = gt_label_data.astype('int32')
        is_crowd_data = is_crowd_data.astype('int32')
        im_info_data = np.array(
            [x[4] for x in data]).astype('float32')
        # im_id_data is built but not passed to the model below.
        im_id_data = np.array(
            [x[5] for x in data]).astype('int32')
        # Forward pass in 'train' mode; returns losses plus intermediate
        # predictions used only for the CE gradient dumps below.
        outputs = retinanet('train', image_data, im_info_data, \
                            gt_box_data, gt_label_data, is_crowd_data)
        loss_cls = outputs['loss_cls']
        loss_bbox = outputs['loss_bbox']
        loss = outputs['loss']
        score_pred = outputs['score_pred']
        loc_pred = outputs['loc_pred']
        cls_pred_list = outputs['cls_score_list']
        bbox_pred_list = outputs['bbox_pred_list']
        cls_score = outputs['cls_score']
        bbox_pred = outputs['bbox_pred']
        # Snapshot loss values before `loss` is rebound by scale_loss.
        loss_cls_data = loss_cls.numpy()
        loss_bbox_data = loss_bbox.numpy()
        loss_data = loss.numpy()
        if cfg.use_data_parallel:
            # Multi-card: scale the loss, backprop, then all-reduce grads.
            loss = retinanet.scale_loss(loss)
            loss.backward()
            retinanet.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        if cfg.enable_ce:
            # CE/debug only: dump gradient checksums (abs-sum + shape)
            # for predictions and selected parameters.
            print('score_pred grad: {} {}'.format(
                abs(score_pred.gradient()).sum(),
                score_pred.gradient().shape))
            print('loc_pred grad: {} {}'.format(
                abs(loc_pred.gradient()).sum(),
                loc_pred.gradient().shape))
            for var in cls_pred_list:
                print('cls grad reshape: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            for var in bbox_pred_list:
                print('bbox grad reshape: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            for var in cls_score:
                print('cls grad original: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            for var in bbox_pred:
                print('bbox grad original: {} {}'.format(
                    abs(var.gradient()).sum(), var.gradient().shape))
            dy_grad_value = {}
            # Hand-picked layer weights to track across CE runs.
            for param in retinanet.parameters():
                if param.name == 'retnet_cls_conv_n3_fpn3/Conv2D_0.retnet_cls_conv_n3_fpn3_w' or \
                   param.name == 'retnet_cls_conv_n2_fpn3/Conv2D_0.retnet_cls_conv_n2_fpn3_w' or \
                   param.name == 'retnet_cls_conv_n1_fpn3/Conv2D_0.retnet_cls_conv_n1_fpn3_w' or \
                   param.name == 'retnet_cls_conv_n0_fpn3/Conv2D_0.retnet_cls_conv_n0_fpn3_w' or \
                   param.name == 'retnet_cls_pred_fpn3/Conv2D_0.retnet_cls_pred_fpn3_w' or \
                   param.name == 'conv1/Conv2D_0.conv1_weights':
                    # Reaches into dygraph internals (_ivar) to read the
                    # raw gradient tensor for this parameter.
                    np_array = np.array(param._ivar._grad_ivar().value()
                                        .get_tensor())
                    dy_grad_value[param.name + core.grad_var_suffix(
                    )] = [abs(np_array).sum(), np_array.shape]
                    np_array = np.array(param._ivar.value().get_tensor())
                    dy_grad_value[param.name] = [abs(np_array).sum(),
                                                 np_array.shape]
            for key, value in dy_grad_value.items():
                print('{key}: {value}'.format(key = key, value = value))
        # Reset accumulated gradients before the next batch.
        retinanet.clear_gradients()
        outs = [loss_data, loss_cls_data, loss_bbox_data]
        stats = {k: v.mean() for k, v in zip(keys, outs)}
        train_stats.update(stats)
        logs = train_stats.log()
        lr = optimizer._global_learning_rate().numpy()
        end_time = time.time()
        strs = '{}, iter: {}, lr: {} {}, time: {:.3f}'.format(
            now_time(), iter_id, lr, logs, end_time - start_time)
        print(strs)
        sys.stdout.flush()
        if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
            save_model(retinanet.state_dict(),
                       "model_iter{}".format(iter_id), optimizer)
        if (iter_id + 1) == cfg.max_iter:
            break