Example #1
 def train_loop_pyreader():
     py_reader.start()
     train_stats = TrainingStats(cfg.log_window, keys)
     try:
         start_time = time.time()
         first_start_time = start_time  # wall-clock time when the whole loop starts
         for iter_id in range(cfg.max_iter):
             prev_start_time = start_time
             start_time = time.time()
             outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
             stats = {
                 k: np.array(v).mean()
                 for k, v in zip(keys, outs[:-1])
             }
             train_stats.update(stats)
             logs = train_stats.log()
             strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                 now_time(), iter_id, np.mean(outs[-1]), logs,
                 start_time - prev_start_time)
             print(strs)
             sys.stdout.flush()
             if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                 save_model("model_iter{}".format(iter_id))
         end_time = time.time()
         total_time = end_time - first_start_time
         last_loss = np.array(outs[0]).mean()
     except (StopIteration, fluid.core.EOFException):
         py_reader.reset()
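
Note: every example in this listing relies on a TrainingStats helper (built from cfg.log_window and the fetched keys) and a now_time() timestamp function that are defined elsewhere in the source projects. A rough, hypothetical sketch of compatible implementations (a sliding-window average of the fetched metrics plus a formatted wall-clock string) is shown below; it is illustrative, not the projects' actual code.

import datetime
from collections import OrderedDict, deque

import numpy as np


def now_time():
    # Wall-clock timestamp used as the log prefix.
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')


class TrainingStats(object):
    """Keeps a sliding window per metric and reports window-averaged values."""

    def __init__(self, window_size, stats_keys):
        self.smoothed = OrderedDict(
            (k, deque(maxlen=window_size)) for k in stats_keys)

    def update(self, stats):
        # `stats` maps each key to the scalar fetched in the current iteration.
        for k, v in stats.items():
            self.smoothed[k].append(float(v))

    def log(self):
        # Render "key: value" pairs averaged over the window.
        return ', '.join('{}: {:.6f}'.format(k, np.mean(vals))
                         for k, vals in self.smoothed.items() if vals)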
Example #2
    def train_loop():
        start_time = time.time()
        prev_start_time = start_time
        start = start_time
        train_stats = TrainingStats(cfg.log_window, keys)
        every_pass_loss = []  # per-iteration loss values, averaged in the return below
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                                 feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            every_pass_loss.append(np.array(outs[0]).mean())
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        end_time = time.time()
        total_time = end_time - start
        last_loss = np.array(outs[0]).mean()
        # only for ce
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            loss = last_loss
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))

        return np.mean(every_pass_loss)
Example #3
 def train_loop():
     start_time = time.time()
     prev_start_time = start_time
     start = start_time
     train_stats = TrainingStats(cfg.log_window, keys)
     for iter_id in range(100000):  # generous upper bound; the loop exits at cfg.max_iter below
         data = next(data_generator)
         prev_start_time = start_time
         start_time = time.time()
         outs = exe.run(compiled_train_prog,
                        fetch_list=[v.name for v in fetch_list],
                        feed={"input_images": data[0],
                              "input_score_maps": data[2],
                              "input_geo_maps": data[3],
                              "input_training_masks": data[4]})
         stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
         train_stats.update(stats)
         logs = train_stats.log()
         strs = '{}, batch: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
             now_time(), iter_id,
             np.mean(outs[-1]), logs, start_time - prev_start_time)
         if iter_id % 10 == 0:
             print(strs)
         sys.stdout.flush()
         if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
             save_model(exe, "model_iter{}".format(iter_id), train_prog)
         if (iter_id + 1) == cfg.max_iter:
             break
     end_time = time.time()
     total_time = end_time - start
     last_loss = np.array(outs[0]).mean()
Example #4
    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            prev_start_time = start_time
            first_start_time = start_time  # wall-clock time when the whole loop starts
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))

                # profiler tools, used for benchmarking
                if args.is_profiler and iter_id == 10:
                    profiler.start_profiler("All")
                elif args.is_profiler and iter_id == 15:
                    profiler.stop_profiler("total", args.profiler_path)
                    return

            end_time = time.time()
            total_time = end_time - first_start_time
            last_loss = np.array(outs[0]).mean()
            if cfg.enable_ce:
                gpu_num = devices_num
                epoch_idx = iter_id + 1
                loss = last_loss
                print("kpis\teach_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()
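
The profiler calls in Example #4 assume that profiler refers to PaddlePaddle's built-in profiling module; under that assumption, the missing import would simply be:

# Profiler module shipped with PaddlePaddle 1.x fluid.
import paddle.fluid.profiler as profiler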
Example #5
 def train_loop():
     data_loader.start()
     train_stats = TrainingStats(cfg.log_window, keys)
     try:
         start_time = time.time()
         prev_start_time = start_time
         first_start_time = start_time  # wall-clock time when the whole loop starts
         for iter_id in range(cfg.max_iter):
             prev_start_time = start_time
             start_time = time.time()
             outs = exe.run(compiled_train_prog,
                            fetch_list=[v.name for v in fetch_list])
             stats = {
                 k: np.array(v).mean()
                 for k, v in zip(keys, outs[:-1])
             }
             train_stats.update(stats)
             logs = train_stats.log()
             if iter_id % 10 == 0:
                 strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                     now_time(), iter_id, np.mean(outs[-1]), logs,
                     start_time - prev_start_time)
                 print(strs)
             sys.stdout.flush()
             if iter_id % cfg.TRAIN.snapshot_iter == 0 and iter_id != 0:
                 save_name = "{}".format(iter_id)
                 checkpoint.save(
                     exe, train_prog,
                     os.path.join(cfg.model_save_dir, save_name))
             if (iter_id + 1) == cfg.max_iter:
                 checkpoint.save(
                     exe, train_prog,
                     os.path.join(cfg.model_save_dir, "model_final"))
                 break
         end_time = time.time()
         total_time = end_time - first_start_time
         last_loss = np.array(outs[0]).mean()
     except (StopIteration, fluid.core.EOFException):
         data_loader.reset()
Example #6
 def train_loop():
     train_stats = TrainingStats(cfg.log_window, keys)
     start_time = time.time()
     first_start_time = start_time  # wall-clock time when the whole loop starts
     for iter_id, data in enumerate(train_reader()):
         prev_start_time = start_time
         start_time = time.time()
         outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                              feed=feeder.feed(data))
         stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
         train_stats.update(stats)
         logs = train_stats.log()
         strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
             now_time(), iter_id, np.mean(outs[-1]), logs,
             start_time - prev_start_time)
         print(strs)
         sys.stdout.flush()
         if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
             save_model("model_iter{}".format(iter_id))
         if (iter_id + 1) == cfg.max_iter:
             break
     end_time = time.time()
     total_time = end_time - first_start_time
     last_loss = np.array(outs[0]).mean()
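
Several of the static-graph examples (Examples #1, #2, #4 and #6) call a project-specific save_model(postfix) helper. A minimal sketch compatible with that single-argument call, assuming module-level exe, train_prog and cfg objects like those used above and PaddlePaddle 1.x's fluid.io.save_persistables API, could look like this:

import os
import shutil

import paddle.fluid as fluid


def save_model(postfix):
    # Assumes module-level exe (Executor), train_prog (Program) and cfg,
    # as in the loops above; writes one snapshot directory per call.
    model_path = os.path.join(cfg.model_save_dir, postfix)
    if os.path.isdir(model_path):
        shutil.rmtree(model_path)
    fluid.io.save_persistables(exe, model_path, main_program=train_prog)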
Example #7
    def train_loop():
        keys = ['loss', 'loss_cls', 'loss_bbox']
        train_stats = TrainingStats(cfg.log_window, keys)

        retinanet.train()
        for iter_id, data in enumerate(train_reader()):
            start_time = time.time()

            gt_max_num = 0
            batch_size = len(data)
            # pad ground-truth arrays to the largest box count in the batch
            for x in data:
                if x[1].shape[0] > gt_max_num:
                    gt_max_num = x[1].shape[0]
            image_data = np.array(
                [x[0] for x in data]).astype('float32')
            if cfg.enable_ce:
                print('image: {} {}'.format(abs(image_data).sum(), image_data.shape))
            gt_box_data = np.zeros([batch_size, gt_max_num, 4])
            gt_label_data = np.zeros([batch_size, gt_max_num])
            is_crowd_data = np.ones([batch_size, gt_max_num])
            for batch_id, x in enumerate(data):
                gt_num = x[1].shape[0]
                gt_box_data[batch_id, 0:gt_num, :] = x[1]
                gt_label_data[batch_id, 0:gt_num] = x[2]
                is_crowd_data[batch_id, 0:gt_num] = x[3]
            gt_box_data = gt_box_data.astype('float32')
            gt_label_data = gt_label_data.astype('int32')
            is_crowd_data = is_crowd_data.astype('int32')
            im_info_data = np.array(
                [x[4] for x in data]).astype('float32')
            im_id_data = np.array(
                [x[5] for x in data]).astype('int32')
            outputs = retinanet('train', image_data, im_info_data,
                                gt_box_data, gt_label_data, is_crowd_data)
            loss_cls = outputs['loss_cls']
            loss_bbox = outputs['loss_bbox']
            loss = outputs['loss']
            score_pred = outputs['score_pred']
            loc_pred = outputs['loc_pred']
            cls_pred_list = outputs['cls_score_list']
            bbox_pred_list = outputs['bbox_pred_list']
            cls_score = outputs['cls_score']
            bbox_pred = outputs['bbox_pred']
            loss_cls_data = loss_cls.numpy()
            loss_bbox_data = loss_bbox.numpy()
            loss_data = loss.numpy()
            
            if cfg.use_data_parallel:
                loss = retinanet.scale_loss(loss)
                loss.backward()
                retinanet.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            if cfg.enable_ce:
                print('score_pred grad: {} {}'.format(abs(score_pred.gradient()).sum(), score_pred.gradient().shape))
                print('loc_pred grad: {} {}'.format(abs(loc_pred.gradient()).sum(), loc_pred.gradient().shape))
                for var in cls_pred_list:
                    print('cls grad reshape: {} {}'.format(abs(var.gradient()).sum(), var.gradient().shape))
                for var in bbox_pred_list:
                    print('bbox grad reshape: {} {}'.format(abs(var.gradient()).sum(), var.gradient().shape))
                for var in cls_score:
                    print('cls grad original: {} {}'.format(abs(var.gradient()).sum(), var.gradient().shape))
                for var in bbox_pred:
                    print('bbox grad original: {} {}'.format(abs(var.gradient()).sum(), var.gradient().shape))
                dy_grad_value = {}
                for param in retinanet.parameters():
                    if param.name in (
                            'retnet_cls_conv_n3_fpn3/Conv2D_0.retnet_cls_conv_n3_fpn3_w',
                            'retnet_cls_conv_n2_fpn3/Conv2D_0.retnet_cls_conv_n2_fpn3_w',
                            'retnet_cls_conv_n1_fpn3/Conv2D_0.retnet_cls_conv_n1_fpn3_w',
                            'retnet_cls_conv_n0_fpn3/Conv2D_0.retnet_cls_conv_n0_fpn3_w',
                            'retnet_cls_pred_fpn3/Conv2D_0.retnet_cls_pred_fpn3_w',
                            'conv1/Conv2D_0.conv1_weights'):
                        np_array = np.array(
                            param._ivar._grad_ivar().value().get_tensor())
                        dy_grad_value[param.name + core.grad_var_suffix()] = [
                            abs(np_array).sum(), np_array.shape
                        ]
                        np_array = np.array(param._ivar.value().get_tensor())
                        dy_grad_value[param.name] = [
                            abs(np_array).sum(), np_array.shape
                        ]
                for key, value in dy_grad_value.items():
                    print('{key}: {value}'.format(key=key, value=value))
            
            retinanet.clear_gradients()

            outs = [loss_data, loss_cls_data, loss_bbox_data]
            stats = {k: v.mean() for k, v in zip(keys, outs)}
            train_stats.update(stats)
            logs = train_stats.log()
            lr = optimizer._global_learning_rate().numpy()
            end_time = time.time()
            strs = '{}, iter: {}, lr: {} {}, time: {:.3f}'.format(
                now_time(), iter_id, lr,
                logs, end_time - start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model(retinanet.state_dict(), "model_iter{}".format(iter_id), optimizer)
            if (iter_id + 1) == cfg.max_iter:
                break
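
Example #7 runs in dygraph (imperative) mode and calls save_model(state_dict, postfix, optimizer). A hypothetical sketch compatible with that call, assuming PaddlePaddle 1.x dygraph and a cfg.model_save_dir setting:

import os

import paddle.fluid as fluid


def save_model(state_dict, postfix, optimizer=None):
    # fluid.dygraph.save_dygraph writes <path>.pdparams for parameter state
    # and <path>.pdopt for optimizer state, so one postfix covers both.
    path = os.path.join(cfg.model_save_dir, postfix)
    fluid.dygraph.save_dygraph(state_dict, path)
    if optimizer is not None:
        fluid.dygraph.save_dygraph(optimizer.state_dict(), path)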