Example #1
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else []

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.iter = iteration

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter - 1
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % 20 == 0 or iteration == max_iter - 1
            ):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
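
Every example on this page is a variation of the same core pattern. Below is a minimal sketch of that pattern in isolation, assuming only fvcore (which detectron2's checkpointing is built on) and PyTorch; the model and optimizer are toy stand-ins.

import os

import torch.nn as nn
import torch.optim as optim
from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer

model = nn.Linear(4, 2)  # stand-in model
optimizer = optim.SGD(model.parameters(), lr=0.01)

# The Checkpointer owns the model plus any extra checkpointables
# (optimizer, scheduler, ...) that should be saved and restored together.
os.makedirs("./output", exist_ok=True)
checkpointer = Checkpointer(model, "./output", optimizer=optimizer)
start_iter = checkpointer.resume_or_load("", resume=False).get("iteration", -1) + 1

max_iter = 10000
periodic_checkpointer = PeriodicCheckpointer(checkpointer, period=1000, max_iter=max_iter)

for iteration in range(start_iter, max_iter):
    # ... forward / backward / optimizer.step() ...
    # Saves model_0000999.pth, model_0001999.pth, ... every `period` iterations,
    # plus model_final.pth at the last iteration because max_iter was given.
    periodic_checkpointer.step(iteration)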
Example #2
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=cfg.SOLVER.BASE_LR)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = [
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR)
    ]
    
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            storage.put_scalars(total_loss=losses, **loss_dict)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                # NOTE: the MultiStepLR scheduler is stepped once per evaluation
                # period, so milestones=[10, 20] count evaluations, not iterations
                scheduler.step()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
Example #3
    def get_d2_periodic_checkpointer(self):
        """
        Usage:
            periodic_checkpointer.step(epoch, first_epoch=epoch)
            periodic_checkpointer.save(name='best', max_mIoU=max_mIoU)
        """
        periodic_checkpointer = PeriodicCheckpointer(
            self.checkpointer, period=self.period, max_iter=self.maxsize,
            max_to_keep=self.max_to_keep)
        return periodic_checkpointer
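
The docstring above relies on a detail worth calling out: extra keyword arguments passed to step() or save() are stored in the checkpoint alongside "iteration" and come back in the dict returned by resume_or_load(). A short sketch, reusing the names from the docstring and assuming the checkpointer's save_dir is output/:

periodic_checkpointer.step(iteration, epoch=epoch)          # extras stored with the periodic checkpoint
periodic_checkpointer.save(name="best", max_mIoU=max_mIoU)  # writes output/best.pth

# Extras come back in the dict returned on load:
ckpt = checkpointer.resume_or_load("output/best.pth", resume=False)
best_miou = ckpt.get("max_mIoU", 0.0)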
Example #4
    def do_train(self, cfg, args, model, optimizer, resume=False):
        model.train()

        # some basic settings =========================
        dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
        data_ref = ref.__dict__[dataset_meta.ref_key]
        obj_names = dataset_meta.objs

        # load data ===================================
        train_dset_names = cfg.DATASETS.TRAIN
        data_loader = build_gdrn_train_loader(cfg, train_dset_names)
        data_loader_iter = iter(data_loader)

        # load 2nd train dataloader if needed
        train_2_dset_names = cfg.DATASETS.get("TRAIN2", ())
        train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0)
        if train_2_ratio > 0.0 and len(train_2_dset_names) > 0:
            data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names)
            data_loader_2_iter = iter(data_loader_2)
        else:
            data_loader_2 = None
            data_loader_2_iter = None

        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        if isinstance(data_loader, AspectRatioGroupedDataset):
            dataset_len = len(data_loader.dataset.dataset)
            if data_loader_2 is not None:
                dataset_len += len(data_loader_2.dataset.dataset)
            iters_per_epoch = dataset_len // images_per_batch
        else:
            dataset_len = len(data_loader.dataset)
            if data_loader_2 is not None:
                dataset_len += len(data_loader_2.dataset)
            iters_per_epoch = dataset_len // images_per_batch
        max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch
        dprint("images_per_batch: ", images_per_batch)
        dprint("dataset length: ", dataset_len)
        dprint("iters per epoch: ", iters_per_epoch)
        dprint("total iters: ", max_iter)

        data_loader = self.setup_dataloaders(data_loader,
                                             replace_sampler=False,
                                             move_to_device=False)
        if data_loader_2 is not None:
            data_loader_2 = self.setup_dataloaders(data_loader_2,
                                                   replace_sampler=False,
                                                   move_to_device=False)

        scheduler = solver_utils.build_lr_scheduler(cfg,
                                                    optimizer,
                                                    total_iters=max_iter)

        # resume or load model ===================================
        extra_ckpt_dict = dict(
            optimizer=optimizer,
            scheduler=scheduler,
        )
        if hasattr(self._precision_plugin, "scaler"):
            extra_ckpt_dict["gradscaler"] = self._precision_plugin.scaler

        checkpointer = MyCheckpointer(
            model,
            cfg.OUTPUT_DIR,
            save_to_disk=self.is_global_zero,
            **extra_ckpt_dict,
        )
        start_iter = checkpointer.resume_or_load(
            cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1

        if cfg.SOLVER.CHECKPOINT_BY_EPOCH:
            ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch
        else:
            ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD
        periodic_checkpointer = PeriodicCheckpointer(
            checkpointer,
            ckpt_period,
            max_iter=max_iter,
            max_to_keep=cfg.SOLVER.MAX_TO_KEEP)

        # build writers ==============================================
        tbx_event_writer = self.get_tbx_event_writer(
            cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False))
        tbx_writer = tbx_event_writer._writer  # NOTE: we want to write some non-scalar data
        writers = ([
            MyCommonMetricPrinter(max_iter),
            MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")),
            tbx_event_writer
        ] if self.is_global_zero else [])

        # compared to "train_net.py", we do not support accurate timing and
        # precise BN here, because they are not trivial to implement
        logger.info("Starting training from iteration {}".format(start_iter))
        iter_time = None
        with EventStorage(start_iter) as storage:
            for iteration in range(start_iter, max_iter):
                storage.iter = iteration
                epoch = iteration // iters_per_epoch + 1  # one epoch = iters_per_epoch training iterations

                if np.random.rand() < train_2_ratio:
                    data = next(data_loader_2_iter)
                else:
                    data = next(data_loader_iter)

                if iter_time is not None:
                    storage.put_scalar("time", time.perf_counter() - iter_time)
                iter_time = time.perf_counter()

                # forward ============================================================
                batch = batch_data(cfg, data)

                out_dict, loss_dict = model(
                    batch["roi_img"],
                    gt_xyz=batch.get("roi_xyz", None),
                    gt_xyz_bin=batch.get("roi_xyz_bin", None),
                    gt_mask_trunc=batch["roi_mask_trunc"],
                    gt_mask_visib=batch["roi_mask_visib"],
                    gt_mask_obj=batch["roi_mask_obj"],
                    gt_region=batch.get("roi_region", None),
                    gt_allo_quat=batch.get("allo_quat", None),
                    gt_ego_quat=batch.get("ego_quat", None),
                    gt_allo_rot6d=batch.get("allo_rot6d", None),
                    gt_ego_rot6d=batch.get("ego_rot6d", None),
                    gt_ego_rot=batch.get("ego_rot", None),
                    gt_trans=batch.get("trans", None),
                    gt_trans_ratio=batch["roi_trans_ratio"],
                    gt_points=batch.get("roi_points", None),
                    sym_infos=batch.get("sym_info", None),
                    roi_classes=batch["roi_cls"],
                    roi_cams=batch["roi_cam"],
                    roi_whs=batch["roi_wh"],
                    roi_centers=batch["roi_center"],
                    resize_ratios=batch["resize_ratio"],
                    roi_coord_2d=batch.get("roi_coord_2d", None),
                    roi_extents=batch.get("roi_extent", None),
                    do_loss=True,
                )
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

                loss_dict_reduced = {
                    k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                if self.is_global_zero:
                    storage.put_scalars(total_loss=losses_reduced,
                                        **loss_dict_reduced)

                optimizer.zero_grad(set_to_none=True)
                self.backward(losses)
                optimizer.step()

                storage.put_scalar("lr",
                                   optimizer.param_groups[0]["lr"],
                                   smoothing_hint=False)
                scheduler.step()

                if (cfg.TEST.EVAL_PERIOD > 0
                        and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                        and iteration != max_iter - 1):
                    self.do_test(cfg, model, epoch=epoch, iteration=iteration)
                    # Compared to "train_net.py", the test results are not dumped to EventStorage
                    self.barrier()

                if iteration - start_iter > 5 and (
                    (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0
                        or iteration == max_iter - 1 or iteration < 100):
                    for writer in writers:
                        writer.write()
                    # visualize some images ========================================
                    if cfg.TRAIN.VIS_IMG:
                        with torch.no_grad():
                            vis_i = 0
                            roi_img_vis = batch["roi_img"][vis_i].cpu().numpy()
                            roi_img_vis = denormalize_image(
                                roi_img_vis, cfg).transpose(1, 2,
                                                            0).astype("uint8")
                            tbx_writer.add_image("input_image", roi_img_vis,
                                                 iteration)

                            out_coor_x = out_dict["coor_x"].detach()
                            out_coor_y = out_dict["coor_y"].detach()
                            out_coor_z = out_dict["coor_z"].detach()
                            out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y,
                                                   out_coor_z)

                            out_xyz_vis = out_xyz[vis_i].cpu().numpy(
                            ).transpose(1, 2, 0)
                            out_xyz_vis = get_emb_show(out_xyz_vis)
                            tbx_writer.add_image("out_xyz", out_xyz_vis,
                                                 iteration)

                            gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy(
                            ).transpose(1, 2, 0)
                            gt_xyz_vis = get_emb_show(gt_xyz_vis)
                            tbx_writer.add_image("gt_xyz", gt_xyz_vis,
                                                 iteration)

                            out_mask = out_dict["mask"].detach()
                            out_mask = get_out_mask(cfg, out_mask)
                            out_mask_vis = out_mask[vis_i, 0].cpu().numpy()
                            tbx_writer.add_image("out_mask", out_mask_vis,
                                                 iteration)

                            gt_mask_vis = batch["roi_mask"][vis_i].detach(
                            ).cpu().numpy()
                            tbx_writer.add_image("gt_mask", gt_mask_vis,
                                                 iteration)

                if (iteration + 1) % periodic_checkpointer.period == 0 or (
                        periodic_checkpointer.max_iter is not None and
                    (iteration + 1) >= periodic_checkpointer.max_iter):
                    if hasattr(optimizer,
                               "consolidate_state_dict"):  # for ddp_sharded
                        optimizer.consolidate_state_dict()
                periodic_checkpointer.step(iteration, epoch=epoch)
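
The self.backward, self.barrier, self.is_global_zero and _precision_plugin calls in Example #4 suggest it runs inside a Lightning Fabric-style wrapper that owns device placement and mixed precision. A minimal sketch of that division of labor, assuming Lightning's Fabric API (not part of the example itself, with toy stand-ins):

import torch
import torch.nn as nn
from lightning.fabric import Fabric

fabric = Fabric(accelerator="auto", devices=1)  # assumption: single device
fabric.launch()

model = nn.Linear(4, 2)                            # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = fabric.setup(model, optimizer)  # device placement + precision

batch = torch.randn(8, 4, device=fabric.device)    # stand-in batch
optimizer.zero_grad(set_to_none=True)
loss = model(batch).pow(2).mean()                  # stand-in loss
fabric.backward(loss)                              # replaces loss.backward(); owns AMP scaling
optimizer.step()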
Example #5
    def do_train(self, cfg, model, resume):
        add_print_flops_callback(cfg, model, disable_after_callback=True)

        optimizer = self.build_optimizer(cfg, model)
        scheduler = self.build_lr_scheduler(cfg, optimizer)

        checkpointer = self.build_checkpointer(
            cfg,
            model,
            save_dir=cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS,
                                                 resume=resume)
        start_iter = (checkpoint.get("iteration", -1)
                      if resume and checkpointer.has_checkpoint() else -1)
        # The checkpoint stores the training iteration that just finished, thus we start
        # at the next iteration (or iter zero if there's no checkpoint).
        start_iter += 1
        max_iter = cfg.SOLVER.MAX_ITER
        periodic_checkpointer = PeriodicCheckpointer(
            checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

        data_loader = self.build_detection_train_loader(cfg)

        def _get_model_with_abnormal_checker(model):
            if not cfg.ABNORMAL_CHECKER.ENABLED:
                return model

            tbx_writer = _get_tbx_writer(
                get_tensorboard_log_dir(cfg.OUTPUT_DIR))
            writers = abnormal_checker.get_writers(cfg, tbx_writer)
            checker = abnormal_checker.AbnormalLossChecker(start_iter, writers)
            ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker)
            return ret

        trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
            _get_model_with_abnormal_checker(model), data_loader, optimizer)
        trainer_hooks = [
            hooks.IterationTimer(),
            model_ema.EMAHook(cfg, model) if cfg.MODEL_EMA.ENABLED else None,
            self._create_after_step_hook(cfg, model, optimizer, scheduler,
                                         periodic_checkpointer),
            hooks.EvalHook(
                cfg.TEST.EVAL_PERIOD,
                lambda: self.do_test(cfg, model, train_iter=trainer.iter),
            ),
            kmeans_anchors.compute_kmeans_anchors_hook(self, cfg),
            self._create_qat_hook(cfg)
            if cfg.QUANTIZATION.QAT.ENABLED else None,
        ]

        if comm.is_main_process():
            tbx_writer = _get_tbx_writer(
                get_tensorboard_log_dir(cfg.OUTPUT_DIR))
            writers = [
                CommonMetricPrinter(max_iter),
                JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                tbx_writer,
            ]
            trainer_hooks.append(hooks.PeriodicWriter(writers))
        trainer.register_hooks(trainer_hooks)
        trainer.train(start_iter, max_iter)

        if hasattr(self, 'original_cfg'):
            table = get_cfg_diff_table(cfg, self.original_cfg)
            logger.info(
                "GeneralizeRCNN Runner ignoring training config change: \n" +
                table)
            trained_cfg = self.original_cfg.clone()
        else:
            trained_cfg = cfg.clone()
        with temp_defrost(trained_cfg):
            trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file()
        return {"model_final": trained_cfg}
Example #6
def do_relation_train(cfg, model, resume=False):
    model.train()
    # freeze everything, then re-enable only the configured trainable prefixes
    for name, param in model.named_parameters():
        param.requires_grad = False
    # embedding weights that stay frozen even when a trainable prefix matches them
    frozen_embed_weights = {
        "relation_heads.instance_head.semantic_embed.weight",
        "relation_heads.pair_head.semantic_embed.weight",
        "relation_heads.predicate_head.semantic_embed.weight",
        "relation_heads.triplet_head.ins_embed.weight",
        "relation_heads.triplet_head.pred_embed.weight",
        "relation_heads.subpred_head.sub_embed.weight",
        "relation_heads.subpred_head.pred_embed.weight",
        "relation_heads.predobj_head.pred_embed.weight",
        "relation_heads.predobj_head.obj_embed.weight",
    }
    for name, param in model.named_parameters():
        if any(name.startswith(trainable) for trainable in cfg.MODEL.TRAINABLE):
            param.requires_grad = True
        if name in frozen_embed_weights or name.startswith(
                "relation_heads.predicate_head.freq_bias.obj_baseline.weight"):
            param.requires_grad = False

    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    metrics_sum_dict = {
        'relation_cls_tp_sum': 0,
        'relation_cls_p_sum': 0.00001,
        'pred_class_tp_sum': 0,
        'pred_class_p_sum': 0.00001,
        'gt_class_tp_sum': 0,
        'gt_class_p_sum': 0.00001,
        'raw_pred_class_tp_sum': 0,
        'raw_pred_class_p_sum': 0.00001,
        'instance_tp_sum':0,
        'instance_p_sum': 0.00001,
        'instance_g_sum':0.00001,
        'subpred_tp_sum': 0,
        'subpred_p_sum': 0.00001,
        'subpred_g_sum': 0.00001,
        'predobj_tp_sum': 0,
        'predobj_p_sum': 0.00001,
        'predobj_g_sum': 0.00001,
        'pair_tp_sum':0,
        'pair_p_sum': 0.00001,
        'pair_g_sum':0.00001,
        'confidence_tp_sum': 0,
        'confidence_p_sum': 0.00001,
        'confidence_g_sum': 0.00001,
        'predicate_tp_sum': 0,
        'predicate_tp20_sum': 0,
        'predicate_tp50_sum': 0,
        'predicate_tp100_sum': 0,
        'predicate_p_sum': 0.00001,
        'predicate_p20_sum': 0.00001,
        'predicate_p50_sum': 0.00001,
        'predicate_p100_sum': 0.00001,
        'predicate_g_sum': 0.00001,
        'triplet_tp_sum': 0,
        'triplet_tp20_sum': 0,
        'triplet_tp50_sum': 0,
        'triplet_tp100_sum': 0,
        'triplet_p_sum': 0.00001,
        'triplet_p20_sum': 0.00001,
        'triplet_p50_sum': 0.00001,
        'triplet_p100_sum': 0.00001,
        'triplet_g_sum': 0.00001,
    }
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, metrics_sum_dict=metrics_sum_dict
    )
    start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    # state_dict=torch.load(cfg.MODEL.WEIGHTS).pop("model")
    # model.load_state_dict(state_dict,strict=False)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    # relation_cls_state_dict=torch.load(cfg.MODEL.WEIGHTS).pop("model")
    # for param in model.named_parameters():
    #     if param[0] not in relation_cls_state_dict:
    #         print(param[0])
    # model.load_state_dict(relation_cls_state_dict,strict=False)

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )
    metrics_pr_dict={}
    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    accumulated_losses = 0.0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()
            if True:  # placeholder for a disabled try/except block
                pred_instances, results_dict, losses_dict, metrics_dict = model(data,iteration,mode="relation",training=True)
                losses = sum(loss for loss in losses_dict.values())
                assert torch.isfinite(losses).all(), losses_dict
                #print(losses_dict)

                loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(losses_dict).items()}
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                accumulated_losses += losses_reduced
                if comm.is_main_process():
                    storage.put_scalars(accumulated_losses=accumulated_losses / (iteration - start_iter),
                                        total_loss=losses_reduced, **loss_dict_reduced)

                if 'relation_cls_tp' in metrics_dict:
                    metrics_sum_dict['relation_cls_tp_sum']+=metrics_dict['relation_cls_tp']
                    metrics_sum_dict['relation_cls_p_sum'] += metrics_dict['relation_cls_p']
                    metrics_pr_dict['relation_cls_precision'] = metrics_sum_dict['relation_cls_tp_sum'] / metrics_sum_dict['relation_cls_p_sum']
                if 'pred_class_tp' in metrics_dict:
                    metrics_sum_dict['pred_class_tp_sum']+=metrics_dict['pred_class_tp']
                    metrics_sum_dict['pred_class_p_sum'] += metrics_dict['pred_class_p']
                    metrics_pr_dict['pred_class_precision'] = metrics_sum_dict['pred_class_tp_sum'] / metrics_sum_dict['pred_class_p_sum']
                if 'raw_pred_class_tp' in metrics_dict:
                    metrics_sum_dict['raw_pred_class_tp_sum']+=metrics_dict['raw_pred_class_tp']
                    metrics_sum_dict['raw_pred_class_p_sum'] += metrics_dict['raw_pred_class_p']
                    metrics_pr_dict['raw_pred_class_precision'] = metrics_sum_dict['raw_pred_class_tp_sum'] / metrics_sum_dict['raw_pred_class_p_sum']
                if 'gt_class_tp' in metrics_dict:
                    metrics_sum_dict['gt_class_tp_sum']+=metrics_dict['gt_class_tp']
                    metrics_sum_dict['gt_class_p_sum'] += metrics_dict['gt_class_p']
                    metrics_pr_dict['gt_class_precision'] = metrics_sum_dict['gt_class_tp_sum'] / metrics_sum_dict['gt_class_p_sum']
                if 'instance_tp' in metrics_dict:
                    metrics_sum_dict['instance_tp_sum']+=metrics_dict['instance_tp']
                    metrics_sum_dict['instance_p_sum'] += metrics_dict['instance_p']
                    metrics_sum_dict['instance_g_sum'] += metrics_dict['instance_g']
                    metrics_pr_dict['instance_precision'] = metrics_sum_dict['instance_tp_sum'] / metrics_sum_dict['instance_p_sum']
                    metrics_pr_dict['instance_recall'] = metrics_sum_dict['instance_tp_sum'] / metrics_sum_dict['instance_g_sum']
                if 'subpred_tp' in metrics_dict:
                    metrics_sum_dict['subpred_tp_sum']+=metrics_dict['subpred_tp']
                    metrics_sum_dict['subpred_p_sum'] += metrics_dict['subpred_p']
                    metrics_sum_dict['subpred_g_sum'] += metrics_dict['subpred_g']
                    metrics_pr_dict['subpred_precision'] = metrics_sum_dict['subpred_tp_sum'] / metrics_sum_dict['subpred_p_sum']
                    metrics_pr_dict['subpred_recall'] = metrics_sum_dict['subpred_tp_sum'] / metrics_sum_dict['subpred_g_sum']
                if 'predobj_tp' in metrics_dict:
                    metrics_sum_dict['predobj_tp_sum']+=metrics_dict['predobj_tp']
                    metrics_sum_dict['predobj_p_sum'] += metrics_dict['predobj_p']
                    metrics_sum_dict['predobj_g_sum'] += metrics_dict['predobj_g']
                    metrics_pr_dict['predobj_precision'] = metrics_sum_dict['predobj_tp_sum'] / metrics_sum_dict['predobj_p_sum']
                    metrics_pr_dict['predobj_recall'] = metrics_sum_dict['predobj_tp_sum'] / metrics_sum_dict['predobj_g_sum']

                if 'pair_tp' in metrics_dict:
                    metrics_sum_dict['pair_tp_sum'] += metrics_dict['pair_tp']
                    metrics_sum_dict['pair_p_sum'] += metrics_dict['pair_p']
                    metrics_sum_dict['pair_g_sum'] += metrics_dict['pair_g']
                    metrics_pr_dict['pair_precision'] = metrics_sum_dict['pair_tp_sum'] / metrics_sum_dict['pair_p_sum']
                    metrics_pr_dict['pair_recall'] = metrics_sum_dict['pair_tp_sum'] / metrics_sum_dict['pair_g_sum']
                if 'confidence_tp' in metrics_dict:
                    metrics_sum_dict['confidence_tp_sum']+=metrics_dict['confidence_tp']
                    metrics_sum_dict['confidence_p_sum'] += metrics_dict['confidence_p']
                    metrics_sum_dict['confidence_g_sum'] += metrics_dict['confidence_g']
                    metrics_pr_dict['confidence_precision'] = metrics_sum_dict['confidence_tp_sum'] / metrics_sum_dict['confidence_p_sum']
                    metrics_pr_dict['confidence_recall'] = metrics_sum_dict['confidence_tp_sum'] / metrics_sum_dict['confidence_g_sum']
                if 'predicate_tp' in metrics_dict:
                    metrics_sum_dict['predicate_tp_sum']+=metrics_dict['predicate_tp']
                    metrics_sum_dict['predicate_tp20_sum'] += metrics_dict['predicate_tp20']
                    metrics_sum_dict['predicate_tp50_sum'] += metrics_dict['predicate_tp50']
                    metrics_sum_dict['predicate_tp100_sum'] += metrics_dict['predicate_tp100']
                    metrics_sum_dict['predicate_p_sum'] += metrics_dict['predicate_p']
                    metrics_sum_dict['predicate_p20_sum'] += metrics_dict['predicate_p20']
                    metrics_sum_dict['predicate_p50_sum'] += metrics_dict['predicate_p50']
                    metrics_sum_dict['predicate_p100_sum'] += metrics_dict['predicate_p100']
                    metrics_sum_dict['predicate_g_sum'] += metrics_dict['predicate_g']
                    metrics_pr_dict['predicate_precision'] = metrics_sum_dict['predicate_tp_sum'] / metrics_sum_dict['predicate_p_sum']
                    metrics_pr_dict['predicate_precision20'] = metrics_sum_dict['predicate_tp20_sum'] / metrics_sum_dict['predicate_p20_sum']
                    metrics_pr_dict['predicate_precision50'] = metrics_sum_dict['predicate_tp50_sum'] / metrics_sum_dict['predicate_p50_sum']
                    metrics_pr_dict['predicate_precision100'] = metrics_sum_dict['predicate_tp100_sum'] / metrics_sum_dict['predicate_p100_sum']
                    metrics_pr_dict['predicate_recall'] = metrics_sum_dict['predicate_tp_sum'] / metrics_sum_dict['predicate_g_sum']
                    metrics_pr_dict['predicate_recall20'] = metrics_sum_dict['predicate_tp20_sum'] / metrics_sum_dict['predicate_g_sum']
                    metrics_pr_dict['predicate_recall50'] = metrics_sum_dict['predicate_tp50_sum'] / metrics_sum_dict['predicate_g_sum']
                    metrics_pr_dict['predicate_recall100'] = metrics_sum_dict['predicate_tp100_sum'] / metrics_sum_dict['predicate_g_sum']
                if 'triplet_tp' in metrics_dict:
                    metrics_sum_dict['triplet_tp_sum'] += metrics_dict['triplet_tp']
                    metrics_sum_dict['triplet_tp20_sum'] += metrics_dict['triplet_tp20']
                    metrics_sum_dict['triplet_tp50_sum'] += metrics_dict['triplet_tp50']
                    metrics_sum_dict['triplet_tp100_sum'] += metrics_dict['triplet_tp100']
                    metrics_sum_dict['triplet_p_sum'] += metrics_dict['triplet_p']
                    metrics_sum_dict['triplet_p20_sum'] += metrics_dict['triplet_p20']
                    metrics_sum_dict['triplet_p50_sum'] += metrics_dict['triplet_p50']
                    metrics_sum_dict['triplet_p100_sum'] += metrics_dict['triplet_p100']
                    metrics_sum_dict['triplet_g_sum'] += metrics_dict['triplet_g']
                    metrics_pr_dict['triplet_precision'] = metrics_sum_dict['triplet_tp_sum'] / metrics_sum_dict['triplet_p_sum']
                    metrics_pr_dict['triplet_precision20'] = metrics_sum_dict['triplet_tp20_sum'] / metrics_sum_dict['triplet_p20_sum']
                    metrics_pr_dict['triplet_precision50'] = metrics_sum_dict['triplet_tp50_sum'] / metrics_sum_dict['triplet_p50_sum']
                    metrics_pr_dict['triplet_precision100'] = metrics_sum_dict['triplet_tp100_sum'] / metrics_sum_dict['triplet_p100_sum']
                    metrics_pr_dict['triplet_recall'] = metrics_sum_dict['triplet_tp_sum'] / metrics_sum_dict['triplet_g_sum']
                    metrics_pr_dict['triplet_recall20'] = metrics_sum_dict['triplet_tp20_sum'] / metrics_sum_dict['triplet_g_sum']
                    metrics_pr_dict['triplet_recall50'] = metrics_sum_dict['triplet_tp50_sum'] / metrics_sum_dict['triplet_g_sum']
                    metrics_pr_dict['triplet_recall100'] = metrics_sum_dict['triplet_tp100_sum'] / metrics_sum_dict['triplet_g_sum']

                storage.put_scalars(**metrics_pr_dict, smoothing_hint=False)

                optimizer.zero_grad()
                losses.backward()
                optimizer.step()
                storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
                scheduler.step()

                if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                    for writer in writers:
                        writer.write()
                periodic_checkpointer.step(iteration)
                torch.cuda.empty_cache()
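
The 1e-5 initializers in metrics_sum_dict above exist to keep the running precision/recall divisions from crashing before any predictions or ground truths have been counted. The pattern, reduced to its essentials (per_batch_counts is a hypothetical iterable of per-batch counts):

EPS = 1e-5  # denominator guard: early ratios read ~0.0 instead of raising ZeroDivisionError

tp_sum, p_sum, g_sum = 0, EPS, EPS
for tp, p, g in per_batch_counts:
    tp_sum += tp
    p_sum += p
    g_sum += g
    precision = tp_sum / p_sum  # safe even when nothing has been predicted yet
    recall = tp_sum / g_sum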
Example #7
def do_train(cfg, args):
    # fmt: off
    run_func = cfg.start.get('run_func', 'train_func')
    dataset_name = cfg.start.dataset_name
    IMS_PER_BATCH = cfg.start.IMS_PER_BATCH * comm.get_world_size()
    NUM_WORKERS = cfg.start.NUM_WORKERS
    dataset_mapper = cfg.start.dataset_mapper

    max_epoch = cfg.start.max_epoch
    checkpoint_period = cfg.start.checkpoint_period

    resume_cfg = get_attr_kwargs(cfg.start, 'resume_cfg', default=None)

    cfg.defrost()
    cfg.DATASETS.TRAIN = (dataset_name, )
    cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH
    cfg.DATALOADER.NUM_WORKERS = NUM_WORKERS
    cfg.freeze()
    # fmt: on

    # build dataset
    mapper = build_dataset_mapper(dataset_mapper)
    data_loader = build_detection_train_loader(cfg, mapper=mapper)
    metadata = MetadataCatalog.get(dataset_name)
    num_samples = metadata.get('num_samples')
    iter_every_epoch = num_samples // IMS_PER_BATCH
    max_iter = iter_every_epoch * max_epoch

    model = build_trainer(cfg=cfg,
                          args=args,
                          iter_every_epoch=iter_every_epoch,
                          batch_size=IMS_PER_BATCH,
                          max_iter=max_iter,
                          metadata=metadata,
                          max_epoch=max_epoch,
                          data_loader=data_loader)
    model.train()

    # optimizer = build_optimizer(cfg, model)
    optims_dict = model.build_optimizer()
    scheduler = model.build_lr_scheduler()

    checkpointer = DetectionCheckpointer(model.get_saved_model(),
                                         cfg.OUTPUT_DIR, **optims_dict,
                                         **scheduler)
    if resume_cfg and resume_cfg.resume:
        resume_ckpt_dir = model._get_ckpt_path(
            ckpt_dir=resume_cfg.ckpt_dir,
            ckpt_epoch=resume_cfg.ckpt_epoch,
            iter_every_epoch=resume_cfg.iter_every_epoch)
        start_iter = (
            checkpointer.resume_or_load(resume_ckpt_dir).get("iteration", -1) +
            1)
        if get_attr_kwargs(resume_cfg, 'finetune', default=False):
            start_iter = 0
        model.after_resume()
    else:
        start_iter = 0

    if run_func != 'train_func':
        # dispatch the requested entry point by name
        getattr(model, run_func)()
        exit(0)

    # checkpoint_period arrives as a config string; it is evaluated with
    # iter_every_epoch in scope, so it can be an expression over that value
    checkpoint_period = eval(checkpoint_period,
                             dict(iter_every_epoch=iter_every_epoch))
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 checkpoint_period,
                                                 max_iter=max_iter)
    logger.info("Starting training from iteration {}".format(start_iter))
    # modelarts_utils.modelarts_sync_results(args=myargs.args, myargs=myargs, join=True, end=False)
    with EventStorage(start_iter) as storage:
        pbar = zip(data_loader, range(start_iter, max_iter))
        if comm.is_main_process():
            pbar = tqdm.tqdm(
                pbar,
                desc=f'do_train, {args.tl_time_str}, '
                f'iters {iter_every_epoch} * bs {IMS_PER_BATCH} = '
                f'imgs {iter_every_epoch*IMS_PER_BATCH}',
                initial=start_iter,
                total=max_iter)

        for data, iteration in pbar:
            comm.synchronize()
            iteration = iteration + 1
            storage.step()

            model.train_func(data, iteration - 1, pbar=pbar)

            periodic_checkpointer.step(iteration)
    # modelarts_utils.modelarts_sync_results(args=myargs.args, myargs=myargs, join=True, end=True)
    comm.synchronize()
Example #8
def do_train(cfg, args, model, optimizer, resume=False):
    model.train()

    # some basic settings =========================
    dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
    data_ref = ref.__dict__[dataset_meta.ref_key]
    obj_names = dataset_meta.objs

    # load data ===================================
    train_dset_names = cfg.DATASETS.TRAIN
    data_loader = build_gdrn_train_loader(cfg, train_dset_names)
    data_loader_iter = iter(data_loader)

    # load 2nd train dataloader if needed
    train_2_dset_names = cfg.DATASETS.get("TRAIN2", ())
    train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0)
    if train_2_ratio > 0.0 and len(train_2_dset_names) > 0:
        data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names)
        data_loader_2_iter = iter(data_loader_2)
    else:
        data_loader_2 = None
        data_loader_2_iter = None

    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    if isinstance(data_loader, AspectRatioGroupedDataset):
        dataset_len = len(data_loader.dataset.dataset)
        if data_loader_2 is not None:
            dataset_len += len(data_loader_2.dataset.dataset)
        iters_per_epoch = dataset_len // images_per_batch
    else:
        dataset_len = len(data_loader.dataset)
        if data_loader_2 is not None:
            dataset_len += len(data_loader_2.dataset)
        iters_per_epoch = dataset_len // images_per_batch
    max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch
    dprint("images_per_batch: ", images_per_batch)
    dprint("dataset length: ", dataset_len)
    dprint("iters per epoch: ", iters_per_epoch)
    dprint("total iters: ", max_iter)
    scheduler = solver_utils.build_lr_scheduler(cfg,
                                                optimizer,
                                                total_iters=max_iter)

    AMP_ON = cfg.SOLVER.AMP.ENABLED
    logger.info(f"AMP enabled: {AMP_ON}")
    grad_scaler = GradScaler()

    # resume or load model ===================================
    checkpointer = MyCheckpointer(
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=scheduler,
        gradscaler=grad_scaler,
        save_to_disk=comm.is_main_process(),
    )
    start_iter = checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1

    if comm._USE_HVD:  # hvd may be not available, so do not use the one in args
        # not needed
        # start_iter = hvd.broadcast(torch.tensor(start_iter), root_rank=0, name="start_iter").item()

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        # Horovod: (optional) compression algorithm.
        compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            op=hvd.Adasum if args.use_adasum else hvd.Average,
            compression=compression,
        )  # device_dense='/cpu:0'

    if cfg.SOLVER.CHECKPOINT_BY_EPOCH:
        ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch
    else:
        ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer,
        ckpt_period,
        max_iter=max_iter,
        max_to_keep=cfg.SOLVER.MAX_TO_KEEP)

    # build writers ==============================================
    tbx_event_writer = get_tbx_event_writer(
        cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False))
    tbx_writer = tbx_event_writer._writer  # NOTE: we want to write some non-scalar data
    writers = ([
        MyCommonMetricPrinter(max_iter),
        MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")),
        tbx_event_writer
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    logger.info("Starting training from iteration {}".format(start_iter))
    iter_time = None
    with EventStorage(start_iter) as storage:
        # for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        for iteration in range(start_iter, max_iter):
            storage.iter = iteration
            epoch = iteration // iters_per_epoch + 1  # one epoch = iters_per_epoch training iterations

            if np.random.rand() < train_2_ratio:
                data = next(data_loader_2_iter)
            else:
                data = next(data_loader_iter)

            if iter_time is not None:
                storage.put_scalar("time", time.perf_counter() - iter_time)
            iter_time = time.perf_counter()

            # forward ============================================================
            batch = batch_data(cfg, data)
            with autocast(enabled=AMP_ON):
                out_dict, loss_dict = model(
                    batch["roi_img"],
                    gt_xyz=batch.get("roi_xyz", None),
                    gt_xyz_bin=batch.get("roi_xyz_bin", None),
                    gt_mask_trunc=batch["roi_mask_trunc"],
                    gt_mask_visib=batch["roi_mask_visib"],
                    gt_mask_obj=batch["roi_mask_obj"],
                    gt_region=batch.get("roi_region", None),
                    gt_allo_quat=batch.get("allo_quat", None),
                    gt_ego_quat=batch.get("ego_quat", None),
                    gt_allo_rot6d=batch.get("allo_rot6d", None),
                    gt_ego_rot6d=batch.get("ego_rot6d", None),
                    gt_ego_rot=batch.get("ego_rot", None),
                    gt_trans=batch.get("trans", None),
                    gt_trans_ratio=batch["roi_trans_ratio"],
                    gt_points=batch.get("roi_points", None),
                    sym_infos=batch.get("sym_info", None),
                    roi_classes=batch["roi_cls"],
                    roi_cams=batch["roi_cam"],
                    roi_whs=batch["roi_wh"],
                    roi_centers=batch["roi_center"],
                    resize_ratios=batch["resize_ratio"],
                    roi_coord_2d=batch.get("roi_coord_2d", None),
                    roi_extents=batch.get("roi_extent", None),
                    do_loss=True,
                )
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            if AMP_ON:
                grad_scaler.scale(losses).backward()

                # # Unscales the gradients of optimizer's assigned params in-place
                # grad_scaler.unscale_(optimizer)
                # # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
                # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
                if comm._USE_HVD:
                    optimizer.synchronize()
                    with optimizer.skip_synchronize():
                        grad_scaler.step(optimizer)
                        grad_scaler.update()
                else:
                    grad_scaler.step(optimizer)
                    grad_scaler.update()
            else:
                losses.backward()
                optimizer.step()

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                do_test(cfg, model, epoch=epoch, iteration=iteration)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0
                    or iteration == max_iter - 1 or iteration < 100):
                for writer in writers:
                    writer.write()
                # visualize some images ========================================
                if cfg.TRAIN.VIS_IMG:
                    with torch.no_grad():
                        vis_i = 0
                        roi_img_vis = batch["roi_img"][vis_i].cpu().numpy()
                        roi_img_vis = denormalize_image(roi_img_vis,
                                                        cfg).transpose(
                                                            1, 2,
                                                            0).astype("uint8")
                        tbx_writer.add_image("input_image", roi_img_vis,
                                             iteration)

                        out_coor_x = out_dict["coor_x"].detach()
                        out_coor_y = out_dict["coor_y"].detach()
                        out_coor_z = out_dict["coor_z"].detach()
                        out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y,
                                               out_coor_z)

                        out_xyz_vis = out_xyz[vis_i].cpu().numpy().transpose(
                            1, 2, 0)
                        out_xyz_vis = get_emb_show(out_xyz_vis)
                        tbx_writer.add_image("out_xyz", out_xyz_vis, iteration)

                        gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy(
                        ).transpose(1, 2, 0)
                        gt_xyz_vis = get_emb_show(gt_xyz_vis)
                        tbx_writer.add_image("gt_xyz", gt_xyz_vis, iteration)

                        out_mask = out_dict["mask"].detach()
                        out_mask = get_out_mask(cfg, out_mask)
                        out_mask_vis = out_mask[vis_i, 0].cpu().numpy()
                        tbx_writer.add_image("out_mask", out_mask_vis,
                                             iteration)

                        gt_mask_vis = batch["roi_mask"][vis_i].detach().cpu(
                        ).numpy()
                        tbx_writer.add_image("gt_mask", gt_mask_vis, iteration)
            periodic_checkpointer.step(iteration, epoch=epoch)
Example #9
model = build_model(cfg)
logger.info("Model:\n{}".format(model))
model.train()
optimizer = build_optimizer(cfg, model)
scheduler = build_lr_scheduler(cfg, optimizer)

checkpointer = DetectionCheckpointer(
    model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
)
start_iter = (
    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=False).get("iteration", -1) + 1
)
max_iter = cfg.SOLVER.MAX_ITER

periodic_checkpointer = PeriodicCheckpointer(
    checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
)

writers = (
    [
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ]
    if comm.is_main_process()
    else []
)

# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
Example #10
def do_train(cfg_source, cfg_target, model, resume=False):

    model.train()
    print(model)

    optimizer = build_optimizer(cfg_source, model)
    scheduler = build_lr_scheduler(cfg_source, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg_source.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg_source.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg_source.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg_source.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg_source.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg_source.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    i = 1
    max_epoch = 41.27  # max iter / min(data_len(data_source, data_target))
    current_epoch = 0
    data_len = 1502

    alpha3 = 0
    alpha4 = 0
    alpha5 = 0

    data_loader_source = build_detection_train_loader(cfg_source)
    data_loader_target = build_detection_train_loader(cfg_target)
    logger.info("Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data_source, data_target, iteration in zip(
                data_loader_source, data_loader_target,
                range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            if (iteration % data_len) == 0:
                current_epoch += 1
                i = 1

            p = float(i + current_epoch * data_len) / max_epoch / data_len
            alpha = 2. / (1. + np.exp(-10 * p)) - 1
            i += 1

            alpha3 = alpha
            alpha4 = alpha
            alpha5 = alpha

            if alpha3 > 0.5:
                alpha3 = 0.5

            if alpha4 > 0.5:
                alpha4 = 0.5

            if alpha5 > 0.1:
                alpha5 = 0.1

            loss_dict = model(data_source, False, alpha3, alpha4, alpha5)
            loss_dict_target = model(data_target, True, alpha3, alpha4, alpha5)
            loss_dict["loss_r3"] += loss_dict_target["loss_r3"]
            loss_dict["loss_r4"] += loss_dict_target["loss_r4"]
            loss_dict["loss_r5"] += loss_dict_target["loss_r5"]

            loss_dict["loss_r3"] *= 0.5
            loss_dict["loss_r4"] *= 0.5
            loss_dict["loss_r5"] *= 0.5

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
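
The alpha schedule in Example #10 is the standard DANN-style gradient-reversal ramp: p runs from 0 to 1 over training, alpha rises smoothly from 0 toward 1, and each head's alpha is then clamped. A compact equivalent of the in-loop arithmetic above (i, current_epoch, data_len and max_epoch as defined in the example):

import numpy as np

def dann_alpha(p: float) -> float:
    """Gradient-reversal weight: 0 at p=0, approaching 1 as p -> 1."""
    return 2.0 / (1.0 + np.exp(-10.0 * p)) - 1.0

p = float(i + current_epoch * data_len) / max_epoch / data_len
alpha3 = min(dann_alpha(p), 0.5)
alpha4 = min(dann_alpha(p), 0.5)
alpha5 = min(dann_alpha(p), 0.1)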
Example #11
    def do_train(self, cfg, model, resume):
        # Note that flops at the beginning of training is often inaccurate,
        # if a model has input-dependent logic
        attach_profilers(cfg, model)

        optimizer = self.build_optimizer(cfg, model)
        scheduler = self.build_lr_scheduler(cfg, optimizer)

        checkpointer = self.build_checkpointer(
            cfg,
            model,
            save_dir=cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS,
                                                 resume=resume)
        start_iter = (checkpoint.get("iteration", -1)
                      if resume and checkpointer.has_checkpoint() else -1)
        # The checkpoint stores the training iteration that just finished, thus we start
        # at the next iteration (or iter zero if there's no checkpoint).
        start_iter += 1
        max_iter = cfg.SOLVER.MAX_ITER
        periodic_checkpointer = PeriodicCheckpointer(
            checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

        data_loader = self.build_detection_train_loader(cfg)

        def _get_model_with_abnormal_checker(model):
            if not cfg.ABNORMAL_CHECKER.ENABLED:
                return model

            tbx_writer = self.get_tbx_writer(cfg)
            writers = abnormal_checker.get_writers(cfg, tbx_writer)
            checker = abnormal_checker.AbnormalLossChecker(start_iter, writers)
            ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker)
            return ret

        trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
            _get_model_with_abnormal_checker(model), data_loader, optimizer)
        trainer_hooks = self._get_trainer_hooks(cfg, model, optimizer,
                                                scheduler,
                                                periodic_checkpointer, trainer)

        if comm.is_main_process():
            tbx_writer = self.get_tbx_writer(cfg)
            writers = [
                CommonMetricPrinter(max_iter),
                JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                tbx_writer,
            ]
            trainer_hooks.append(hooks.PeriodicWriter(writers))
        update_hooks_from_registry(trainer_hooks)
        trainer.register_hooks(trainer_hooks)
        trainer.train(start_iter, max_iter)

        if hasattr(self, "original_cfg"):
            table = get_cfg_diff_table(cfg, self.original_cfg)
            logger.info(
                "GeneralizeRCNN Runner ignoring training config change: \n" +
                table)
            trained_cfg = self.original_cfg.clone()
        else:
            trained_cfg = cfg.clone()
        with temp_defrost(trained_cfg):
            trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file()
        return {"model_final": trained_cfg}
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get(
            "iteration", -1) +
        1  #FIXME: does not continue from iteration # when resume=True
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # init best monitor metric
    best_monitor_metric = None

    # init early stopping count
    es_count = 0

    # get train data loader
    data_loader = build_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.step()

            _, losses, losses_reduced = get_loss(data, model)
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                results = do_test(cfg, model)
                storage.put_scalars(**results['metrics'])

                if cfg.EARLY_STOPPING.ENABLE:
                    curr = None
                    if cfg.EARLY_STOPPING.MONITOR in results['metrics'].keys():
                        curr = results['metrics'][cfg.EARLY_STOPPING.MONITOR]

                    if curr is None:
                        logger.warning(
                            "Early stopping enabled but cannot find metric: %s"
                            % cfg.EARLY_STOPPING.MONITOR)
                        logger.warning(
                            "Options for monitored metrics are: [%s]" %
                            ", ".join(map(str, results['metrics'].keys())))
                    elif best_monitor_metric is None:
                        best_monitor_metric = curr
                    elif get_es_result(cfg.EARLY_STOPPING.MODE, curr,
                                       best_monitor_metric):
                        best_monitor_metric = curr
                        es_count = 0
                        logger.info("Best metric %s improved to %0.4f" %
                                    (cfg.EARLY_STOPPING.MONITOR, curr))
                        # update best model
                        periodic_checkpointer.save("model_best",
                                                   **results['metrics'])
                        # save best metrics to a .txt file
                        with open(
                                os.path.join(cfg.OUTPUT_DIR,
                                             'best_metrics.txt'), 'w') as f:
                            json.dump(results['metrics'], f)
                    else:
                        logger.info(
                            "Early stopping metric %s did not improve, current %.04f, best %.04f"
                            % (cfg.EARLY_STOPPING.MONITOR, curr,
                               best_monitor_metric))
                        es_count += 1

                storage.put_scalar('val_loss', results['metrics']['val_loss'])

                comm.synchronize()

            if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0
                                               or iteration == max_iter - 1):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            if es_count >= cfg.EARLY_STOPPING.PATIENCE:
                logger.info(
                    "Early stopping triggered, metric %s has not improved for %s validation steps"
                    %
                    (cfg.EARLY_STOPPING.MONITOR, cfg.EARLY_STOPPING.PATIENCE))
                break
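This example calls two helpers, get_loss and get_es_result, that are not shown here. A minimal sketch of what they might look like, assuming get_loss performs the usual forward pass plus cross-worker loss reduction and get_es_result is a mode-aware comparison (the names come from the call sites above; the bodies are guesses):

import torch
from detectron2.utils import comm

def get_loss(data, model):
    # Hypothetical helper: forward pass plus the loss reduction used elsewhere.
    loss_dict = model(data)
    losses = sum(loss_dict.values())
    assert torch.isfinite(losses).all(), loss_dict
    loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
    losses_reduced = sum(loss_dict_reduced.values())
    return loss_dict_reduced, losses, losses_reduced

def get_es_result(mode, current, best):
    # Hypothetical helper: True when `current` improves on `best`.
    return current > best if mode == "max" else current < best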
Exemplo n.º 13
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(
                cfg.MODEL.WEIGHTS, resume=resume,
            ).get("iteration", -1) + 1
    )
    if cfg.SOLVER.RESET_ITER:
        logger.info('Reset loaded iteration. Start training from iteration 0.')
        start_iter = 0

    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )


    if cfg.MULTI_DATASET.ENABLED:
        data_loader = build_multi_dataset_train_loader(cfg)
        dataset_count = {k: torch.tensor(0).to(comm.get_local_rank()) for k in cfg.MULTI_DATASET.DATASETS}
    else:
        data_loader = build_custom_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        step_timer = Timer()
        data_timer = Timer()
        start_time = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            data_time = data_timer.seconds()
            storage.put_scalars(data_time=data_time)
            step_timer.reset()
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)

            losses = sum(
                loss for k, loss in loss_dict.items())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() \
                for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(
                    total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar(
                "lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            if cfg.MULTI_DATASET.ENABLED:
                for b in data:
                    dataset_count[cfg.MULTI_DATASET.DATASETS[b['dataset_source']]] += 1
                dataset_count_reduced = {k: v for k, v in \
                    comm.reduce_dict(dataset_count).items()}
                if comm.is_main_process():
                    storage.put_scalars(**dataset_count_reduced)
            step_time = step_timer.seconds()
            storage.put_scalars(time=step_time)
            data_timer.reset()
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and \
                (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

        total_time = time.perf_counter() - start_time
        logger.info(
            "Total training time: {}".format(
                str(datetime.timedelta(seconds=int(total_time)))))
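One caveat for the dataset-count bookkeeping above: detectron2's comm.reduce_dict averages tensor values across workers by default. If dataset_count is meant to be a global sum over all workers, the call would need average=False; a sketch with the same variables as above:

# reduce_dict(..., average=False) sums instead of averaging across workers.
dataset_count_reduced = {
    k: v.item()
    for k, v in comm.reduce_dict(dataset_count, average=False).items()
}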
Exemplo n.º 14
0
def do_train(cfg, model, resume=False, patience=20):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    scheduler2 = ReduceLROnPlateau(optimizer, mode="max")

    # warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=200)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    best_ap50 = 0
    best_iteration = 0
    patience_counter = 0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()
            # warmup_scheduler.dampen(iteration)

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                test_results = do_test(cfg, model)

                # scheduler2.step(test_results["bbox"]["AP50"])
                # early stopping.

                # save checkpoint to disk

                checkpointer.save(f"model_{iteration}")

                # TODO: restore from best model
                if test_results["bbox"]["AP50"] > best_ap50:
                    best_ap50 = test_results["bbox"]["AP50"]
                    best_iteration = iteration
                    # reset patience counter
                    patience_counter = 0
                    logger.info(f"Patience counter reset.")
                else:
                    patience_counter += 1
                    logger.info(
                        f"Patience counter increased to {patience_counter}, will be terminated at {patience}"
                    )
                    if patience_counter > patience:
                        for writer in writers:
                            writer.write()
                        # restore to best checkpoint

                        checkpointer.load(
                            f"{cfg.OUTPUT_DIR}/model_{best_iteration}.pth")

                        break
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0
                                               or iteration == max_iter - 1):
                for writer in writers:
                    writer.write()
            # periodic_checkpointer.step(iteration)
        checkpointer.save(f"model_final")
Exemplo n.º 15
0
def do_train(cfg, model, resume=False, evaluate=False):
    """
    training loop.
    """

    # Build optimizer and scheduler from configuration and model
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # Build checkpointers
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    # Build writers
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # Build dataloader
    data_loader = build_classification_train_loader(cfg)

    # training loop
    validation_losses = []
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        start = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):

            data_time = time.perf_counter() - start
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)

            # compute losses
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalar("data_time", data_time)
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # backward
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            #validation
            if ((cfg.TEST.EVAL_PERIOD > 0
                 and iteration % cfg.TEST.EVAL_PERIOD == 0)
                    or (iteration == max_iter)):
                # evaluate on the validation dataset
                res = do_test(cfg, model, evaluate=evaluate)
                validation = {}
                for k, v in res.items():
                    print(v, flush=True)
                    validation[k] = v['loss_cls']
                    storage.put_scalars(
                        **validation
                    )  # dump also validation loss into Tensorboard
                    validation['iteration'] = iteration
                validation_losses.append(validation)

            # logging/checkpoint
            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            #Try to get an accurate measuremetn of time
            start = time.perf_counter()

    # save validations metrics
    if evaluate:
        print(validation_losses, flush=True)
        file_path = os.path.join(cfg.OUTPUT_DIR, "validations_losses.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(validation_losses, f)
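The saved list can later be read back the same way, e.g. for plotting (a sketch, assuming the same PathManager import as above):

file_path = os.path.join(cfg.OUTPUT_DIR, "validations_losses.pth")
with PathManager.open(file_path, "rb") as f:
    validation_losses = torch.load(f)  # list of {metric: loss, 'iteration': it} dicts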
Exemplo n.º 16
0
def do_train(cfg, model, resume=False):
    """
    Minimal detectron2 training loop: builds the optimizer, scheduler,
    checkpointers, metric writers and data loader from the config, then
    trains to cfg.SOLVER.MAX_ITER with periodic evaluation and checkpointing.
    """
    # Set the model to train
    model.train()

    # Create torch optimiser & schedulars
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # Create a torch checkpointer
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )

    # Create starting checkpoint i.e. pre-trained model using weights from config
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )

    # Define the number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # Create a periodic checkpointer at the configured period
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    # Export checkpoint data to terminal, JSON & tensorboard files
    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # Create a data loader to supply the model with training data
    data_loader = build_detection_train_loader(cfg)

    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)
          
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # If eval period has been set, run test at defined interval
            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                logger.debug('Logging iteration and loss to Weights & Biases')
                wandb.log({"iteration": iteration})
                wandb.log({"total_loss": losses_reduced})
                wandb.log(loss_dict_reduced)

                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
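The three consecutive wandb.log calls above each advance the Weights & Biases step counter. If the values should land on a single step aligned with the trainer iteration, they can be merged into one call (same variables as above):

wandb.log(
    {"iteration": iteration, "total_loss": losses_reduced, **loss_dict_reduced},
    step=iteration,  # pin the W&B step to the trainer iteration
)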
Exemplo n.º 17
0
def do_train(cfg, model, resume=False):
    # Set model to training mode
    model.train()
    # Create optimizer from config file (returns torch.nn.optimizer.Optimizer)
    optimizer = build_optimizer(cfg, model)
    # Create scheduler for learning rate (returns torch.optim.lr._LR_scheduler)
    scheduler = build_lr_scheduler(cfg, optimizer)
    print(f"Scheduler: {scheduler}")

    # Create checkpointer
    checkpointer = DetectionCheckpointer(model,
                                         save_dir=cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    # Create start iteration (references checkpointer) - https://detectron2.readthedocs.io/modules/checkpoint.html#detectron2.checkpoint.Checkpointer.resume_or_load
    start_iter = (
        # This can be 0
        checkpointer.resume_or_load(
            cfg.MODEL.WEIGHTS,  # Use predefined model weights (pretrained model)
            resume=resume).get("iteration", -1) + 1)
    # Set max number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # Create periodic checkpointer
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer=checkpointer,
        # How often to make checkpoints?
        period=cfg.SOLVER.CHECKPOINT_PERIOD,
        max_iter=max_iter)

    # Create writers (for logging metrics, not for saving checkpoints)
    writers = ([
        # Print out common metrics such as iteration time, ETA, memory, all losses, learning rate
        CommonMetricPrinter(max_iter=max_iter),
        # Write scalars to a JSON file such as loss values, time and more
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        # Write all scalars such as loss values to a TensorBoard file for easy visualization
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    ### Original note from script: ###
    # compared to "train_net.py", we do not support accurate timing and precise BN
    # here, because they are not trivial to implement

    # Build a training data loader based off the training dataset name in the config
    data_loader = build_detection_train_loader(cfg)

    # Start logging
    logger.info("Starting training from iteration {}".format(start_iter))

    # Store events
    with EventStorage(start_iter) as storage:
        # Loop through zipped data loader and iteration
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            # update storage with step - https://detectron2.readthedocs.io/modules/utils.html#detectron2.utils.events.EventStorage.step
            storage.step()

            # Run the model on the data to get a dictionary of losses
            loss_dict = model(data)
            losses = sum(loss_dict.values())
            # Are losses infinite? If so, something is wrong
            assert torch.isfinite(losses).all(), loss_dict

            # Reduce (average) the loss dict across workers so distributed logging is consistent
            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            # Sum up losses
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            # # TODO: wandb.log()? log the losses
            # wandb.log({
            #         "Total loss": losses_reduced
            # })

            # Update storage
            if comm.is_main_process():
                # Store information in storage - https://detectron2.readthedocs.io/modules/utils.html#detectron2.utils.events.EventStorage.put_scalars
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # Start doing PyTorch things
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            # Add learning rate to storage information
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            # This is required for your learning rate to change!!!! (not having this meant my learning rate was staying at 0)
            scheduler.step()

            # Perform evaluation?
            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # TODO - compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            # Log different metrics with writers
            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()

            # Update the periodic_checkpointer
            periodic_checkpointer.step(iteration)
Exemplo n.º 18
0
def do_train(cfg, model, resume=False, val_set='firevysor_val'):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, min_lr=1e-6)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1, last_epoch=-1)
    metric = 0
    print_every = 50

    tensorboard_dir = osp.join(cfg.OUTPUT_DIR, 'tensorboard')
    checkpoint_dir = osp.join(cfg.OUTPUT_DIR, 'checkpoints')
    create_dir(tensorboard_dir)
    create_dir(checkpoint_dir)

    checkpointer = AdetCheckpointer(model,
                                    checkpoint_dir,
                                    optimizer=optimizer,
                                    scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        # JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(tensorboard_dir),
    ] if comm.is_main_process() else [])
    data_loader = build_detection_train_loader(cfg)
    val_dataloader = build_detection_val_loader(cfg, val_set)

    logger.info("Starting training from iteration {}".format(start_iter))

    # [PHAT]: Create a log file
    log_file = open(cfg.MY_CUSTOM.LOG_FILE, 'w')

    best_loss = 1e6
    count_not_improve = 0
    train_size = 2177
    epoch_size = int(train_size / cfg.SOLVER.IMS_PER_BATCH)
    n_early_epoch = 10

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())

            assert torch.isfinite(losses).all(), loss_dict

            # Update loss dict
            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # Early stopping
            if (iteration > start_iter) and ((iteration - start_iter) %
                                             epoch_size == 0):
                val_loss = do_val(cfg, model, val_dataloader)

                if val_loss >= best_loss:
                    count_not_improve += 1
                    # stop if the model doesn't improve after <n_early_epoch> epochs
                    if count_not_improve >= n_early_epoch:
                        break
                else:
                    count_not_improve = 0
                    best_loss = val_loss
                    periodic_checkpointer.save("best_model_early")

                # print(f"epoch {iteration//epoch_size}, val_loss: {val_loss}")
                log_file.write(
                    f"Epoch {(iteration-start_iter)//epoch_size}, val_loss: {val_loss}\n"
                )
                comm.synchronize()

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            lr = optimizer.param_groups[0]["lr"]
            storage.put_scalar("lr", lr, smoothing_hint=False)
            scheduler.step()

            if iteration - start_iter > 5 and (
                (iteration - start_iter) % print_every == 0
                    or iteration == max_iter):
                for writer in writers:
                    writer.write()

                # Write my log
                log_file.write(
                    f"[iter {iteration}, best_loss: {best_loss}] total_loss: {losses}, lr: {lr}\n"
                )

            periodic_checkpointer.step(iteration)

    log_file.close()
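train_size is hard-coded to 2177 above; assuming the training set is registered with detectron2, the epoch size could instead be derived from the catalog:

from detectron2.data import DatasetCatalog

# Number of training images, read from the registered dataset instead of a constant.
train_size = len(DatasetCatalog.get(cfg.DATASETS.TRAIN[0]))
epoch_size = max(1, train_size // cfg.SOLVER.IMS_PER_BATCH)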
Exemplo n.º 19
0
def do_train(cfg, model, cat_heatmap_file, resume=False):
    model.train()

    # select optimizer and learning rate scheduler based on the config
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # create checkpointer
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    # create output writers. Separate TensorBoard writers are created
    # for train and validation sets. This allows easy overlaying of graphs
    # in TensorBoard.
    train_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'train')
    val_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'val')
    train_writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(train_tb_writer),
        ]
        if comm.is_main_process()
        else []
    )
    val_writers = [TensorboardXWriter(val_tb_writer)]


    train_dataset_name = cfg.DATASETS.TRAIN[0]
    train_data_loader = build_detection_train_loader(cfg)
    train_eval_data_loader = build_detection_test_loader(cfg, train_dataset_name)
    val_dataset_name = cfg.DATASETS.TEST[0]
    val_eval_data_loader = build_detection_test_loader(cfg, val_dataset_name, DatasetMapper(cfg,True))
    logger.info("Starting training from iteration {}".format(start_iter))
    train_storage = EventStorage(start_iter)
    val_storage = EventStorage(start_iter)

    # Create the training and validation evaluator objects.
    train_evaluator = get_evaluator(
        cfg, train_dataset_name, os.path.join(cfg.OUTPUT_DIR, "train_inference", train_dataset_name),
        cat_heatmap_file
    )
    val_evaluator = get_evaluator(
        cfg, val_dataset_name, os.path.join(cfg.OUTPUT_DIR, "val_inference", val_dataset_name),
        cat_heatmap_file
    )

    # initialize the best AP50 value
    best_AP50 = 0
    start_time = time.time()
    for train_data, iteration in zip(train_data_loader, range(start_iter, max_iter)):
         # stop if the file stop_running exists in the running directory
         if os.path.isfile('stop_running'):
             os.remove('stop_running')
             break

         iteration = iteration + 1

         # run a step with the training data
         with train_storage as storage:
            model.train()
            storage.step()

            loss_dict = model(train_data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()


            # periodically evaluate the training set and write the results
            if (cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter):

                train_eval_results = inference_on_dataset(model, train_eval_data_loader,
                                                          train_evaluator)
                flat_results = flatten_results(train_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in train_writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

         # run a step with the validation set
         with val_storage as storage:
            storage.step()

            # every 20 iterations evaluate the dataset to collect the loss
            if iteration % 20 == 0 or iteration == max_iter:
                with torch.set_grad_enabled(False):
                     for inputs, _ in zip(val_eval_data_loader, range(1)):
                        loss_dict = model(inputs)
                        losses = sum(loss for loss in loss_dict.values())
                        assert torch.isfinite(losses).all(), loss_dict

                        loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
                        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

                if comm.is_main_process():
                    storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            # periodically evaluate the validation set and write the results
            # check the results against the best results seen and save the parameters for
            # the best result
            if (cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                or iteration == max_iter):
                val_eval_results = inference_on_dataset(model, val_eval_data_loader,
                                                        val_evaluator)
                logger.info('val_eval_results {}'.format(val_eval_results))
                results = val_eval_results.get('segm', None)
                if results is None:
                    results = val_eval_results.get('bbox', None)
                if results is not None and results.get('AP50', -1) > best_AP50:
                    best_AP50 = results['AP50']
                    logger.info('saving best results ({}), iter {}'.format(best_AP50, iteration))
                    checkpointer.save("best_AP50")

                flat_results = flatten_results(val_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0):
                for writer in val_writers:
                    writer.write()
                elapsed = time.time() - start_time
                time_per_iter = elapsed / (iteration - start_iter)
                time_left = time_per_iter * (max_iter - iteration)
                logger.info("ETA: {}".format(str(datetime.timedelta(seconds=time_left))))
Exemplo n.º 20
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # dataset, mapper, augmentations and sampler are all set up while building the data_loader
    atoms = generate_atom_list(cfg, True)
    black_magic_mapper = BlackMagicMapper(cfg,
                                          is_train=True,
                                          augmentations=atoms)
    data_loader = build_detection_train_loader(cfg, black_magic_mapper)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            if cfg.DATALOADER.SAVE_BLACK_MAGIC_PATH != "":
                save_data_to_disk(cfg, data)
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
Exemplo n.º 21
0
def do_train(cfg, args, myargs):
    run_func = cfg.start.get('run_func', 'train_func')
    dataset_name = cfg.start.dataset_name
    IMS_PER_BATCH = cfg.start.IMS_PER_BATCH
    max_epoch = cfg.start.max_epoch
    ASPECT_RATIO_GROUPING = cfg.start.ASPECT_RATIO_GROUPING
    NUM_WORKERS = cfg.start.NUM_WORKERS
    checkpoint_period = cfg.start.checkpoint_period
    dataset_mapper = cfg.start.dataset_mapper
    resume_ckpt_dir = get_attr_kwargs(cfg.start,
                                      'resume_ckpt_dir',
                                      default=None)
    resume_ckpt_epoch = get_attr_kwargs(cfg.start,
                                        'resume_ckpt_epoch',
                                        default=0)
    resume_ckpt_iter_every_epoch = get_attr_kwargs(
        cfg.start, 'resume_ckpt_iter_every_epoch', default=0)

    cfg.defrost()
    cfg.DATASETS.TRAIN = (dataset_name, )
    cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH
    cfg.DATALOADER.ASPECT_RATIO_GROUPING = ASPECT_RATIO_GROUPING
    cfg.DATALOADER.NUM_WORKERS = NUM_WORKERS
    cfg.freeze()

    # build dataset
    mapper = build_dataset_mapper(dataset_mapper)
    data_loader = build_detection_train_loader(cfg, mapper=mapper)
    metadata = MetadataCatalog.get(dataset_name)
    num_images = metadata.get('num_images')
    iter_every_epoch = num_images // IMS_PER_BATCH
    max_iter = iter_every_epoch * max_epoch

    model = build_trainer(cfg,
                          myargs=myargs,
                          iter_every_epoch=iter_every_epoch,
                          img_size=dataset_mapper.img_size,
                          dataset_name=dataset_name,
                          train_bs=IMS_PER_BATCH,
                          max_iter=max_iter)
    model.train()

    # optimizer = build_optimizer(cfg, model)
    optims_dict = model.build_optimizer()
    # scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model.get_saved_model(),
                                         cfg.OUTPUT_DIR, **optims_dict)
    if args.resume:
        resume_ckpt_dir = model._get_ckpt_path(
            ckpt_dir=resume_ckpt_dir,
            ckpt_epoch=resume_ckpt_epoch,
            iter_every_epoch=resume_ckpt_iter_every_epoch)
        start_iter = (
            checkpointer.resume_or_load(resume_ckpt_dir).get("iteration", -1) +
            1)
        if get_attr_kwargs(args, 'finetune', default=False):
            start_iter = 0
    else:
        start_iter = 0

    model.after_resume()

    if run_func != 'train_func':
        eval(f'model.{run_func}()')
        exit(0)

    checkpoint_period = eval(checkpoint_period,
                             dict(iter_every_epoch=iter_every_epoch))
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 checkpoint_period,
                                                 max_iter=max_iter)
    logger.info("Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        pbar = zip(data_loader, range(start_iter, max_iter))
        if comm.is_main_process():
            pbar = tqdm.tqdm(
                pbar,
                desc=f'do_train, {myargs.args.time_str_suffix}, '
                f'iters {iter_every_epoch} * bs {IMS_PER_BATCH} = imgs {iter_every_epoch*IMS_PER_BATCH}',
                file=myargs.stdout,
                initial=start_iter,
                total=max_iter)

        for data, iteration in pbar:
            comm.synchronize()
            iteration = iteration + 1
            storage.step()

            model.train_func(data, iteration - 1, pbar=pbar)

            periodic_checkpointer.step(iteration)
            pass

    comm.synchronize()
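The eval(f'model.{run_func}()') dispatch above can be expressed with getattr, which avoids evaluating an arbitrary string (same behaviour for well-formed method names):

if run_func != 'train_func':
    getattr(model, run_func)()  # e.g. run_func='eval_func' calls model.eval_func()
    exit(0)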
Exemplo n.º 22
0
def do_train(cfg, model, resume=False):
    # Set the model to training mode
    model.train()
    # Build the optimizer
    optimizer = build_optimizer(cfg, model)
    # Build the learning-rate scheduler
    scheduler = build_lr_scheduler(cfg, optimizer)

    # Checkpoint manager
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    # Starting iteration, used when resuming training from a checkpoint
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    # Maximum number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # PeriodicCheckpointer (from fvcore.common.checkpoint) saves and loads the model at the configured checkpoint period
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),  # prints loss and related metrics to the terminal
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # Build the batched training data loader
    data_loader = build_detection_train_loader(cfg)
    # Build test data loaders used to compute the test loss
    test_data_loaders = []
    for dataset_name in cfg.DATASETS.TEST:
        test_data_loaders.append({
            "name": dataset_name,
            "data_loader": build_detection_test_loader(cfg, dataset_name,
                                                       DatasetMapper(cfg, True))
        })
    logger.info("从第{}轮开始训练".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            # Called at the start of every iteration to advance the storage cursor
            storage.step()

            loss_dict = model(data)

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                # Record this iteration's losses in storage (storage.histories() is read later when printing to the terminal)
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            # Backpropagation
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            # Record this iteration's learning rate in storage
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            # if iteration % 21 == 0:
            #     do_loss_eval(cfg, storage, model, test_data_loaders)
            #     for writer in writers:
            #         writer.write()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                do_loss_eval(cfg, storage, model, test_data_loaders)
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
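do_loss_eval is not defined in this snippet. A minimal sketch, under the assumption that it estimates a loss on a few batches of each test loader (the model stays in training mode so detectron2 returns losses) and records one scalar per dataset:

import torch

def do_loss_eval(cfg, storage, model, test_data_loaders, num_batches=20):
    # Hypothetical helper: average the loss over a few batches per test set.
    with torch.no_grad():
        for entry in test_data_loaders:
            total, count = 0.0, 0
            for data, _ in zip(entry["data_loader"], range(num_batches)):
                loss_dict = model(data)
                total += sum(v.item() for v in loss_dict.values())
                count += 1
            if count:
                storage.put_scalar("val_loss/{}".format(entry["name"]), total / count)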
Exemplo n.º 23
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1) + 1)

    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, 'metric.json')),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    data_loader = build_detection_train_loader(cfg)
    logger.info(" Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
Exemplo n.º 24
0
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)

    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS,
        resume=resume,
    ).get("iteration", -1) + 1)
    if cfg.SOLVER.RESET_ITER:
        logger.info('Reset loaded iteration. Start training from iteration 0.')
        start_iter = 0
    max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])


    mapper = DatasetMapper(cfg, True) if cfg.INPUT.CUSTOM_AUG == '' else \
        DatasetMapper(cfg, True, augmentations=build_custom_augmentation(cfg, True))
    if cfg.DATALOADER.SAMPLER_TRAIN in [
            'TrainingSampler', 'RepeatFactorTrainingSampler'
    ]:
        data_loader = build_detection_train_loader(cfg, mapper=mapper)
    else:
        from centernet.data.custom_dataset_dataloader import build_custom_train_loader
        data_loader = build_custom_train_loader(cfg, mapper=mapper)

    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        step_timer = Timer()
        data_timer = Timer()
        start_time = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            data_time = data_timer.seconds()
            storage.put_scalars(data_time=data_time)
            step_timer.reset()
            iteration = iteration + 1
            storage.step()
            loss_dict = model(data)

            losses = sum(loss for k, loss in loss_dict.items())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() \
                for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)

            step_time = step_timer.seconds()
            storage.put_scalars(time=step_time)
            data_timer.reset()
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                comm.synchronize()

            if iteration - start_iter > 5 and \
                (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

        total_time = time.perf_counter() - start_time
        logger.info("Total training time: {}".format(
            str(datetime.timedelta(seconds=int(total_time)))))
Exemplo n.º 25
0
def do_train(cfg, args, model, resume=False):
    # default batch size is 16
    model.train()

    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    max_iter = cfg.SOLVER.MAX_ITER
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ]
               #if comm.is_main_process()
               #else []
               )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    #logger.info("Starting training from iteration {}".format(start_iter))

    iters = 0
    iter_cnt = 0
    iter_sample_start = 1
    iter_sample_end = 20
    iter_end = 300
    start_time, end_time = 0, 0
    sample_iters = iter_sample_end - iter_sample_start + 1

    if args.scheduler:
        if args.scheduler_baseline:
            grc.memory.clean()
            grc.compressor.clean()
            grc.memory.partition()
        else:
            from mergeComp_dl.torch.scheduler.scheduler import Scheduler
            Scheduler(grc, memory_partition, args)

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iters += 1
            iter_cnt += 1
            if iters == iter_end:
                break

            if hvd.local_rank() == 0 and iter_cnt == iter_sample_start:
                torch.cuda.synchronize()
                start_time = time_()

            storage.iter = iteration
            #torch.cuda.synchronize()
            #iter_start_time = time_()

            loss_dict = model(data)

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            #torch.cuda.synchronize()
            #iter_model_time = time_()

            #loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            #losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            #if comm.is_main_process():
            #    storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            #print("loss dict:", loss_dict, "losses:", losses, "reduced loss dict:", loss_dict_reduced, "reduced losses:", losses_reduced)
            losses.backward()

            #torch.cuda.synchronize()
            #iter_backward_time = time_()

            optimizer.step()
            optimizer.zero_grad()

            #torch.cuda.synchronize()
            #print("Iteration: {}\tmodel time: {:.3f} \tbackward time: {:.3f}\tFP+BP Time: {:.3f}\tstep time: {:.3f}\tData size: {}".format(
            #    iteration,
            #    (iter_model_time - iter_start_time),
            #    (iter_backward_time - iter_model_time),
            #    (iter_backward_time - iter_start_time),
            #    time_() - iter_start_time,
            #    len(data)))

            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()
            if args.compress:
                grc.memory.update_lr(optimizer.param_groups[0]['lr'])

            if hvd.local_rank() == 0 and iter_cnt == iter_sample_end:
                torch.cuda.synchronize()
                end_time = time_()
                iter_cnt = 0
                print(
                    "Iterations: {}\tTime: {:.3f} s\tTraining speed: {:.3f} iters/s"
                    .format(sample_iters, end_time - start_time,
                            sample_iters / (end_time - start_time)))

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                do_test(cfg, model)
Exemplo n.º 26
0
def do_train(cfg1, model1, model2, resume=False):

    model1.train()
    optimizer = build_optimizer(cfg1, model1)
    scheduler = build_lr_scheduler(cfg1, optimizer)

    checkpointer = DetectionCheckpointer(model1,
                                         cfg1.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg1.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg1.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg1.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg1.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg1.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = custom_train_loader(cfg1)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            height = data[0]['image'].shape[1]
            width = data[0]['image'].shape[2]
            second_stream_outputs = inference_second_stream(
                model2, data, height, width)

            loss_dict = model1(data, second_stream_outputs)

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg1.TEST.EVAL_PERIOD > 0
                    and iteration % cfg1.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg1, model1, model2)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
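inference_second_stream is external to this snippet. A plausible sketch, assuming model2 is a frozen detectron2 model whose per-image predictions are fed to model1:

import torch

def inference_second_stream(model2, data, height, width):
    # Hypothetical helper: run the second model without gradients and return
    # its per-image outputs for the first model to consume.
    model2.eval()
    with torch.no_grad():
        return model2([{"image": d["image"], "height": height, "width": width}
                       for d in data])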
Exemplo n.º 27
0
def start_train(al_cfg, cfg, model, resume=False):
    early_stopping = EarlyStopping(patience=al_cfg.EARLY_STOP.PATIENCE,
                                   delta=al_cfg.EARLY_STOP.DELTA,
                                   verbose=True)
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                results = do_test(cfg, model)
                bbox_results = results['bbox']
                AP = bbox_results['AP']
                comm.synchronize()
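                # early stopping tracks 1 - AP/100 as a pseudo-loss, so lower is better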
                print('AP:', AP, '\tValue:', 1 - (AP / 100))
                early_stopping(1 - (AP / 100))
                storage.put_scalars(**bbox_results)
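                # counter == 0 means this evaluation improved the best score, so snapshot it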
                if early_stopping.counter < 1:
                    checkpointer.save('model_final')

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            if early_stopping.early_stop:
                print("EARLY STOPPING INITIATED AT ITERATION:", iteration)
                # checkpointer.save('model_final')
                break
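
The `EarlyStopping` helper is assumed by this example rather than shown. A minimal sketch that matches the calls above (`__call__(score)`, `.counter`, `.early_stop`; lower scores are better):

# Hypothetical sketch of the EarlyStopping helper this example relies on.
class EarlyStopping:
    def __init__(self, patience=5, delta=0.0, verbose=False):
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.best_score = None
        self.counter = 0          # evaluations since the last improvement
        self.early_stop = False

    def __call__(self, score):
        if self.best_score is None or score < self.best_score - self.delta:
            self.best_score = score
            self.counter = 0      # improvement: reset the patience counter
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True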
Example no. 28
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    forward_pass_end_time = time.perf_counter()
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration_start_time = time.perf_counter()
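            # Note: this interval also covers the optimizer step and data loading
            # of the previous iteration, so "backwards pass duration" is approximate.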
            if comm.get_rank() == 0:
                print("Approx backwards pass duration: ",
                      iteration_start_time - forward_pass_end_time)
            iteration = iteration + 1
            storage.step()

            if iteration == 500:
                print("Iteration 500. Profiling!")
                with torch.autograd.profiler.profile(
                        use_cuda=True, record_shapes=True) as prof:
                    loss_dict = model(data)
                    losses = sum(loss_dict.values())
                    assert torch.isfinite(losses).all(), loss_dict

                    loss_dict_reduced = {
                        k: v.item()
                        for k, v in comm.reduce_dict(loss_dict).items()
                    }
                    losses_reduced = sum(
                        loss for loss in loss_dict_reduced.values())
                print(prof.key_averages().table(sort_by="self_cpu_time_total"))
                prof.export_chrome_trace("/root/trace.json")
            else:
                loss_dict = model(data)
                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

                loss_dict_reduced = {
                    k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            forward_pass_end_time = time.perf_counter()
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
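
The exported trace can be opened in Chrome's chrome://tracing viewer. On GPU workloads it is often more informative to sort the profiler summary by CUDA time rather than CPU time, e.g.:

print(prof.key_averages().table(sort_by="cuda_time_total"))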
Example no. 29
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
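    # second checkpointer targeting the SageMaker spot-training sync directory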
    checkpointer_spot = DetectionCheckpointer(model,
                                              '/opt/ml/checkpoints',
                                              optimizer=optimizer,
                                              scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    periodic_checkpointer_spot = PeriodicCheckpointer(
        checkpointer_spot, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    # A mapper with augmentations was left disabled here:
    # data_loader = build_detection_train_loader(
    #     cfg,
    #     mapper=DatasetMapper(cfg, is_train=True, augmentations=[
    #         T.Resize((1024, 1024)),
    #         T.RandomBrightness(0.75, 1.25),
    #         T.RandomFlip(),
    #         T.RandomSaturation(0.75, 1.25),
    #     ]),
    # )
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()
            if iteration % 500 == 0:
                try:
                    torch.save(model.state_dict(),
                               f'{cfg.OUTPUT_DIR}/model_{iteration}.pth')
                except Exception as e:
                    print(f'save failed: {e}')

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
            periodic_checkpointer_spot.step(iteration)
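
`/opt/ml/checkpoints` is the directory SageMaker syncs to S3 during managed (spot) training, which is presumably why a second checkpointer writes there. A hedged sketch of resuming from it after a spot interruption (an assumption, not part of the example):

import os

# Assumption: prefer the synced spot directory when it already holds a
# checkpoint (fvcore checkpointers record one in a "last_checkpoint" file).
if os.path.exists(os.path.join('/opt/ml/checkpoints', 'last_checkpoint')):
    start_iter = (checkpointer_spot.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=True).get("iteration", -1) + 1)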
Example no. 30
def train(cfg, args, myargs):
    dataset_name = cfg.start.dataset_name
    IMS_PER_BATCH = cfg.start.IMS_PER_BATCH
    max_epoch = cfg.start.max_epoch
    ASPECT_RATIO_GROUPING = cfg.start.ASPECT_RATIO_GROUPING
    NUM_WORKERS = cfg.start.NUM_WORKERS
    checkpoint_period = cfg.start.checkpoint_period

    cfg.defrost()
    cfg.DATASETS.TRAIN = (dataset_name, )
    cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH
    cfg.DATALOADER.ASPECT_RATIO_GROUPING = ASPECT_RATIO_GROUPING
    cfg.DATALOADER.NUM_WORKERS = NUM_WORKERS
    cfg.freeze()

    # build dataset
    mapper = build_dataset_mapper(cfg)
    data_loader = build_detection_train_loader(cfg, mapper=mapper)
    metadata = MetadataCatalog.get(dataset_name)
    num_images = metadata.get('num_images')
    iter_every_epoch = num_images // IMS_PER_BATCH
    max_iter = iter_every_epoch * max_epoch

    model = build_trainer(cfg,
                          myargs=myargs,
                          iter_every_epoch=iter_every_epoch)
    model.train()

    logger.info("Model:\n{}".format(model))

    # optimizer = build_optimizer(cfg, model)
    optims_dict = model.build_optimizer()
    # scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model.get_saved_model(),
                                         cfg.OUTPUT_DIR, **optims_dict)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=args.resume).get("iteration", -1) + 1)

    checkpoint_period = eval(checkpoint_period,
                             dict(iter_every_epoch=iter_every_epoch))
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 checkpoint_period,
                                                 max_iter=max_iter)

    logger.info("Starting training from iteration {}".format(start_iter))
    modelarts_utils.modelarts_sync_results(args=myargs.args,
                                           myargs=myargs,
                                           join=True,
                                           end=False)
    with EventStorage(start_iter) as storage:
        pbar = zip(data_loader, range(start_iter, max_iter))
        if comm.is_main_process():
            pbar = tqdm.tqdm(
                pbar,
                desc=f'train, {myargs.args.time_str_suffix}, '
                f'iters {iter_every_epoch} * bs {IMS_PER_BATCH} = imgs {iter_every_epoch*IMS_PER_BATCH}',
                file=myargs.stdout,
                initial=start_iter,
                total=max_iter)

        for data, iteration in pbar:
            comm.synchronize()
            iteration = iteration + 1
            storage.step()

            model.train_func(data, iteration - 1, pbar=pbar)

            periodic_checkpointer.step(iteration)
    modelarts_utils.modelarts_sync_results(args=myargs.args,
                                           myargs=myargs,
                                           join=True,
                                           end=True)
    comm.synchronize()
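
The `eval(checkpoint_period, ...)` call above implies the config stores the checkpoint period as an expression string that may reference `iter_every_epoch`. An illustrative example (the values are made up):

checkpoint_period = "iter_every_epoch * 5"   # hypothetical config value
period = eval(checkpoint_period, dict(iter_every_epoch=1000))
print(period)  # 5000

Evaluating config strings with eval assumes the config files are trusted input.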