def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else []

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.iter = iteration

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter - 1
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % 20 == 0 or iteration == max_iter - 1
            ):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=cfg.SOLVER.BASE_LR)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = [
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ]

    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            storage.put_scalars(total_loss=losses, **loss_dict)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)

            scheduler.step()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def get_d2_periodic_checkpointer(self):
    """
    periodic_checkpointer.step(epoch, **{'first_epoch': epoch})
    periodic_checkpointer.save(name='best', **{'max_mIoU': max_mIoU})
    """
    periodic_checkpointer = PeriodicCheckpointer(
        self.checkpointer,
        period=self.period,
        max_iter=self.maxsize,
        max_to_keep=self.max_to_keep,
    )
    return periodic_checkpointer
def do_train(self, cfg, args, model, optimizer, resume=False): model.train() # some basic settings ========================= dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) data_ref = ref.__dict__[dataset_meta.ref_key] obj_names = dataset_meta.objs # load data =================================== train_dset_names = cfg.DATASETS.TRAIN data_loader = build_gdrn_train_loader(cfg, train_dset_names) data_loader_iter = iter(data_loader) # load 2nd train dataloader if needed train_2_dset_names = cfg.DATASETS.get("TRAIN2", ()) train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0) if train_2_ratio > 0.0 and len(train_2_dset_names) > 0: data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names) data_loader_2_iter = iter(data_loader_2) else: data_loader_2 = None data_loader_2_iter = None images_per_batch = cfg.SOLVER.IMS_PER_BATCH if isinstance(data_loader, AspectRatioGroupedDataset): dataset_len = len(data_loader.dataset.dataset) if data_loader_2 is not None: dataset_len += len(data_loader_2.dataset.dataset) iters_per_epoch = dataset_len // images_per_batch else: dataset_len = len(data_loader.dataset) if data_loader_2 is not None: dataset_len += len(data_loader_2.dataset) iters_per_epoch = dataset_len // images_per_batch max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch dprint("images_per_batch: ", images_per_batch) dprint("dataset length: ", dataset_len) dprint("iters per epoch: ", iters_per_epoch) dprint("total iters: ", max_iter) data_loader = self.setup_dataloaders(data_loader, replace_sampler=False, move_to_device=False) if data_loader_2 is not None: data_loader_2 = self.setup_dataloaders(data_loader_2, replace_sampler=False, move_to_device=False) scheduler = solver_utils.build_lr_scheduler(cfg, optimizer, total_iters=max_iter) # resume or load model =================================== extra_ckpt_dict = dict( optimizer=optimizer, scheduler=scheduler, ) if hasattr(self._precision_plugin, "scaler"): extra_ckpt_dict["gradscaler"] = self._precision_plugin.scaler checkpointer = MyCheckpointer( model, cfg.OUTPUT_DIR, save_to_disk=self.is_global_zero, **extra_ckpt_dict, ) start_iter = checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 if cfg.SOLVER.CHECKPOINT_BY_EPOCH: ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch else: ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD periodic_checkpointer = PeriodicCheckpointer( checkpointer, ckpt_period, max_iter=max_iter, max_to_keep=cfg.SOLVER.MAX_TO_KEEP) # build writers ============================================== tbx_event_writer = self.get_tbx_event_writer( cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False)) tbx_writer = tbx_event_writer._writer # NOTE: we want to write some non-scalar data writers = ([ MyCommonMetricPrinter(max_iter), MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")), tbx_event_writer ] if self.is_global_zero else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement logger.info("Starting training from iteration {}".format(start_iter)) iter_time = None with EventStorage(start_iter) as storage: for iteration in range(start_iter, max_iter): storage.iter = iteration epoch = iteration // dataset_len + 1 if np.random.rand() < train_2_ratio: data = next(data_loader_2_iter) else: data = next(data_loader_iter) if iter_time is not None: storage.put_scalar("time", time.perf_counter() - iter_time) iter_time = time.perf_counter() # forward ============================================================ batch = 
batch_data(cfg, data) out_dict, loss_dict = model( batch["roi_img"], gt_xyz=batch.get("roi_xyz", None), gt_xyz_bin=batch.get("roi_xyz_bin", None), gt_mask_trunc=batch["roi_mask_trunc"], gt_mask_visib=batch["roi_mask_visib"], gt_mask_obj=batch["roi_mask_obj"], gt_region=batch.get("roi_region", None), gt_allo_quat=batch.get("allo_quat", None), gt_ego_quat=batch.get("ego_quat", None), gt_allo_rot6d=batch.get("allo_rot6d", None), gt_ego_rot6d=batch.get("ego_rot6d", None), gt_ego_rot=batch.get("ego_rot", None), gt_trans=batch.get("trans", None), gt_trans_ratio=batch["roi_trans_ratio"], gt_points=batch.get("roi_points", None), sym_infos=batch.get("sym_info", None), roi_classes=batch["roi_cls"], roi_cams=batch["roi_cam"], roi_whs=batch["roi_wh"], roi_centers=batch["roi_center"], resize_ratios=batch["resize_ratio"], roi_coord_2d=batch.get("roi_coord_2d", None), roi_extents=batch.get("roi_extent", None), do_loss=True, ) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if self.is_global_zero: storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad(set_to_none=True) self.backward(losses) optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1): self.do_test(cfg, model, epoch=epoch, iteration=iteration) # Compared to "train_net.py", the test results are not dumped to EventStorage self.barrier() if iteration - start_iter > 5 and ( (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0 or iteration == max_iter - 1 or iteration < 100): for writer in writers: writer.write() # visualize some images ======================================== if cfg.TRAIN.VIS_IMG: with torch.no_grad(): vis_i = 0 roi_img_vis = batch["roi_img"][vis_i].cpu().numpy() roi_img_vis = denormalize_image( roi_img_vis, cfg).transpose(1, 2, 0).astype("uint8") tbx_writer.add_image("input_image", roi_img_vis, iteration) out_coor_x = out_dict["coor_x"].detach() out_coor_y = out_dict["coor_y"].detach() out_coor_z = out_dict["coor_z"].detach() out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y, out_coor_z) out_xyz_vis = out_xyz[vis_i].cpu().numpy( ).transpose(1, 2, 0) out_xyz_vis = get_emb_show(out_xyz_vis) tbx_writer.add_image("out_xyz", out_xyz_vis, iteration) gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy( ).transpose(1, 2, 0) gt_xyz_vis = get_emb_show(gt_xyz_vis) tbx_writer.add_image("gt_xyz", gt_xyz_vis, iteration) out_mask = out_dict["mask"].detach() out_mask = get_out_mask(cfg, out_mask) out_mask_vis = out_mask[vis_i, 0].cpu().numpy() tbx_writer.add_image("out_mask", out_mask_vis, iteration) gt_mask_vis = batch["roi_mask"][vis_i].detach( ).cpu().numpy() tbx_writer.add_image("gt_mask", gt_mask_vis, iteration) if (iteration + 1) % periodic_checkpointer.period == 0 or ( periodic_checkpointer.max_iter is not None and (iteration + 1) >= periodic_checkpointer.max_iter): if hasattr(optimizer, "consolidate_state_dict"): # for ddp_sharded optimizer.consolidate_state_dict() periodic_checkpointer.step(iteration, epoch=epoch)
def do_train(self, cfg, model, resume): add_print_flops_callback(cfg, model, disable_after_callback=True) optimizer = self.build_optimizer(cfg, model) scheduler = self.build_lr_scheduler(cfg, optimizer) checkpointer = self.build_checkpointer( cfg, model, save_dir=cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, ) checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume) start_iter = (checkpoint.get("iteration", -1) if resume and checkpointer.has_checkpoint() else -1) # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). start_iter += 1 max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer( checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) data_loader = self.build_detection_train_loader(cfg) def _get_model_with_abnormal_checker(model): if not cfg.ABNORMAL_CHECKER.ENABLED: return model tbx_writer = _get_tbx_writer( get_tensorboard_log_dir(cfg.OUTPUT_DIR)) writers = abnormal_checker.get_writers(cfg, tbx_writer) checker = abnormal_checker.AbnormalLossChecker(start_iter, writers) ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker) return ret trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)( _get_model_with_abnormal_checker(model), data_loader, optimizer) trainer_hooks = [ hooks.IterationTimer(), model_ema.EMAHook(cfg, model) if cfg.MODEL_EMA.ENABLED else None, self._create_after_step_hook(cfg, model, optimizer, scheduler, periodic_checkpointer), hooks.EvalHook( cfg.TEST.EVAL_PERIOD, lambda: self.do_test(cfg, model, train_iter=trainer.iter), ), kmeans_anchors.compute_kmeans_anchors_hook(self, cfg), self._create_qat_hook(cfg) if cfg.QUANTIZATION.QAT.ENABLED else None, ] if comm.is_main_process(): tbx_writer = _get_tbx_writer( get_tensorboard_log_dir(cfg.OUTPUT_DIR)) writers = [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), tbx_writer, ] trainer_hooks.append(hooks.PeriodicWriter(writers)) trainer.register_hooks(trainer_hooks) trainer.train(start_iter, max_iter) if hasattr(self, 'original_cfg'): table = get_cfg_diff_table(cfg, self.original_cfg) logger.info( "GeneralizeRCNN Runner ignoring training config change: \n" + table) trained_cfg = self.original_cfg.clone() else: trained_cfg = cfg.clone() with temp_defrost(trained_cfg): trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file() return {"model_final": trained_cfg}
def do_relation_train(cfg, model, resume=False): model.train() for param in model.named_parameters(): param[1].requires_grad = False for param in model.named_parameters(): for trainable in cfg.MODEL.TRAINABLE: if param[0].startswith(trainable): param[1].requires_grad = True break if param[0] == "relation_heads.instance_head.semantic_embed.weight" or \ param[0] == "relation_heads.pair_head.semantic_embed.weight" or \ param[0] == "relation_heads.predicate_head.semantic_embed.weight" or \ param[0] == "relation_heads.triplet_head.ins_embed.weight" or \ param[0] == "relation_heads.triplet_head.pred_embed.weight" or \ param[0] == "relation_heads.subpred_head.sub_embed.weight" or \ param[0] == "relation_heads.subpred_head.pred_embed.weight" or \ param[0] == "relation_heads.predobj_head.pred_embed.weight" or \ param[0] == "relation_heads.predobj_head.obj_embed.weight" or \ param[0].startswith("relation_heads.predicate_head.freq_bias.obj_baseline.weight"): param[1].requires_grad = False optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) metrics_sum_dict = { 'relation_cls_tp_sum': 0, 'relation_cls_p_sum': 0.00001, 'pred_class_tp_sum': 0, 'pred_class_p_sum': 0.00001, 'gt_class_tp_sum': 0, 'gt_class_p_sum': 0.00001, 'raw_pred_class_tp_sum': 0, 'raw_pred_class_p_sum': 0.00001, 'instance_tp_sum':0, 'instance_p_sum': 0.00001, 'instance_g_sum':0.00001, 'subpred_tp_sum': 0, 'subpred_p_sum': 0.00001, 'subpred_g_sum': 0.00001, 'predobj_tp_sum': 0, 'predobj_p_sum': 0.00001, 'predobj_g_sum': 0.00001, 'pair_tp_sum':0, 'pair_p_sum': 0.00001, 'pair_g_sum':0.00001, 'confidence_tp_sum': 0, 'confidence_p_sum': 0.00001, 'confidence_g_sum': 0.00001, 'predicate_tp_sum': 0, 'predicate_tp20_sum': 0, 'predicate_tp50_sum': 0, 'predicate_tp100_sum': 0, 'predicate_p_sum': 0.00001, 'predicate_p20_sum': 0.00001, 'predicate_p50_sum': 0.00001, 'predicate_p100_sum': 0.00001, 'predicate_g_sum': 0.00001, 'triplet_tp_sum': 0, 'triplet_tp20_sum': 0, 'triplet_tp50_sum': 0, 'triplet_tp100_sum': 0, 'triplet_p_sum': 0.00001, 'triplet_p20_sum': 0.00001, 'triplet_p50_sum': 0.00001, 'triplet_p100_sum': 0.00001, 'triplet_g_sum': 0.00001, } checkpointer = DetectionCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, metrics_sum_dict=metrics_sum_dict ) start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) # state_dict=torch.load(cfg.MODEL.WEIGHTS).pop("model") # model.load_state_dict(state_dict,strict=False) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) # relation_cls_state_dict=torch.load(cfg.MODEL.WEIGHTS).pop("model") # for param in model.named_parameters(): # if param[0] not in relation_cls_state_dict: # print(param[0]) # model.load_state_dict(relation_cls_state_dict,strict=False) writers = ( [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else [] ) metrics_pr_dict={} # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement data_loader = build_detection_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) acumulate_losses=0 with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): print(iteration) iteration = iteration + 1 storage.step() if True: # try: pred_instances, 
results_dict, losses_dict, metrics_dict = model(data,iteration,mode="relation",training=True) losses = sum(loss for loss in losses_dict.values()) assert torch.isfinite(losses).all(), losses_dict #print(losses_dict) loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(losses_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) acumulate_losses += losses_reduced if comm.is_main_process(): storage.put_scalars(acumulate_losses=acumulate_losses/(iteration-start_iter),total_loss=losses_reduced, **loss_dict_reduced) if 'relation_cls_tp' in metrics_dict: metrics_sum_dict['relation_cls_tp_sum']+=metrics_dict['relation_cls_tp'] metrics_sum_dict['relation_cls_p_sum'] += metrics_dict['relation_cls_p'] metrics_pr_dict['relation_cls_precision'] = metrics_sum_dict['relation_cls_tp_sum'] / metrics_sum_dict['relation_cls_p_sum'] if 'pred_class_tp' in metrics_dict: metrics_sum_dict['pred_class_tp_sum']+=metrics_dict['pred_class_tp'] metrics_sum_dict['pred_class_p_sum'] += metrics_dict['pred_class_p'] metrics_pr_dict['pred_class_precision'] = metrics_sum_dict['pred_class_tp_sum'] / metrics_sum_dict['pred_class_p_sum'] if 'raw_pred_class_tp' in metrics_dict: metrics_sum_dict['raw_pred_class_tp_sum']+=metrics_dict['raw_pred_class_tp'] metrics_sum_dict['raw_pred_class_p_sum'] += metrics_dict['raw_pred_class_p'] metrics_pr_dict['raw_pred_class_precision'] = metrics_sum_dict['raw_pred_class_tp_sum'] / metrics_sum_dict['raw_pred_class_p_sum'] if 'gt_class_tp' in metrics_dict: metrics_sum_dict['gt_class_tp_sum']+=metrics_dict['gt_class_tp'] metrics_sum_dict['gt_class_p_sum'] += metrics_dict['gt_class_p'] metrics_pr_dict['gt_class_precision'] = metrics_sum_dict['gt_class_tp_sum'] / metrics_sum_dict['gt_class_p_sum'] if 'instance_tp' in metrics_dict: metrics_sum_dict['instance_tp_sum']+=metrics_dict['instance_tp'] metrics_sum_dict['instance_p_sum'] += metrics_dict['instance_p'] metrics_sum_dict['instance_g_sum'] += metrics_dict['instance_g'] metrics_pr_dict['instance_precision'] = metrics_sum_dict['instance_tp_sum'] / metrics_sum_dict['instance_p_sum'] metrics_pr_dict['instance_recall'] = metrics_sum_dict['instance_tp_sum'] / metrics_sum_dict['instance_g_sum'] if 'subpred_tp' in metrics_dict: metrics_sum_dict['subpred_tp_sum']+=metrics_dict['subpred_tp'] metrics_sum_dict['subpred_p_sum'] += metrics_dict['subpred_p'] metrics_sum_dict['subpred_g_sum'] += metrics_dict['subpred_g'] metrics_pr_dict['subpred_precision'] = metrics_sum_dict['subpred_tp_sum'] / metrics_sum_dict['subpred_p_sum'] metrics_pr_dict['subpred_recall'] = metrics_sum_dict['subpred_tp_sum'] / metrics_sum_dict['subpred_g_sum'] if 'predobj_tp' in metrics_dict: metrics_sum_dict['predobj_tp_sum']+=metrics_dict['predobj_tp'] metrics_sum_dict['predobj_p_sum'] += metrics_dict['predobj_p'] metrics_sum_dict['predobj_g_sum'] += metrics_dict['predobj_g'] metrics_pr_dict['predobj_precision'] = metrics_sum_dict['predobj_tp_sum'] / metrics_sum_dict['predobj_p_sum'] metrics_pr_dict['predobj_recall'] = metrics_sum_dict['predobj_tp_sum'] / metrics_sum_dict['predobj_g_sum'] if 'pair_tp' in metrics_dict: metrics_sum_dict['pair_tp_sum'] += metrics_dict['pair_tp'] metrics_sum_dict['pair_p_sum'] += metrics_dict['pair_p'] metrics_sum_dict['pair_g_sum'] += metrics_dict['pair_g'] metrics_pr_dict['pair_precision'] = metrics_sum_dict['pair_tp_sum'] / metrics_sum_dict['pair_p_sum'] metrics_pr_dict['pair_recall'] = metrics_sum_dict['pair_tp_sum'] / metrics_sum_dict['pair_g_sum'] if 'confidence_tp' in metrics_dict: 
metrics_sum_dict['confidence_tp_sum']+=metrics_dict['confidence_tp'] metrics_sum_dict['confidence_p_sum'] += metrics_dict['confidence_p'] metrics_sum_dict['confidence_g_sum'] += metrics_dict['confidence_g'] metrics_pr_dict['confidence_precision'] = metrics_sum_dict['confidence_tp_sum'] / metrics_sum_dict['confidence_p_sum'] metrics_pr_dict['confidence_recall'] = metrics_sum_dict['confidence_tp_sum'] / metrics_sum_dict['confidence_g_sum'] if 'predicate_tp' in metrics_dict: metrics_sum_dict['predicate_tp_sum']+=metrics_dict['predicate_tp'] metrics_sum_dict['predicate_tp20_sum'] += metrics_dict['predicate_tp20'] metrics_sum_dict['predicate_tp50_sum'] += metrics_dict['predicate_tp50'] metrics_sum_dict['predicate_tp100_sum'] += metrics_dict['predicate_tp100'] metrics_sum_dict['predicate_p_sum'] += metrics_dict['predicate_p'] metrics_sum_dict['predicate_p20_sum'] += metrics_dict['predicate_p20'] metrics_sum_dict['predicate_p50_sum'] += metrics_dict['predicate_p50'] metrics_sum_dict['predicate_p100_sum'] += metrics_dict['predicate_p100'] metrics_sum_dict['predicate_g_sum'] += metrics_dict['predicate_g'] metrics_pr_dict['predicate_precision'] = metrics_sum_dict['predicate_tp_sum'] / metrics_sum_dict['predicate_p_sum'] metrics_pr_dict['predicate_precision20'] = metrics_sum_dict['predicate_tp20_sum'] / metrics_sum_dict['predicate_p20_sum'] metrics_pr_dict['predicate_precision50'] = metrics_sum_dict['predicate_tp50_sum'] / metrics_sum_dict['predicate_p50_sum'] metrics_pr_dict['predicate_precision100'] = metrics_sum_dict['predicate_tp100_sum'] / metrics_sum_dict['predicate_p100_sum'] metrics_pr_dict['predicate_recall'] = metrics_sum_dict['predicate_tp_sum'] / metrics_sum_dict['predicate_g_sum'] metrics_pr_dict['predicate_recall20'] = metrics_sum_dict['predicate_tp20_sum'] / metrics_sum_dict['predicate_g_sum'] metrics_pr_dict['predicate_recall50'] = metrics_sum_dict['predicate_tp50_sum'] / metrics_sum_dict['predicate_g_sum'] metrics_pr_dict['predicate_recall100'] = metrics_sum_dict['predicate_tp100_sum'] / metrics_sum_dict['predicate_g_sum'] if 'triplet_tp' in metrics_dict: metrics_sum_dict['triplet_tp_sum'] += metrics_dict['triplet_tp'] metrics_sum_dict['triplet_tp20_sum'] += metrics_dict['triplet_tp20'] metrics_sum_dict['triplet_tp50_sum'] += metrics_dict['triplet_tp50'] metrics_sum_dict['triplet_tp100_sum'] += metrics_dict['triplet_tp100'] metrics_sum_dict['triplet_p_sum'] += metrics_dict['triplet_p'] metrics_sum_dict['triplet_p20_sum'] += metrics_dict['triplet_p20'] metrics_sum_dict['triplet_p50_sum'] += metrics_dict['triplet_p50'] metrics_sum_dict['triplet_p100_sum'] += metrics_dict['triplet_p100'] metrics_sum_dict['triplet_g_sum'] += metrics_dict['triplet_g'] metrics_pr_dict['triplet_precision'] = metrics_sum_dict['triplet_tp_sum'] / metrics_sum_dict['triplet_p_sum'] metrics_pr_dict['triplet_precision20'] = metrics_sum_dict['triplet_tp20_sum'] / metrics_sum_dict['triplet_p20_sum'] metrics_pr_dict['triplet_precision50'] = metrics_sum_dict['triplet_tp50_sum'] / metrics_sum_dict['triplet_p50_sum'] metrics_pr_dict['triplet_precision100'] = metrics_sum_dict['triplet_tp100_sum'] / metrics_sum_dict['triplet_p100_sum'] metrics_pr_dict['triplet_recall'] = metrics_sum_dict['triplet_tp_sum'] / metrics_sum_dict['triplet_g_sum'] metrics_pr_dict['triplet_recall20'] = metrics_sum_dict['triplet_tp20_sum'] / metrics_sum_dict['triplet_g_sum'] metrics_pr_dict['triplet_recall50'] = metrics_sum_dict['triplet_tp50_sum'] / metrics_sum_dict['triplet_g_sum'] metrics_pr_dict['triplet_recall100'] = 
metrics_sum_dict['triplet_tp100_sum'] / metrics_sum_dict['triplet_g_sum'] storage.put_scalars(**metrics_pr_dict, smoothing_hint=False) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration) torch.cuda.empty_cache()
def do_train(cfg, args): # fmt: off run_func = cfg.start.get('run_func', 'train_func') dataset_name = cfg.start.dataset_name IMS_PER_BATCH = cfg.start.IMS_PER_BATCH * comm.get_world_size() NUM_WORKERS = cfg.start.NUM_WORKERS dataset_mapper = cfg.start.dataset_mapper max_epoch = cfg.start.max_epoch checkpoint_period = cfg.start.checkpoint_period resume_cfg = get_attr_kwargs(cfg.start, 'resume_cfg', default=None) cfg.defrost() cfg.DATASETS.TRAIN = (dataset_name, ) cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH cfg.DATALOADER.NUM_WORKERS = NUM_WORKERS cfg.freeze() # fmt: on # build dataset mapper = build_dataset_mapper(dataset_mapper) data_loader = build_detection_train_loader(cfg, mapper=mapper) metadata = MetadataCatalog.get(dataset_name) num_samples = metadata.get('num_samples') iter_every_epoch = num_samples // IMS_PER_BATCH max_iter = iter_every_epoch * max_epoch model = build_trainer(cfg=cfg, args=args, iter_every_epoch=iter_every_epoch, batch_size=IMS_PER_BATCH, max_iter=max_iter, metadata=metadata, max_epoch=max_epoch, data_loader=data_loader) model.train() # optimizer = build_optimizer(cfg, model) optims_dict = model.build_optimizer() scheduler = model.build_lr_scheduler() checkpointer = DetectionCheckpointer(model.get_saved_model(), cfg.OUTPUT_DIR, **optims_dict, **scheduler) if resume_cfg and resume_cfg.resume: resume_ckpt_dir = model._get_ckpt_path( ckpt_dir=resume_cfg.ckpt_dir, ckpt_epoch=resume_cfg.ckpt_epoch, iter_every_epoch=resume_cfg.iter_every_epoch) start_iter = ( checkpointer.resume_or_load(resume_ckpt_dir).get("iteration", -1) + 1) if get_attr_kwargs(resume_cfg, 'finetune', default=False): start_iter = 0 model.after_resume() else: start_iter = 0 if run_func != 'train_func': eval(f'model.{run_func}()') exit(0) checkpoint_period = eval(checkpoint_period, dict(iter_every_epoch=iter_every_epoch)) periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter) logger.info("Starting training from iteration {}".format(start_iter)) # modelarts_utils.modelarts_sync_results(args=myargs.args, myargs=myargs, join=True, end=False) with EventStorage(start_iter) as storage: pbar = zip(data_loader, range(start_iter, max_iter)) if comm.is_main_process(): pbar = tqdm.tqdm( pbar, desc=f'do_train, {args.tl_time_str}, ' f'iters {iter_every_epoch} * bs {IMS_PER_BATCH} = ' f'imgs {iter_every_epoch*IMS_PER_BATCH}', initial=start_iter, total=max_iter) for data, iteration in pbar: comm.synchronize() iteration = iteration + 1 storage.step() model.train_func(data, iteration - 1, pbar=pbar) periodic_checkpointer.step(iteration) pass # modelarts_utils.modelarts_sync_results(args=myargs.args, myargs=myargs, join=True, end=True) comm.synchronize()
def do_train(cfg, args, model, optimizer, resume=False): model.train() # some basic settings ========================= dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) data_ref = ref.__dict__[dataset_meta.ref_key] obj_names = dataset_meta.objs # load data =================================== train_dset_names = cfg.DATASETS.TRAIN data_loader = build_gdrn_train_loader(cfg, train_dset_names) data_loader_iter = iter(data_loader) # load 2nd train dataloader if needed train_2_dset_names = cfg.DATASETS.get("TRAIN2", ()) train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0) if train_2_ratio > 0.0 and len(train_2_dset_names) > 0: data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names) data_loader_2_iter = iter(data_loader_2) else: data_loader_2 = None data_loader_2_iter = None images_per_batch = cfg.SOLVER.IMS_PER_BATCH if isinstance(data_loader, AspectRatioGroupedDataset): dataset_len = len(data_loader.dataset.dataset) if data_loader_2 is not None: dataset_len += len(data_loader_2.dataset.dataset) iters_per_epoch = dataset_len // images_per_batch else: dataset_len = len(data_loader.dataset) if data_loader_2 is not None: dataset_len += len(data_loader_2.dataset) iters_per_epoch = dataset_len // images_per_batch max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch dprint("images_per_batch: ", images_per_batch) dprint("dataset length: ", dataset_len) dprint("iters per epoch: ", iters_per_epoch) dprint("total iters: ", max_iter) scheduler = solver_utils.build_lr_scheduler(cfg, optimizer, total_iters=max_iter) AMP_ON = cfg.SOLVER.AMP.ENABLED logger.info(f"AMP enabled: {AMP_ON}") grad_scaler = GradScaler() # resume or load model =================================== checkpointer = MyCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, gradscaler=grad_scaler, save_to_disk=comm.is_main_process(), ) start_iter = checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 if comm._USE_HVD: # hvd may be not available, so do not use the one in args # not needed # start_iter = hvd.broadcast(torch.tensor(start_iter), root_rank=0, name="start_iter").item() # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. 
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), op=hvd.Adasum if args.use_adasum else hvd.Average, compression=compression, ) # device_dense='/cpu:0' if cfg.SOLVER.CHECKPOINT_BY_EPOCH: ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch else: ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD periodic_checkpointer = PeriodicCheckpointer( checkpointer, ckpt_period, max_iter=max_iter, max_to_keep=cfg.SOLVER.MAX_TO_KEEP) # build writers ============================================== tbx_event_writer = get_tbx_event_writer( cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False)) tbx_writer = tbx_event_writer._writer # NOTE: we want to write some non-scalar data writers = ([ MyCommonMetricPrinter(max_iter), MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")), tbx_event_writer ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement logger.info("Starting training from iteration {}".format(start_iter)) iter_time = None with EventStorage(start_iter) as storage: # for data, iteration in zip(data_loader, range(start_iter, max_iter)): for iteration in range(start_iter, max_iter): storage.iter = iteration epoch = iteration // dataset_len + 1 if np.random.rand() < train_2_ratio: data = next(data_loader_2_iter) else: data = next(data_loader_iter) if iter_time is not None: storage.put_scalar("time", time.perf_counter() - iter_time) iter_time = time.perf_counter() # forward ============================================================ batch = batch_data(cfg, data) with autocast(enabled=AMP_ON): out_dict, loss_dict = model( batch["roi_img"], gt_xyz=batch.get("roi_xyz", None), gt_xyz_bin=batch.get("roi_xyz_bin", None), gt_mask_trunc=batch["roi_mask_trunc"], gt_mask_visib=batch["roi_mask_visib"], gt_mask_obj=batch["roi_mask_obj"], gt_region=batch.get("roi_region", None), gt_allo_quat=batch.get("allo_quat", None), gt_ego_quat=batch.get("ego_quat", None), gt_allo_rot6d=batch.get("allo_rot6d", None), gt_ego_rot6d=batch.get("ego_rot6d", None), gt_ego_rot=batch.get("ego_rot", None), gt_trans=batch.get("trans", None), gt_trans_ratio=batch["roi_trans_ratio"], gt_points=batch.get("roi_points", None), sym_infos=batch.get("sym_info", None), roi_classes=batch["roi_cls"], roi_cams=batch["roi_cam"], roi_whs=batch["roi_wh"], roi_centers=batch["roi_center"], resize_ratios=batch["resize_ratio"], roi_coord_2d=batch.get("roi_coord_2d", None), roi_extents=batch.get("roi_extent", None), do_loss=True, ) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() if AMP_ON: grad_scaler.scale(losses).backward() # # Unscales the gradients of optimizer's assigned params in-place # grad_scaler.unscale_(optimizer) # # Since the gradients of optimizer's assigned params are unscaled, clips as usual: # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) if comm._USE_HVD: optimizer.synchronize() with optimizer.skip_synchronize(): grad_scaler.step(optimizer) grad_scaler.update() else: grad_scaler.step(optimizer) grad_scaler.update() else: losses.backward() optimizer.step() storage.put_scalar("lr", 
optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if cfg.TEST.EVAL_PERIOD > 0 and ( iteration + 1 ) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1: do_test(cfg, model, epoch=epoch, iteration=iteration) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration - start_iter > 5 and ( (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0 or iteration == max_iter - 1 or iteration < 100): for writer in writers: writer.write() # visualize some images ======================================== if cfg.TRAIN.VIS_IMG: with torch.no_grad(): vis_i = 0 roi_img_vis = batch["roi_img"][vis_i].cpu().numpy() roi_img_vis = denormalize_image(roi_img_vis, cfg).transpose( 1, 2, 0).astype("uint8") tbx_writer.add_image("input_image", roi_img_vis, iteration) out_coor_x = out_dict["coor_x"].detach() out_coor_y = out_dict["coor_y"].detach() out_coor_z = out_dict["coor_z"].detach() out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y, out_coor_z) out_xyz_vis = out_xyz[vis_i].cpu().numpy().transpose( 1, 2, 0) out_xyz_vis = get_emb_show(out_xyz_vis) tbx_writer.add_image("out_xyz", out_xyz_vis, iteration) gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy( ).transpose(1, 2, 0) gt_xyz_vis = get_emb_show(gt_xyz_vis) tbx_writer.add_image("gt_xyz", gt_xyz_vis, iteration) out_mask = out_dict["mask"].detach() out_mask = get_out_mask(cfg, out_mask) out_mask_vis = out_mask[vis_i, 0].cpu().numpy() tbx_writer.add_image("out_mask", out_mask_vis, iteration) gt_mask_vis = batch["roi_mask"][vis_i].detach().cpu( ).numpy() tbx_writer.add_image("gt_mask", gt_mask_vis, iteration) periodic_checkpointer.step(iteration, epoch=epoch)
model = build_model(cfg)
logger.info("Model:\n{}".format(model))
model.train()

optimizer = build_optimizer(cfg, model)
scheduler = build_lr_scheduler(cfg, optimizer)

checkpointer = DetectionCheckpointer(
    model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
)
start_iter = (
    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=False).get("iteration", -1) + 1
)
max_iter = cfg.SOLVER.MAX_ITER

periodic_checkpointer = PeriodicCheckpointer(
    checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
)

writers = (
    [
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ]
    if comm.is_main_process()
    else []
)

# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
def do_train(cfg_source, cfg_target, model, resume=False):
    model.train()
    print(model)
    optimizer = build_optimizer(cfg_source, model)
    scheduler = build_lr_scheduler(cfg_source, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg_source.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg_source.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg_source.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg_source.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg_source.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg_source.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    i = 1
    max_epoch = 41.27  # max iter / min(data_len(data_source, data_target))
    current_epoch = 0
    data_len = 1502
    alpha3 = 0
    alpha4 = 0
    alpha5 = 0

    data_loader_source = build_detection_train_loader(cfg_source)
    data_loader_target = build_detection_train_loader(cfg_target)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data_source, data_target, iteration in zip(
                data_loader_source, data_loader_target, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            if (iteration % data_len) == 0:
                current_epoch += 1
                i = 1
            p = float(i + current_epoch * data_len) / max_epoch / data_len
            alpha = 2. / (1. + np.exp(-10 * p)) - 1
            i += 1
            alpha3 = alpha
            alpha4 = alpha
            alpha5 = alpha
            if alpha3 > 0.5:
                alpha3 = 0.5
            if alpha4 > 0.5:
                alpha4 = 0.5
            if alpha5 > 0.1:
                alpha5 = 0.1

            loss_dict = model(data_source, False, alpha3, alpha4, alpha5)
            loss_dict_target = model(data_target, True, alpha3, alpha4, alpha5)
            loss_dict["loss_r3"] += loss_dict_target["loss_r3"]
            loss_dict["loss_r4"] += loss_dict_target["loss_r4"]
            loss_dict["loss_r5"] += loss_dict_target["loss_r5"]
            loss_dict["loss_r3"] *= 0.5
            loss_dict["loss_r4"] *= 0.5
            loss_dict["loss_r5"] *= 0.5
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def do_train(self, cfg, model, resume): # Note that flops at the beginning of training is often inaccurate, # if a model has input-dependent logic attach_profilers(cfg, model) optimizer = self.build_optimizer(cfg, model) scheduler = self.build_lr_scheduler(cfg, optimizer) checkpointer = self.build_checkpointer( cfg, model, save_dir=cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, ) checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume) start_iter = (checkpoint.get("iteration", -1) if resume and checkpointer.has_checkpoint() else -1) # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). start_iter += 1 max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer( checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) data_loader = self.build_detection_train_loader(cfg) def _get_model_with_abnormal_checker(model): if not cfg.ABNORMAL_CHECKER.ENABLED: return model tbx_writer = self.get_tbx_writer(cfg) writers = abnormal_checker.get_writers(cfg, tbx_writer) checker = abnormal_checker.AbnormalLossChecker(start_iter, writers) ret = abnormal_checker.AbnormalLossCheckerWrapper(model, checker) return ret trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)( _get_model_with_abnormal_checker(model), data_loader, optimizer) trainer_hooks = self._get_trainer_hooks(cfg, model, optimizer, scheduler, periodic_checkpointer, trainer) if comm.is_main_process(): tbx_writer = self.get_tbx_writer(cfg) writers = [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), tbx_writer, ] trainer_hooks.append(hooks.PeriodicWriter(writers)) update_hooks_from_registry(trainer_hooks) trainer.register_hooks(trainer_hooks) trainer.train(start_iter, max_iter) if hasattr(self, "original_cfg"): table = get_cfg_diff_table(cfg, self.original_cfg) logger.info( "GeneralizeRCNN Runner ignoring training config change: \n" + table) trained_cfg = self.original_cfg.clone() else: trained_cfg = cfg.clone() with temp_defrost(trained_cfg): trained_cfg.MODEL.WEIGHTS = checkpointer.get_checkpoint_file() return {"model_final": trained_cfg}
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = ( checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get( "iteration", -1) + 1 #FIXME: does not continue from iteration # when resume=True ) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # init best monitor metric best_monitor_metric = None # init early stopping count es_count = 0 # get train data loader data_loader = build_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): storage.step() _, losses, losses_reduced = get_loss(data, model) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1): results = do_test(cfg, model) storage.put_scalars(**results['metrics']) if cfg.EARLY_STOPPING.ENABLE: curr = None if cfg.EARLY_STOPPING.MONITOR in results['metrics'].keys(): curr = results['metrics'][cfg.EARLY_STOPPING.MONITOR] if curr is None: logger.warning( "Early stopping enabled but cannot find metric: %s" % cfg.EARLY_STOPPING.MONITOR) logger.warning( "Options for monitored metrics are: [%s]" % ", ".join(map(str, results['metrics'].keys()))) elif best_monitor_metric is None: best_monitor_metric = curr elif get_es_result(cfg.EARLY_STOPPING.MODE, curr, best_monitor_metric): best_monitor_metric = curr es_count = 0 logger.info("Best metric %s improved to %0.4f" % (cfg.EARLY_STOPPING.MONITOR, curr)) # update best model periodic_checkpointer.save(name="model_best", **{**results['metrics']}) # save best metrics to a .txt file with open( os.path.join(cfg.OUTPUT_DIR, 'best_metrics.txt'), 'w') as f: json.dump(results['metrics'], f) else: logger.info( "Early stopping metric %s did not improve, current %.04f, best %.04f" % (cfg.EARLY_STOPPING.MONITOR, curr, best_monitor_metric)) es_count += 1 storage.put_scalar('val_loss', results['metrics']['val_loss']) comm.synchronize() if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0 or iteration == max_iter - 1): for writer in writers: writer.write() periodic_checkpointer.step(iteration) if es_count >= cfg.EARLY_STOPPING.PATIENCE: logger.info( "Early stopping triggered, metric %s has not improved for %s validation steps" % (cfg.EARLY_STOPPING.MONITOR, cfg.EARLY_STOPPING.PATIENCE)) break
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler ) start_iter = ( checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume, ).get("iteration", -1) + 1 ) if cfg.SOLVER.RESET_ITER: logger.info('Reset loaded iteration. Start training from iteration 0.') start_iter = 0 max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer( checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter ) writers = ( [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else [] ) if cfg.MULTI_DATASET.ENABLED: data_loader = build_multi_dataset_train_loader(cfg) dataset_count = {k: torch.tensor(0).to(comm.get_local_rank()) for k in cfg.MULTI_DATASET.DATASETS} else: data_loader = build_custom_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: step_timer = Timer() data_timer = Timer() start_time = time.perf_counter() for data, iteration in zip(data_loader, range(start_iter, max_iter)): data_time = data_timer.seconds() storage.put_scalars(data_time=data_time) step_timer.reset() iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum( loss for k, loss in loss_dict.items()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = {k: v.item() \ for k, v in comm.reduce_dict(loss_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars( total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar( "lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) if cfg.MULTI_DATASET.ENABLED: for b in data: dataset_count[cfg.MULTI_DATASET.DATASETS[b['dataset_source']]] += 1 dataset_count_reduced = {k: v for k, v in \ comm.reduce_dict(dataset_count).items()} if comm.is_main_process(): storage.put_scalars(**dataset_count_reduced) step_time = step_timer.seconds() storage.put_scalars(time=step_time) data_timer.reset() scheduler.step() if ( cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter ): do_test(cfg, model) comm.synchronize() if iteration - start_iter > 5 and \ (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration) total_time = time.perf_counter() - start_time logger.info( "Total training time: {}".format( str(datetime.timedelta(seconds=int(total_time)))))
def do_train(cfg, model, resume=False, patience=20): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) scheduler2 = ReduceLROnPlateau(optimizer, mode="max") # warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=200) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement in a small training loop data_loader = build_detection_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) best_ap50 = 0 best_iteration = 0 patience_counter = 0 with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): storage.step() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() # warmup_scheduler.dampen(iteration) if (cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1): test_results = do_test(cfg, model) # scheduler2.step(test_results["bbox"]["AP50"]) # early stopping. # save checkpoint to disk checkpointer.save(f"model_{iteration}") # TODO: restore from best model if test_results["bbox"]["AP50"] > best_ap50: best_ap50 = test_results["bbox"]["AP50"] best_iteration = iteration # reset patience counter patience_counter = 0 logger.info(f"Patience counter reset.") else: patience_counter += 1 logger.info( f"Patience counter increased to {patience_counter}, will be terminated at {patience}" ) if patience_counter > patience: for writer in writers: writer.write() # restore to best checkpoint checkpointer.load( f"{cfg.OUTPUT_DIR}/model_{best_iteration}.pth") break # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0 or iteration == max_iter - 1): for writer in writers: writer.write() # periodic_checkpointer.step(iteration) checkpointer.save(f"model_final")
def do_train(cfg, model, resume=False, evaluate=False): """ training loop. """ # Build optimizer and scheduler from configuration and model model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) # Build checkpointers checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) # Build writers writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # Build dataloader data_loader = build_classification_train_loader(cfg) # training loop validation_losses = [] logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: start = time.perf_counter() for data, iteration in zip(data_loader, range(start_iter, max_iter)): data_time = time.perf_counter() - start iteration = iteration + 1 storage.step() loss_dict = model(data) # compute losses losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalar("data_time", data_time) storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) # backward optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() #validation if ((cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0) or (iteration == max_iter)): # evaluate on the validation dataset res = do_test(cfg, model, evaluate=evaluate) validation = {} for k, v in res.items(): print(v, flush=True) validation[k] = v['loss_cls'] storage.put_scalars( **validation ) # dump also validation loss into Tensorboard validation['iteration'] = iteration validation_losses.append(validation) # logging/checkpoint if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration) #Try to get an accurate measuremetn of time start = time.perf_counter() # save validations metrics if evaluate: print(validation_losses, flush=True) file_path = os.path.join(cfg.OUTPUT_DIR, "validations_losses.pth") with PathManager.open(file_path, "wb") as f: torch.save(validation_losses, f)
def do_train(cfg, model, resume=False): """ # TODO: Write docstring """ # Set the model to train model.train() # Create torch optimiser & schedulars optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) # Create a torch checkpointer checkpointer = DetectionCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler ) # Create starting checkpoint i.e. pre-trained model using weights from config start_iter = ( checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 ) # Define the number of iterations max_iter = cfg.SOLVER.MAX_ITER # Create a periodic checkpointer at the configured period periodic_checkpointer = PeriodicCheckpointer( checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter ) # Export checkpoint data to terminal, JSON & tensorboard files writers = ( [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else [] ) # Create a data loader to supply the model with training data data_loader = build_detection_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() # If eval period has been set, run test at defined interval if ( cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter ): do_test(cfg, model) comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): logger.debug('Logging iteration and loss to Weights & Biases') wandb.log({"iteration": iteration}) wandb.log({"total_loss": losses_reduced}) wandb.log(loss_dict_reduced) for writer in writers: writer.write() periodic_checkpointer.step(iteration)
def do_train(cfg, model, resume=False): # Set model to training mode model.train() # Create optimizer from config file (returns a torch.optim.Optimizer) optimizer = build_optimizer(cfg, model) # Create learning-rate scheduler (returns a torch.optim.lr_scheduler._LRScheduler) scheduler = build_lr_scheduler(cfg, optimizer) print(f"Scheduler: {scheduler}") # Create checkpointer checkpointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) # Create start iteration (references checkpointer) - https://detectron2.readthedocs.io/modules/checkpoint.html#detectron2.checkpoint.Checkpointer.resume_or_load start_iter = ( # This can be 0 checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, # Use predefined model weights (pretrained model) resume=resume).get("iteration", -1) + 1) # Set max number of iterations max_iter = cfg.SOLVER.MAX_ITER # Create periodic checkpointer periodic_checkpointer = PeriodicCheckpointer( checkpointer=checkpointer, # How often to make checkpoints? period=cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) # Create writers for logging metrics (terminal, JSON & TensorBoard) writers = ([ # Print out common metrics such as iteration time, ETA, memory, all losses, learning rate CommonMetricPrinter(max_iter=max_iter), # Write scalars to a JSON file such as loss values, time and more JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), # Write all scalars such as loss values to a TensorBoard file for easy visualization TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) ### Original note from script: ### # compared to "train_net.py", we do not support accurate timing and precise BN # here, because they are not trivial to implement # Build a training data loader based off the training dataset name in the config data_loader = build_detection_train_loader(cfg) # Start logging logger.info("Starting training from iteration {}".format(start_iter)) # Store events with EventStorage(start_iter) as storage: # Loop through zipped data loader and iteration for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() # advance the storage cursor to this iteration - https://detectron2.readthedocs.io/modules/utils.html#detectron2.utils.events.EventStorage.step # Forward pass: in training mode the model returns a dict of losses loss_dict = model(data) losses = sum(loss_dict.values()) # Abort if any loss is non-finite (NaN/Inf) - something is wrong assert torch.isfinite(losses).all(), loss_dict # Average the loss dict across all workers so distributed logging is consistent loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } # Sum up losses losses_reduced = sum(loss for loss in loss_dict_reduced.values()) # # TODO: wandb.log()? log the losses # wandb.log({ # "Total loss": losses_reduced # }) # Update storage if comm.is_main_process(): # Store the reduced losses in storage - https://detectron2.readthedocs.io/modules/utils.html#detectron2.utils.events.EventStorage.put_scalars storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) # Standard PyTorch optimisation step optimizer.zero_grad() losses.backward() optimizer.step() # Add learning rate to storage information storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) # Required for the learning rate schedule to advance (without it the learning rate never changes) scheduler.step() # If an eval period is set, run evaluation at the configured interval if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) # TODO - compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() # Log different metrics with writers if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() # Update the periodic_checkpointer periodic_checkpointer.step(iteration)
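do_test() is called throughout these variants but never shown; a minimal hedged sketch of such a helper built from Detectron2's standard evaluation utilities (the COCOEvaluator choice is an assumption; the originals may use a different evaluator):

# Minimal sketch (assumption): a do_test() helper using Detectron2's built-in
# evaluation utilities; the original snippets may implement it differently.
import os
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator, inference_on_dataset

def do_test(cfg, model):
    results = {}
    for dataset_name in cfg.DATASETS.TEST:
        data_loader = build_detection_test_loader(cfg, dataset_name)
        evaluator = COCOEvaluator(dataset_name,
                                  output_dir=os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name))
        results[dataset_name] = inference_on_dataset(model, data_loader, evaluator)
    return results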
def do_train(cfg, model, resume=False, val_set='firevysor_val'): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, min_lr=1e-6) # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1, last_epoch=-1) metric = 0 print_every = 50 tensorboard_dir = osp.join(cfg.OUTPUT_DIR, 'tensorboard') checkpoint_dir = osp.join(cfg.OUTPUT_DIR, 'checkpoints') create_dir(tensorboard_dir) create_dir(checkpoint_dir) checkpointer = AdetCheckpointer(model, checkpoint_dir, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), # JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(tensorboard_dir), ] if comm.is_main_process() else []) data_loader = build_detection_train_loader(cfg) val_dataloader = build_detection_val_loader(cfg, val_set) logger.info("Starting training from iteration {}".format(start_iter)) # [PHAT]: Create a log file log_file = open(cfg.MY_CUSTOM.LOG_FILE, 'w') best_loss = 1e6 count_not_improve = 0 train_size = 2177 # hard-coded number of training images, used to derive the epoch length epoch_size = int(train_size / cfg.SOLVER.IMS_PER_BATCH) n_early_epoch = 10 with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss for loss in loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict # Reduce the loss dict across workers for logging loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) # Early stopping: validate once per epoch if (iteration > start_iter) and ((iteration - start_iter) % epoch_size == 0): val_loss = do_val(cfg, model, val_dataloader) if val_loss >= best_loss: count_not_improve += 1 # stop if the model doesn't improve for <n_early_epoch> consecutive epochs if count_not_improve == n_early_epoch: break else: count_not_improve = 0 best_loss = val_loss periodic_checkpointer.save("best_model_early") # print(f"epoch {iteration//epoch_size}, val_loss: {val_loss}") log_file.write( f"Epoch {(iteration-start_iter)//epoch_size}, val_loss: {val_loss}\n" ) comm.synchronize() optimizer.zero_grad() losses.backward() optimizer.step() lr = optimizer.param_groups[0]["lr"] storage.put_scalar("lr", lr, smoothing_hint=False) scheduler.step() if iteration - start_iter > 5 and ( (iteration - start_iter) % print_every == 0 or iteration == max_iter): for writer in writers: writer.write() # Write my log log_file.write( f"[iter {iteration}, best_loss: {best_loss}] total_loss: {losses}, lr: {lr}\n" ) periodic_checkpointer.step(iteration) log_file.close()
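do_val() and build_detection_val_loader() are project helpers that are not shown; a hedged sketch of a validation-loss helper, assuming it simply averages the model's training-mode losses over a validation loader:

# Minimal sketch (assumption): average the model's training-mode losses over a
# validation loader; the original helper may differ. cfg is accepted only to
# match the call site above and is unused here.
import torch

def do_val(cfg, model, val_dataloader, max_batches=None):
    total, n = 0.0, 0
    with torch.no_grad():
        for i, inputs in enumerate(val_dataloader):
            if max_batches is not None and i >= max_batches:
                break
            loss_dict = model(inputs)  # model must be in train mode to return losses
            total += sum(loss_dict.values()).item()
            n += 1
    return total / max(n, 1)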
def do_train(cfg, model, cat_heatmap_file, resume=False): model.train() # select optimizer and learning rate scheduler based on the config optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) # create checkpointer checkpointer = DetectionCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler ) start_iter = ( checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 ) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer( checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter ) # create output writers. Separate TensorBoard writers are created # for train and validation sets. This allows easy overlaying of graphs # in TensorBoard. train_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'train') val_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'val') train_writers = ( [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(train_tb_writer), ] if comm.is_main_process() else [] ) val_writers = [TensorboardXWriter(val_tb_writer)] train_dataset_name = cfg.DATASETS.TRAIN[0] train_data_loader = build_detection_train_loader(cfg) train_eval_data_loader = build_detection_test_loader(cfg, train_dataset_name) val_dataset_name = cfg.DATASETS.TEST[0] val_eval_data_loader = build_detection_test_loader(cfg, val_dataset_name, DatasetMapper(cfg, True)) logger.info("Starting training from iteration {}".format(start_iter)) train_storage = EventStorage(start_iter) val_storage = EventStorage(start_iter) # Create the training and validation evaluator objects. train_evaluator = get_evaluator( cfg, train_dataset_name, os.path.join(cfg.OUTPUT_DIR, "train_inference", train_dataset_name), cat_heatmap_file ) val_evaluator = get_evaluator( cfg, val_dataset_name, os.path.join(cfg.OUTPUT_DIR, "val_inference", val_dataset_name), cat_heatmap_file ) # initialize the best AP50 value best_AP50 = 0 start_time = time.time() for train_data, iteration in zip(train_data_loader, range(start_iter, max_iter)): # stop if the file stop_running exists in the running directory if os.path.isfile('stop_running'): os.remove('stop_running') break iteration = iteration + 1 # run a step with the training data with train_storage as storage: model.train() storage.step() loss_dict = model(train_data) losses = sum(loss for loss in loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() # periodically evaluate the training set and write the results if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): train_eval_results = inference_on_dataset(model, train_eval_data_loader, train_evaluator) flat_results = flatten_results(train_eval_results) storage.put_scalars(**flat_results) comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in train_writers: writer.write() periodic_checkpointer.step(iteration) # run a step with the validation set with val_storage as storage: storage.step() # every 20 iterations evaluate the dataset to collect the loss if iteration % 20 == 0 or iteration == max_iter: with torch.set_grad_enabled(False): for input, i in zip(val_eval_data_loader, range(1)): loss_dict = model(input) losses = sum(loss for loss in loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) # periodically evaluate the validation set and write the results # check the results against the best results seen and save the parameters for # the best result if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 or iteration == max_iter): val_eval_results = inference_on_dataset(model, val_eval_data_loader, val_evaluator) logger.info('val_eval_results {}'.format(val_eval_results)) results = val_eval_results.get('segm', None) if results is None: results = val_eval_results.get('bbox', None) if results is not None and results.get('AP50', -1) > best_AP50: best_AP50 = results['AP50'] logger.info('saving best results ({}), iter {}'.format(best_AP50, iteration)) checkpointer.save("best_AP50") flat_results = flatten_results(val_eval_results) storage.put_scalars(**flat_results) comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0): for writer in val_writers: writer.write() elapsed = time.time() - start_time time_per_iter = elapsed / (iteration - start_iter) time_left = time_per_iter * (max_iter - iteration) logger.info("ETA: {}".format(str(datetime.timedelta(seconds=time_left))))
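flatten_results() is referenced above but not defined in these snippets; a hedged sketch of a helper that flattens the nested dict returned by inference_on_dataset into scalar names suitable for storage.put_scalars, assuming results are nested one level (task -> metric):

# Minimal sketch (assumption): flatten a nested evaluation-results dict, e.g.
# {"bbox": {"AP": 40.1, "AP50": 60.2}} -> {"bbox/AP": 40.1, "bbox/AP50": 60.2},
# so it can be passed to storage.put_scalars(). The original helper may differ.
def flatten_results(results, sep="/"):
    flat = {}
    for task, metrics in results.items():
        if isinstance(metrics, dict):
            for name, value in metrics.items():
                flat[f"{task}{sep}{name}"] = value
        else:
            flat[task] = metrics
    return flat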
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) #dataset|mapper|augs|sampler are done during building data_loader atoms = generate_atom_list(cfg, True) black_magic_mapper = BlackMagicMapper(cfg, is_train=True, augmentations=atoms) data_loader = build_detection_train_loader(cfg, black_magic_mapper) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): if cfg.DATALOADER.SAVE_BLACK_MAGIC_PATH != "": save_data_to_disk(cfg, data) iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration)
def do_train(cfg, args, myargs): run_func = cfg.start.get('run_func', 'train_func') dataset_name = cfg.start.dataset_name IMS_PER_BATCH = cfg.start.IMS_PER_BATCH max_epoch = cfg.start.max_epoch ASPECT_RATIO_GROUPING = cfg.start.ASPECT_RATIO_GROUPING NUM_WORKERS = cfg.start.NUM_WORKERS checkpoint_period = cfg.start.checkpoint_period dataset_mapper = cfg.start.dataset_mapper resume_ckpt_dir = get_attr_kwargs(cfg.start, 'resume_ckpt_dir', default=None) resume_ckpt_epoch = get_attr_kwargs(cfg.start, 'resume_ckpt_epoch', default=0) resume_ckpt_iter_every_epoch = get_attr_kwargs( cfg.start, 'resume_ckpt_iter_every_epoch', default=0) cfg.defrost() cfg.DATASETS.TRAIN = (dataset_name, ) cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH cfg.DATALOADER.ASPECT_RATIO_GROUPING = ASPECT_RATIO_GROUPING cfg.DATALOADER.NUM_WORKERS = NUM_WORKERS cfg.freeze() # build dataset mapper = build_dataset_mapper(dataset_mapper) data_loader = build_detection_train_loader(cfg, mapper=mapper) metadata = MetadataCatalog.get(dataset_name) num_images = metadata.get('num_images') iter_every_epoch = num_images // IMS_PER_BATCH max_iter = iter_every_epoch * max_epoch model = build_trainer(cfg, myargs=myargs, iter_every_epoch=iter_every_epoch, img_size=dataset_mapper.img_size, dataset_name=dataset_name, train_bs=IMS_PER_BATCH, max_iter=max_iter) model.train() # optimizer = build_optimizer(cfg, model) optims_dict = model.build_optimizer() # scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model.get_saved_model(), cfg.OUTPUT_DIR, **optims_dict) if args.resume: resume_ckpt_dir = model._get_ckpt_path( ckpt_dir=resume_ckpt_dir, ckpt_epoch=resume_ckpt_epoch, iter_every_epoch=resume_ckpt_iter_every_epoch) start_iter = ( checkpointer.resume_or_load(resume_ckpt_dir).get("iteration", -1) + 1) if get_attr_kwargs(args, 'finetune', default=False): start_iter = 0 else: start_iter = 0 model.after_resume() if run_func != 'train_func': eval(f'model.{run_func}()') exit(0) checkpoint_period = eval(checkpoint_period, dict(iter_every_epoch=iter_every_epoch)) periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: pbar = zip(data_loader, range(start_iter, max_iter)) if comm.is_main_process(): pbar = tqdm.tqdm( pbar, desc=f'do_train, {myargs.args.time_str_suffix}, ' f'iters {iter_every_epoch} * bs {IMS_PER_BATCH} = imgs {iter_every_epoch*IMS_PER_BATCH}', file=myargs.stdout, initial=start_iter, total=max_iter) for data, iteration in pbar: comm.synchronize() iteration = iteration + 1 storage.step() model.train_func(data, iteration - 1, pbar=pbar) periodic_checkpointer.step(iteration) pass comm.synchronize()
def do_train(cfg, model, resume=False): # set the model to training mode model.train() # build the optimizer optimizer = build_optimizer(cfg, model) # build the learning-rate scheduler scheduler = build_lr_scheduler(cfg, optimizer) # checkpoint manager checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) # starting iteration, used to resume training start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) # maximum number of iterations max_iter = cfg.SOLVER.MAX_ITER # PeriodicCheckpointer comes from fvcore.common.checkpoint and saves/loads the model at the configured checkpoint period periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), # prints loss information to the terminal JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # build the batched training data loader data_loader = build_detection_train_loader(cfg) # build test data loaders used to compute test losses test_data_loaders = [] for dataset_name in cfg.DATASETS.TEST: test_data_loaders.append({ "name": dataset_name, "data_loader": build_detection_test_loader(cfg, dataset_name, DatasetMapper(cfg, True)) }) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 # called at the start of each iteration to advance the storage cursor storage.step() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): # store this iteration's forward-pass losses in the storage histories (storage.histories()), which are later read when printing to the terminal storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) # backward pass optimizer.zero_grad() losses.backward() optimizer.step() # store this iteration's learning rate in storage storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() # if iteration % 21 == 0: # do_loss_eval(cfg, storage, model, test_data_loaders) # for writer in writers: # writer.write() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): do_loss_eval(cfg, storage, model, test_data_loaders) for writer in writers: writer.write() periodic_checkpointer.step(iteration)
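do_loss_eval() is called above but not defined; a hedged sketch of a helper that computes training-mode losses on the test loaders and records them in the given EventStorage, assuming each entry in test_data_loaders is the {"name", "data_loader"} dict built earlier:

# Minimal sketch (assumption): compute losses on the test loaders and log them
# into the given EventStorage with a "val_" prefix; the original helper may differ.
# cfg is accepted only to match the call site above and is unused here.
import torch
from detectron2.utils import comm

def do_loss_eval(cfg, storage, model, test_data_loaders, max_batches=20):
    with torch.no_grad():
        for entry in test_data_loaders:
            name, loader = entry["name"], entry["data_loader"]
            totals, n = {}, 0
            for i, inputs in enumerate(loader):
                if i >= max_batches:
                    break
                loss_dict = comm.reduce_dict(model(inputs))
                for k, v in loss_dict.items():
                    totals[k] = totals.get(k, 0.0) + v.item()
                n += 1
            if comm.is_main_process() and n > 0:
                storage.put_scalars(**{f"val_{name}_{k}": v / n for k, v in totals.items()})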
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, 'metrics.json')), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) data_loader = build_detection_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss for loss in loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration)
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume, ).get("iteration", -1) + 1) if cfg.SOLVER.RESET_ITER: logger.info('Reset loaded iteration. Start training from iteration 0.') start_iter = 0 max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) mapper = DatasetMapper(cfg, True) if cfg.INPUT.CUSTOM_AUG == '' else \ DatasetMapper(cfg, True, augmentations=build_custom_augmentation(cfg, True)) if cfg.DATALOADER.SAMPLER_TRAIN in [ 'TrainingSampler', 'RepeatFactorTrainingSampler' ]: data_loader = build_detection_train_loader(cfg, mapper=mapper) else: from centernet.data.custom_dataset_dataloader import build_custom_train_loader data_loader = build_custom_train_loader(cfg, mapper=mapper) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: step_timer = Timer() data_timer = Timer() start_time = time.perf_counter() for data, iteration in zip(data_loader, range(start_iter, max_iter)): data_time = data_timer.seconds() storage.put_scalars(data_time=data_time) step_timer.reset() iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss for k, loss in loss_dict.items()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = {k: v.item() \ for k, v in comm.reduce_dict(loss_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) step_time = step_timer.seconds() storage.put_scalars(time=step_time) data_timer.reset() scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) comm.synchronize() if iteration - start_iter > 5 and \ (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration) total_time = time.perf_counter() - start_time logger.info("Total training time: {}".format( str(datetime.timedelta(seconds=int(total_time)))))
def do_train(cfg, args, model, resume=False): # default batch size is 16 model.train() # build the optimizer from the config optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) max_iter = cfg.SOLVER.MAX_ITER start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] #if comm.is_main_process() #else [] ) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement in a small training loop # build the training data loader from the config data_loader = build_detection_train_loader(cfg) #logger.info("Starting training from iteration {}".format(start_iter)) iters = 0 iter_cnt = 0 iter_sample_start = 1 iter_sample_end = 20 iter_end = 300 start_time, end_time = 0, 0 sample_iters = iter_sample_end - iter_sample_start + 1 if args.scheduler: if args.scheduler_baseline: grc.memory.clean() grc.compressor.clean() grc.memory.partition() else: from mergeComp_dl.torch.scheduler.scheduler import Scheduler Scheduler(grc, memory_partition, args) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iters += 1 iter_cnt += 1 if iters == iter_end: break if hvd.local_rank() == 0 and iter_cnt == iter_sample_start: torch.cuda.synchronize() start_time = time_() storage.iter = iteration #torch.cuda.synchronize() #iter_start_time = time_() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict #torch.cuda.synchronize() #iter_model_time = time_() #loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} #losses_reduced = sum(loss for loss in loss_dict_reduced.values()) #if comm.is_main_process(): # storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) #print("loss dict:", loss_dict, "losses:", losses, "reduced loss dict:", loss_dict_reduced, "reduced losses:", losses_reduced) losses.backward() #torch.cuda.synchronize() #iter_backward_time = time_() optimizer.step() optimizer.zero_grad() #torch.cuda.synchronize() #print("Iteration: {}\tmodel time: {:.3f} \tbackward time: {:.3f}\tFP+BP Time: {:.3f}\tstep time: {:.3f}\tData size: {}".format( # iteration, # (iter_model_time - iter_start_time), # (iter_backward_time - iter_model_time), # (iter_backward_time - iter_start_time), # time_() - iter_start_time, # len(data))) storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if args.compress: grc.memory.update_lr(optimizer.param_groups[0]['lr']) if hvd.local_rank() == 0 and iter_cnt == iter_sample_end: torch.cuda.synchronize() end_time = time_() iter_cnt = 0 print( "Iterations: {}\tTime: {:.3f} s\tTraining speed: {:.3f} iters/s" .format(sample_iters, end_time - start_time, sample_iters / (end_time - start_time))) if (cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1): do_test(cfg, model)
def do_train(cfg1, model1, model2, resume=False): model1.train() optimizer = build_optimizer(cfg1, model1) scheduler = build_lr_scheduler(cfg1, optimizer) checkpointer = DetectionCheckpointer(model1, cfg1.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg1.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg1.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg1.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg1.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg1.OUTPUT_DIR), ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement data_loader = custom_train_loader(cfg1) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() height = data[0]['image'].shape[1] width = data[0]['image'].shape[2] second_stream_outputs = inference_second_stream( model2, data, height, width) loss_dict = model1(data, second_stream_outputs) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg1.TEST.EVAL_PERIOD > 0 and iteration % cfg1.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg1, model1, model2) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration)
def start_train(al_cfg, cfg, model, resume=False): early_stopping = EarlyStopping(patience=al_cfg.EARLY_STOP.PATIENCE, delta=al_cfg.EARLY_STOP.DELTA, verbose=True) model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement data_loader = build_detection_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss for loss in loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): results = do_test(cfg, model) bbox_results = results['bbox'] AP = bbox_results['AP'] comm.synchronize() print('AP:', AP, '\tValue:', 1 - (AP / 100)) early_stopping(1 - (AP / 100)) storage.put_scalars(**bbox_results) if early_stopping.counter < 1: checkpointer.save('model_final') if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration) if early_stopping.early_stop: print("EARLY STOPPING INITIATED AT ITERATION:", iteration) # checkpointer.save('model_final') break
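EarlyStopping is used above but not defined in these snippets; a hedged sketch consistent with the constructor and attributes the caller relies on (patience, delta, verbose, counter, early_stop), assuming lower scores are better since the caller passes 1 - AP/100:

# Minimal sketch (assumption): an EarlyStopping helper matching the usage above,
# where lower scores are better; the original implementation may differ.
class EarlyStopping:
    def __init__(self, patience=5, delta=0.0, verbose=False):
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score):
        if self.best_score is None or score < self.best_score - self.delta:
            self.best_score = score  # improvement: reset the patience counter
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping: no improvement for {self.counter}/{self.patience} checks")
            if self.counter >= self.patience:
                self.early_stop = True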
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement data_loader = build_detection_train_loader(cfg) logger.info("Starting training from iteration {}".format(start_iter)) forward_pass_end_time = time.perf_counter() with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration_start_time = time.perf_counter() if comm.get_rank() == 0: print("Approx backwards pass duration: ", iteration_start_time - forward_pass_end_time) iteration = iteration + 1 storage.step() if iteration == 500: print("Iteration 500. Profiling!") with torch.autograd.profiler.profile( use_cuda=True, record_shapes=True) as prof: loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum( loss for loss in loss_dict_reduced.values()) print(prof.key_averages().table(sort_by="self_cpu_time_total")) prof.export_chrome_trace("/root/trace.json") else: loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) forward_pass_end_time = time.perf_counter() optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration)
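The profiling variant above uses the legacy torch.autograd.profiler API; a roughly equivalent hedged sketch using the newer torch.profiler interface (PyTorch 1.8+), shown only as an alternative, not as the original author's code:

# Sketch (assumption): one-off profiling of a training step with torch.profiler
# instead of torch.autograd.profiler.
import torch
from torch.profiler import profile, ProfilerActivity

def profile_one_step(model, data, optimizer):
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True) as prof:
        loss_dict = model(data)
        losses = sum(loss_dict.values())
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20))
    prof.export_chrome_trace("trace.json")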
def do_train(cfg, model, resume=False): model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) checkpointer_spot = DetectionCheckpointer(model, '/opt/ml/checkpoints', optimizer=optimizer, scheduler=scheduler) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) max_iter = cfg.SOLVER.MAX_ITER periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) periodic_checkpointer_spot = PeriodicCheckpointer( checkpointer_spot, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) writers = ([ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else []) # compared to "train_net.py", we do not support accurate timing and # precise BN here, because they are not trivial to implement # data_loader = build_detection_train_loader(cfg) data_loader = build_detection_train_loader( cfg, # mapper=DatasetMapper(cfg, is_train=True # , augmentations=[ # T.Resize((1024, 1024)), # T.RandomBrightness(.75,1.25), # T.RandomFlip(), # T.RandomSaturation(.75,1.25) # ] ) # ) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, iteration in zip(data_loader, range(start_iter, max_iter)): iteration = iteration + 1 storage.step() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = { k: v.item() for k, v in comm.reduce_dict(loss_dict).items() } losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration % 500 == 0: try: torch.save(model.state_dict(), f'{cfg.OUTPUT_DIR}/model_{iteration}.pth') except: print('save failed') if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): for writer in writers: writer.write() periodic_checkpointer.step(iteration) periodic_checkpointer_spot.step(iteration)
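The commented-out mapper in the variant above can be re-enabled along the following lines; a hedged sketch, assuming a recent Detectron2 where DatasetMapper accepts an explicit augmentations list:

# Sketch (assumption): training loader using the custom augmentations commented
# out above; assumes detectron2's DatasetMapper accepts an augmentations override.
from detectron2.data import DatasetMapper, build_detection_train_loader
from detectron2.data import transforms as T

def build_augmented_train_loader(cfg):
    mapper = DatasetMapper(cfg, is_train=True, augmentations=[
        T.Resize((1024, 1024)),
        T.RandomBrightness(0.75, 1.25),
        T.RandomFlip(),
        T.RandomSaturation(0.75, 1.25),
    ])
    return build_detection_train_loader(cfg, mapper=mapper)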
def train(cfg, args, myargs): dataset_name = cfg.start.dataset_name IMS_PER_BATCH = cfg.start.IMS_PER_BATCH max_epoch = cfg.start.max_epoch ASPECT_RATIO_GROUPING = cfg.start.ASPECT_RATIO_GROUPING NUM_WORKERS = cfg.start.NUM_WORKERS checkpoint_period = cfg.start.checkpoint_period cfg.defrost() cfg.DATASETS.TRAIN = (dataset_name, ) cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH cfg.DATALOADER.ASPECT_RATIO_GROUPING = ASPECT_RATIO_GROUPING cfg.DATALOADER.NUM_WORKERS = NUM_WORKERS cfg.freeze() # build dataset mapper = build_dataset_mapper(cfg) data_loader = build_detection_train_loader(cfg, mapper=mapper) metadata = MetadataCatalog.get(dataset_name) num_images = metadata.get('num_images') iter_every_epoch = num_images // IMS_PER_BATCH max_iter = iter_every_epoch * max_epoch model = build_trainer(cfg, myargs=myargs, iter_every_epoch=iter_every_epoch) model.train() logger.info("Model:\n{}".format(model)) # optimizer = build_optimizer(cfg, model) optims_dict = model.build_optimizer() # scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer(model.get_saved_model(), cfg.OUTPUT_DIR, **optims_dict) start_iter = (checkpointer.resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume).get("iteration", -1) + 1) checkpoint_period = eval(checkpoint_period, dict(iter_every_epoch=iter_every_epoch)) periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter) logger.info("Starting training from iteration {}".format(start_iter)) modelarts_utils.modelarts_sync_results(args=myargs.args, myargs=myargs, join=True, end=False) with EventStorage(start_iter) as storage: pbar = zip(data_loader, range(start_iter, max_iter)) if comm.is_main_process(): pbar = tqdm.tqdm( pbar, desc=f'train, {myargs.args.time_str_suffix}, ' f'iters {iter_every_epoch} * bs {IMS_PER_BATCH} = imgs {iter_every_epoch*IMS_PER_BATCH}', file=myargs.stdout, initial=start_iter, total=max_iter) for data, iteration in pbar: comm.synchronize() iteration = iteration + 1 storage.step() model.train_func(data, iteration - 1, pbar=pbar) periodic_checkpointer.step(iteration) pass modelarts_utils.modelarts_sync_results(args=myargs.args, myargs=myargs, join=True, end=True) comm.synchronize()