def train_model(dataset):
    # Export the dataset to COCO format
    export_file, image_dir = export_dataset(dataset)

    # Register it as a COCO dataset in the Detectron2 framework
    try:
        register_coco_instances('my_dataset', {}, export_file, image_dir)
    except Exception:
        print('Dataset was already registered')
    dataset_dicts = load_coco_json(export_file, image_dir)
    MetadataCatalog.get('my_dataset').set(
        thing_classes=[c['name'] for c in dataset.categories])
    segments_metadata = MetadataCatalog.get('my_dataset')
    print(segments_metadata)

    # Configure the training run
    cfg = get_cfg()
    cfg.merge_from_file(
        model_zoo.get_config_file(
            'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml'))
    cfg.DATASETS.TRAIN = ('my_dataset', )
    cfg.DATASETS.TEST = ()
    cfg.INPUT.MASK_FORMAT = 'bitmask'
    cfg.DATALOADER.NUM_WORKERS = 2
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
        'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml'
    )  # Let training initialize from the model zoo
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
    cfg.SOLVER.MAX_ITER = 6000  # adjust up or down depending on the size of your dataset
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512  # default: 512
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(
        dataset.categories)  # number of categories
    # cfg.MODEL.DEVICE = 'cuda'
    print('Max iter is ', cfg.SOLVER.MAX_ITER)

    # Start the training
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()

    # Return the model
    cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, 'model_final.pth')
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # set the testing threshold for this model
    cfg.DATASETS.TEST = ('my_dataset', )
    cfg.TEST.DETECTIONS_PER_IMAGE = 1000
    built_model = build_model(cfg)  # returns a torch.nn.Module
    DetectionCheckpointer(built_model).load(
        cfg.MODEL.WEIGHTS)  # load the trained weights
    checkpointer = DetectionCheckpointer(
        built_model, save_dir="/content/gdrive/My Drive/Colab Notebooks")
    checkpointer.save("model_final")  # saves model_final.pth under save_dir
    predictor = DefaultPredictor(cfg)
    model = Model(predictor)
    return model
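
# A minimal inference sketch for the model returned by train_model(). It assumes
# the Model wrapper exposes the underlying DefaultPredictor as `model.predictor`
# and that "example.jpg" exists; both are assumptions, not part of the code above.
import cv2
from detectron2.data import MetadataCatalog
from detectron2.utils.visualizer import Visualizer

model = train_model(dataset)
image = cv2.imread("example.jpg")     # BGR image, as DefaultPredictor expects
outputs = model.predictor(image)      # {"instances": Instances}
viz = Visualizer(image[:, :, ::-1], MetadataCatalog.get("my_dataset"))
result = viz.draw_instance_predictions(outputs["instances"].to("cpu"))
cv2.imwrite("prediction.jpg", result.get_image()[:, :, ::-1])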
def save(self, dst):
    try:
        from detectron2.checkpoint import (
            DetectionCheckpointer,
        )  # noqa # pylint: disable=unused-import
        from detectron2.config import get_cfg
    except ImportError:
        raise MissingDependencyException(
            "Detectron package is required to use DetectronArtifact")

    os.makedirs(dst, exist_ok=True)
    checkpointer = DetectionCheckpointer(self._model, save_dir=dst)
    checkpointer.save(self._file_name)
    cfg = get_cfg()
    cfg.merge_from_file(self._input_model_yaml)
    with open(os.path.join(dst, f"{self._file_name}.yaml"),
              'w', encoding='utf-8') as output_file:
        output_file.write(cfg.dump())
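
# A minimal sketch (not part of the artifact class above) of loading the files that
# save() writes: rebuild the config from the dumped YAML, construct the model, and
# restore the weights with DetectionCheckpointer. The `dst` and `file_name` arguments
# are assumptions mirroring the save() method.
def load_detectron_artifact(dst, file_name="model"):
    import os
    from detectron2.checkpoint import DetectionCheckpointer
    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    cfg = get_cfg()
    cfg.merge_from_file(os.path.join(dst, f"{file_name}.yaml"))
    model = build_model(cfg)  # uninitialized model built from the config
    DetectionCheckpointer(model).load(os.path.join(dst, f"{file_name}.pth"))
    model.eval()
    return model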
def run_train():
    torch.multiprocessing.freeze_support()
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(
        'COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml'))
    # cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.75  # Threshold
    cfg.MODEL.WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl"
    cfg.MODEL.DEVICE = "cpu"  # cpu or cuda

    register_datasets()
    cfg.DATASETS.TRAIN = ('grini_nc_merged_bbox_only_train',)
    cfg.DATASETS.TEST = ('grini_nc_merged_bbox_only_val',)
    # cfg.MODEL.WEIGHTS = get_checkpoint_url('COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml')
    # todo: find out how to rescale images and annotations first...

    # Fixed parameters
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.SOLVER.BASE_LR = 0.001
    cfg.SOLVER.WARMUP_ITERS = 1000
    cfg.SOLVER.MAX_ITER = 1500  # adjust up if val mAP is still rising, down if overfitting
    cfg.SOLVER.STEPS = (1000, 1500)
    cfg.SOLVER.GAMMA = 0.05
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 12
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3
    cfg.TEST.EVAL_PERIOD = 500

    # makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="my_model")
    # DetectionCheckpointer(cfg).load(file_path_or_url)  # load a file, usually from cfg.MODEL.WEIGHTS
    checkpointer = DetectionCheckpointer(build_model(cfg), save_dir=cfg.OUTPUT_DIR)
    checkpointer.save("model_faster_rcnn_unscaled")  # saves output/model_faster_rcnn_unscaled.pth

    trainer = CocoTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()
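
# register_datasets() is called above but not shown here. A minimal sketch, assuming
# the annotations are COCO-format JSON files; all file paths below are hypothetical.
from detectron2.data.datasets import register_coco_instances

def register_datasets():
    register_coco_instances("grini_nc_merged_bbox_only_train", {},
                            "annotations/train.json", "images/train")
    register_coco_instances("grini_nc_merged_bbox_only_val", {},
                            "annotations/val.json", "images/val")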
def start_train(al_cfg, cfg, model, resume=False):
    early_stopping = EarlyStopping(patience=al_cfg.EARLY_STOP.PATIENCE,
                                   delta=al_cfg.EARLY_STOP.DELTA,
                                   verbose=True)
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                results = do_test(cfg, model)
                bbox_results = results['bbox']
                AP = bbox_results['AP']
                comm.synchronize()
                print('AP:', AP, '\tValue:', 1 - (AP / 100))
                early_stopping(1 - (AP / 100))
                storage.put_scalars(**bbox_results)
                if early_stopping.counter < 1:
                    checkpointer.save('model_final')

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            if early_stopping.early_stop:
                print("EARLY STOPPING INITIATED AT ITERATION:", iteration)
                # checkpointer.save('model_final')
                break
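
# EarlyStopping is used above but not defined here. A minimal sketch of the interface
# the training loop relies on (`counter` and `early_stop`); the patience/delta
# semantics below are assumptions.
class EarlyStopping:
    def __init__(self, patience=5, delta=0.0, verbose=False):
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score):
        # `score` is a value to minimize (above: 1 - AP/100).
        if self.best_score is None or score < self.best_score - self.delta:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True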
def do_train(cfg, model, cat_heatmap_file, resume=False):
    model.train()

    # select optimizer and learning rate scheduler based on the config
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # create checkpointer
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    # Create output writers. Separate TensorBoard writers are created for the
    # train and validation sets, which allows easy overlaying of graphs in TensorBoard.
    train_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'train')
    val_tb_writer = os.path.join(cfg.OUTPUT_DIR, 'val')
    train_writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(train_tb_writer),
        ]
        if comm.is_main_process()
        else []
    )
    val_writers = [TensorboardXWriter(val_tb_writer)]

    train_dataset_name = cfg.DATASETS.TRAIN[0]
    train_data_loader = build_detection_train_loader(cfg)
    train_eval_data_loader = build_detection_test_loader(cfg, train_dataset_name)
    val_dataset_name = cfg.DATASETS.TEST[0]
    val_eval_data_loader = build_detection_test_loader(cfg, val_dataset_name,
                                                       DatasetMapper(cfg, True))

    logger.info("Starting training from iteration {}".format(start_iter))
    train_storage = EventStorage(start_iter)
    val_storage = EventStorage(start_iter)

    # Create the training and validation evaluator objects.
    train_evaluator = get_evaluator(
        cfg, train_dataset_name,
        os.path.join(cfg.OUTPUT_DIR, "train_inference", train_dataset_name),
        cat_heatmap_file
    )
    val_evaluator = get_evaluator(
        cfg, val_dataset_name,
        os.path.join(cfg.OUTPUT_DIR, "val_inference", val_dataset_name),
        cat_heatmap_file
    )

    # initialize the best AP50 value
    best_AP50 = 0
    start_time = time.time()
    for train_data, iteration in zip(train_data_loader, range(start_iter, max_iter)):
        # stop if the file stop_running exists in the working directory
        if os.path.isfile('stop_running'):
            os.remove('stop_running')
            break
        iteration = iteration + 1

        # run a step with the training data
        with train_storage as storage:
            model.train()
            storage.step()
            loss_dict = model(train_data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # periodically evaluate the training set and write the results
            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                train_eval_results = inference_on_dataset(model, train_eval_data_loader,
                                                          train_evaluator)
                flat_results = flatten_results(train_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in train_writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

        # run a step with the validation set
        with val_storage as storage:
            storage.step()
            # every 20 iterations evaluate the dataset to collect the loss
            if iteration % 20 == 0 or iteration == max_iter:
                with torch.set_grad_enabled(False):
                    for input, i in zip(val_eval_data_loader, range(1)):
                        loss_dict = model(input)
                        losses = sum(loss for loss in loss_dict.values())
                        assert torch.isfinite(losses).all(), loss_dict

                        loss_dict_reduced = {k: v.item()
                                             for k, v in comm.reduce_dict(loss_dict).items()}
                        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                        if comm.is_main_process():
                            storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            # periodically evaluate the validation set and write the results;
            # check the results against the best seen so far and save the parameters
            # for the best result
            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    or iteration == max_iter):
                val_eval_results = inference_on_dataset(model, val_eval_data_loader, val_evaluator)
                logger.info('val_eval_results {}'.format(str(val_eval_results)))
                results = val_eval_results.get('segm', None)
                if results is None:
                    results = val_eval_results.get('bbox', None)
                if results is not None and results.get('AP50', -1) > best_AP50:
                    best_AP50 = results['AP50']
                    logger.info('saving best results ({}), iter {}'.format(best_AP50, iteration))
                    checkpointer.save("best_AP50")
                flat_results = flatten_results(val_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0):
                for writer in val_writers:
                    writer.write()

        elapsed = time.time() - start_time
        time_per_iter = elapsed / (iteration - start_iter)
        time_left = time_per_iter * (max_iter - iteration)
        logger.info("ETA: {}".format(str(datetime.timedelta(seconds=time_left))))
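
# flatten_results() is used above but not shown. A minimal sketch, assuming the
# evaluator returns a nested dict such as {"bbox": {"AP": ..., "AP50": ...}} and the
# scalars should be stored under "task/metric" keys.
def flatten_results(results):
    flat = {}
    for task, metrics in results.items():
        for name, value in metrics.items():
            flat[f"{task}/{name}"] = value
    return flat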
class TrainingModule(LightningModule):
    def __init__(self, cfg):
        super().__init__()
        if not logger.isEnabledFor(logging.INFO):
            # setup_logger is not called for d2
            setup_logger()
        self.cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
        self.storage: EventStorage = None
        self.model = build_model(self.cfg)

        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER

    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        checkpoint["iteration"] = self.storage.iter

    def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]) -> None:
        self.start_iter = checkpointed_state["iteration"]
        self.storage.iter = self.start_iter

    def setup(self, stage: str):
        if self.cfg.MODEL.WEIGHTS:
            self.checkpointer = DetectionCheckpointer(
                # Assume you want to save checkpoints together with logs/statistics
                self.model,
                self.cfg.OUTPUT_DIR,
            )
            logger.info(f"Load model weights from checkpoint: {self.cfg.MODEL.WEIGHTS}.")
            # Only load weights, use lightning checkpointing if you want to resume
            self.checkpointer.load(self.cfg.MODEL.WEIGHTS)

        self.iteration_timer = hooks.IterationTimer()
        self.iteration_timer.before_train()
        self.data_start = time.perf_counter()
        self.writers = None

    def training_step(self, batch, batch_idx):
        data_time = time.perf_counter() - self.data_start
        # Need to manually enter/exit since trainer may launch processes.
        # This ideally belongs in setup, but setup seems to run before processes are spawned.
        if self.storage is None:
            self.storage = EventStorage(0)
            self.storage.__enter__()
            self.iteration_timer.trainer = weakref.proxy(self)
            self.iteration_timer.before_step()
            self.writers = (
                default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
                if comm.is_main_process()
                else {}
            )

        loss_dict = self.model(batch)
        SimpleTrainer.write_metrics(loss_dict, data_time)

        opt = self.optimizers()
        self.storage.put_scalar(
            "lr", opt.param_groups[self._best_param_group_id]["lr"], smoothing_hint=False
        )
        self.iteration_timer.after_step()
        self.storage.step()
        # A little odd to put before step here, but it's the best way to get a proper timing
        self.iteration_timer.before_step()

        if self.storage.iter % 20 == 0:
            for writer in self.writers:
                writer.write()
        return sum(loss_dict.values())

    def training_step_end(self, training_step_outputs):
        self.data_start = time.perf_counter()
        return training_step_outputs

    def training_epoch_end(self, training_step_outputs):
        self.iteration_timer.after_train()
        if comm.is_main_process():
            self.checkpointer.save("model_final")
        for writer in self.writers:
            writer.write()
            writer.close()
        self.storage.__exit__(None, None, None)

    def _process_dataset_evaluation_results(self) -> OrderedDict:
        results = OrderedDict()
        for idx, dataset_name in enumerate(self.cfg.DATASETS.TEST):
            results[dataset_name] = self._evaluators[idx].evaluate()
            if comm.is_main_process():
                print_csv_format(results[dataset_name])

        if len(results) == 1:
            results = list(results.values())[0]
        return results

    def _reset_dataset_evaluators(self):
        self._evaluators = []
        for dataset_name in self.cfg.DATASETS.TEST:
            evaluator = build_evaluator(self.cfg, dataset_name)
            evaluator.reset()
            self._evaluators.append(evaluator)

    def on_validation_epoch_start(self):
        self._reset_dataset_evaluators()

    def validation_epoch_end(self, _outputs):
        results = self._process_dataset_evaluation_results()
        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception as e:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)
                ) from e
        self.storage.put_scalars(**flattened_results, smoothing_hint=False)

    def validation_step(self, batch, batch_idx: int, dataloader_idx: int = 0) -> None:
        if not isinstance(batch, List):
            batch = [batch]
        outputs = self.model(batch)
        self._evaluators[dataloader_idx].process(batch, outputs)

    def configure_optimizers(self):
        optimizer = build_optimizer(self.cfg, self.model)
        self._best_param_group_id = hooks.LRScheduler.get_best_param_group_id(optimizer)
        scheduler = build_lr_scheduler(self.cfg, optimizer)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpointer = DetectionCheckpointer(
    #     model, cfg.OUTPUT_DIR,
    #     optimizer=optimizer,
    #     scheduler=scheduler
    # )
    # do not load the checkpointer's optimizer and scheduler
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    # model.load_state_dict(optimizer)
    max_iter = cfg.SOLVER.MAX_ITER

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    train_data_loader = build_detection_train_loader(
        cfg, mapper=PathwayDatasetMapper(cfg, True))
    # epoch_data_loader = build_detection_test_loader(cfg=cfg, dataset_name=cfg.DATASETS.TRAIN[0],
    #                                                 mapper=PathwayDatasetMapper(cfg, True))
    val_data_loader = build_detection_validation_loader(
        cfg=cfg,
        dataset_name=cfg.DATASETS.TEST[0],
        mapper=PathwayDatasetMapper(cfg, False))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        epoch_num = (train_data_loader.dataset.sampler._size //
                     cfg.SOLVER.IMS_PER_BATCH) + 1
    else:
        epoch_num = train_data_loader.dataset.sampler._size // cfg.SOLVER.IMS_PER_BATCH

    # periodic_checkpointer = PeriodicCheckpointer(
    #     checkpointer,
    #     # cfg.SOLVER.CHECKPOINT_PERIOD,
    #     epoch_num,
    #     max_iter=max_iter
    # )

    logger.info("Starting training from iteration {}".format(start_iter))
    loss_weights = {'loss_cls': 1, 'loss_box_reg': 1}
    with EventStorage(start_iter) as storage:
        loss_per_epoch = 0.0
        best_loss = 99999.0
        best_val_loss = 99999.0
        better_train = False
        better_val = False
        for data, iteration in zip(train_data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() * loss_weights[k]
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            # prevent gradient explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            # if comm.is_main_process():
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # if (
            #     # cfg.TEST.EVAL_PERIOD > 0
            #     # and
            #     iteration % epoch_num == 0
            #     # iteration % cfg.TEST.EVAL_PERIOD == 0
            #     and iteration != max_iter
            # ):
            #     do_test(cfg, model)
            #     # Compared to "train_net.py", the test results are not dumped to EventStorage
            #     comm.synchronize()

            loss_per_epoch += losses_reduced
            if iteration % epoch_num == 0 or iteration == max_iter:
                # one complete epoch
                epoch_loss = loss_per_epoch / epoch_num

                # do validation
                # epoch_loss, epoch_cls_loss, epoch_box_reg_loss = do_validation(epoch_data_loader, model, loss_weights)
                # val_loss, val_cls_loss, val_box_reg_loss = do_validation(val_data_loader, model, loss_weights)
                checkpointer.save("model_{:07d}".format(iteration), **{"iteration": iteration})

                # calculate epoch_loss and push it to the history cache
                # if comm.is_main_process():
                storage.put_scalar("epoch_loss", epoch_loss, smoothing_hint=False)
                # storage.put_scalar("epoch_cls_loss", epoch_cls_loss, smoothing_hint=False)
                # storage.put_scalar("epoch_box_reg_loss", epoch_box_reg_loss, smoothing_hint=False)
                # storage.put_scalar("val_loss", val_loss, smoothing_hint=False)
                # storage.put_scalar("val_cls_loss", val_cls_loss, smoothing_hint=False)
                # storage.put_scalar("val_box_reg_loss", val_box_reg_loss, smoothing_hint=False)
                for writer in writers:
                    writer.write()

                # only save improved checkpoints based on epoch_loss
                # if best_loss > epoch_loss:
                #     best_loss = epoch_loss
                #     better_train = True
                # if best_val_loss > val_loss:
                #     best_val_loss = val_loss
                #     better_val = True
                # if better_val:
                #     checkpointer.save("model_{:07d}".format(iteration), **{"iteration": iteration})
                # comm.synchronize()

                # reset loss_per_epoch
                loss_per_epoch = 0.0
                # better_train = False
                # better_val = False

            del loss_dict, losses, losses_reduced, loss_dict_reduced
            torch.cuda.empty_cache()
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    min_size = cfg.INPUT.MIN_SIZE_TRAIN
    max_size = cfg.INPUT.MAX_SIZE_TRAIN
    sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
    data_loader = build_detection_train_loader(
        cfg,
        mapper=DatasetMapper(cfg, is_train=True, augmentations=[
            T.ResizeShortestEdge(min_size, max_size, sample_style),
            T.RandomApply(T.RandomFlip(prob=1, vertical=False), prob=0.5),
            T.RandomApply(T.RandomRotation(angle=[180], sample_style='choice'), prob=0.1),
            T.RandomApply(T.RandomRotation(angle=[-10, 10], sample_style='range'), prob=0.9),
            T.RandomApply(T.RandomBrightness(0.5, 1.5), prob=0.5),
            T.RandomApply(T.RandomContrast(0.5, 1.5), prob=0.5),
        ]))

    best_model_weight = copy.deepcopy(model.state_dict())
    best_val_loss = None
    data_val_loader = build_detection_test_loader(cfg,
                                                  cfg.DATASETS.TEST[0],
                                                  mapper=DatasetMapper(cfg, True))
    logger.info("Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration += 1
            start = time.time()
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                logger.setLevel(logging.CRITICAL)
                print('validating')
                val_total_loss = do_val_monitor(cfg, model, data_val_loader)
                logger.setLevel(logging.DEBUG)
                logger.info(f"validation loss at iteration {iteration}: {val_total_loss}")
                storage.put_scalar(name='val_total_loss', value=val_total_loss)
                if best_val_loss is None or val_total_loss < best_val_loss:
                    best_val_loss = val_total_loss
                    best_model_weight = copy.deepcopy(model.state_dict())
                comm.synchronize()
                # TODO: add an extra checkpointer that saves the best model based on the val loss

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()

    model.load_state_dict(best_model_weight)
    experiment_name = os.getenv('MLFLOW_EXPERIMENT_NAME')
    checkpointer.save(f'model_{experiment_name}')
    return model
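
# do_val_monitor() is called above but not defined here. A minimal sketch, assuming it
# returns the average total loss over the validation loader; the model stays in training
# mode so that it returns a loss dict, while gradients are disabled.
def do_val_monitor(cfg, model, data_val_loader):
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for data in data_val_loader:
            loss_dict = model(data)
            total_loss += sum(v.item() for v in loss_dict.values())
            num_batches += 1
    return total_loss / max(num_batches, 1)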
# %%
# Storing the checkpoint
# -----------------------
# Now, we can use the pre-trained backbone from the Detectron2 model. The code
# below shows how to save it as a Detectron2 checkpoint called `my_model.pth`.

# get the first module from the backbone (i.e. the detectron2 ResNet)
# backbone:
#     L ResNet50
#     L SelectStage
#     L AdaptiveAvgPool2d
detmodel.backbone.bottom_up = simclr_backbone[0]

checkpointer = DetectionCheckpointer(detmodel, save_dir='./')
checkpointer.save('my_model')

# %%
# Finetuning with Detectron2
# ---------------------------
#
# The checkpoint from above can now be used by any Detectron2 script. For example,
# you can use the `train_net.py` script in the Detectron2 `tools`:
#
# .. code-block:: none
#
#   python train_net.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
#       MODEL.WEIGHTS path/to/my_model.pth \
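
# %%
# A quick sanity check (a minimal sketch; the config file choice is an assumption)
# that the checkpoint saved above can be loaded back into a freshly built Detectron2 model.
from detectron2 import model_zoo
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.modeling import build_model

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"))
model = build_model(cfg)
DetectionCheckpointer(model).load("my_model.pth")  # logs which keys were loaded/skipped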
def do_train(cfg, model, resume=False, patience=20):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    scheduler2 = ReduceLROnPlateau(optimizer, mode="max")
    # warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=200)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))

    best_ap50 = 0
    best_iteration = 0
    patience_counter = 0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()
            # warmup_scheduler.dampen(iteration)

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                test_results = do_test(cfg, model)
                # scheduler2.step(test_results["bbox"]["AP50"])

                # early stopping: save a checkpoint to disk
                checkpointer.save(f"model_{iteration}")
                # TODO: restore from the best model
                if test_results["bbox"]["AP50"] > best_ap50:
                    best_ap50 = test_results["bbox"]["AP50"]
                    best_iteration = iteration
                    # reset patience counter
                    patience_counter = 0
                    logger.info("Patience counter reset.")
                else:
                    patience_counter += 1
                    logger.info(
                        f"Patience counter increased to {patience_counter}, will be terminated at {patience}"
                    )

                if patience_counter > patience:
                    for writer in writers:
                        writer.write()
                    # restore the best checkpoint
                    checkpointer.load(
                        f"{cfg.OUTPUT_DIR}/model_{best_iteration}.pth")
                    break

                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0
                                               or iteration == max_iter - 1):
                for writer in writers:
                    writer.write()
            # periodic_checkpointer.step(iteration)

    checkpointer.save("model_final")
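
# do_test() is called above but not shown. A minimal sketch using Detectron2's
# COCOEvaluator; the inference output folder is an assumption. It returns a dict
# such as {"bbox": {"AP": ..., "AP50": ...}} that the early-stopping logic reads.
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator, inference_on_dataset

def do_test(cfg, model):
    dataset_name = cfg.DATASETS.TEST[0]
    evaluator = COCOEvaluator(dataset_name,
                              output_dir=os.path.join(cfg.OUTPUT_DIR, "inference"))
    data_loader = build_detection_test_loader(cfg, dataset_name)
    return inference_on_dataset(model, data_loader, evaluator)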
def do_train(cfg, model, resume=False):
    # start the training
    model.train()

    # configuration of the model based on the cfg
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpoint configuration
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                         optimizer=optimizer, scheduler=scheduler)

    # depending on whether we are resuming from a checkpoint or not, the initial
    # iteration will be different
    if resume == False:
        start_iter = 1
    else:
        start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS,
                                                  resume=resume).get("iteration", -1) + 1)

    # number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # checkpointer configuration
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    checkpointer_best = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                              optimizer=optimizer, scheduler=scheduler)
    periodic_checkpointer_best = PeriodicCheckpointer(checkpointer_best,
                                                      cfg.SOLVER.CHECKPOINT_PERIOD,
                                                      max_iter=max_iter)

    # writers
    writers = ([CommonMetricPrinter(max_iter),
                JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                TensorboardXWriter(cfg.OUTPUT_DIR)]
               if comm.is_main_process() else [])

    # create the dataloader that gets its information from the cfg training set
    data_loader = build_detection_train_loader(cfg)

    # information about the current situation in the training process
    logger.info("Starting training from iteration {}".format(start_iter))

    # start the iteration process (epochs)
    if resume == True:
        print('Obtaining best val loss from previous session')
        best_loss = np.loadtxt(cfg.OUTPUT_DIR + "/" + "best_validation_loss.txt")
        print('Previous best total val loss is %s' % best_loss)
    else:
        best_loss = float('inf')

    # the patience list stores the validation losses during the training process
    patience_list = []
    patience_list.append(best_loss)

    dataset_size = cfg.NUMBER_IMAGES_TRAINING
    print("training set size is %s" % dataset_size)
    iteration_batch_ratio = int(round(float(dataset_size / cfg.SOLVER.IMS_PER_BATCH)))
    print("%s minibatches are considered as an entire epoch" % iteration_batch_ratio)

    with EventStorage(start_iter) as storage:
        if resume == True:
            iteration = start_iter
        else:
            start_iter = 1
            iteration = 1
        minibatch = 0
        for data, miniepoch in zip(data_loader,
                                   range(start_iter * iteration_batch_ratio,
                                         max_iter * iteration_batch_ratio)):
            minibatch = minibatch + 1
            if minibatch == iteration_batch_ratio:
                minibatch = 0
                iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            # print(loss_dict)
            losses = sum(loss for loss in loss_dict.values())
            # print(losses)
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            # get the total loss
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if minibatch == 0:
                print("Minibatch %s / %s" % (minibatch, iteration_batch_ratio))
                print("iteration %s / %s" % (iteration, max_iter))
                print('Total losses %s \n' % losses_reduced)
                print(loss_dict_reduced)
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # test the validation score of the model
            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter
                    and minibatch == 0):
                results, loss_val = do_test(cfg, model)
                patience_list.append(loss_val)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                if loss_val < best_loss:
                    print('saving best model')
                    best_loss = loss_val
                    array_loss = np.array([best_loss])
                    # save the best model
                    checkpointer_best.save('best_model')
                    np.savetxt(cfg.OUTPUT_DIR + "/" + "best_validation_loss.txt",
                               array_loss, delimiter=',')

                if len(patience_list) > cfg.patience + cfg.warm_up_patience:
                    print('Checking val losses .......')
                    # item obtained (patience) evaluations ago
                    item_patience = patience_list[-cfg.patience]
                    continue_training = False
                    # check whether the val loss has improved
                    for i in range(cfg.patience):
                        item_to_check = patience_list[-i]
                        if item_to_check < item_patience:
                            continue_training = True
                    if continue_training == True:
                        print('The val loss has improved')
                    else:
                        print('The val loss has not improved. Stopping training')
                        # print the validation losses
                        print(patience_list)
                        # plot the evolution of the validation loss
                        plt.plot(range(1, len(patience_list) + 1, 1), patience_list)
                        plt.xlabel('iterations')
                        plt.ylabel('validation loss')
                        plt.title('Evolution of validation loss: \n min val loss: '
                                  + str(min(patience_list)))
                        # save the plot
                        plt.savefig(os.path.join(cfg.OUTPUT_DIR, 'evolution_val_loss.png'))
                        break
                comm.synchronize()

            # if iteration - start_iter > cfg.TEST.EVAL_PERIOD and (iteration % cfg.TEST.EVAL_PERIOD == 0 or iteration == max_iter):
            #     for writer in writers:
            #         writer.write()
            if minibatch == 1:
                periodic_checkpointer.step(iteration)
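
# cfg.NUMBER_IMAGES_TRAINING, cfg.patience and cfg.warm_up_patience are custom keys
# that do not exist in the default Detectron2 config. A minimal sketch of defining
# them before calling do_train(); the values below are assumptions.
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.NUMBER_IMAGES_TRAINING = 1000   # number of images in the training set
cfg.patience = 10                   # evaluations to wait for an improvement
cfg.warm_up_patience = 5            # evaluations to ignore before early stopping kicks in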