def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                         optimizer=optimizer, scheduler=scheduler)
    checkpointer_spot = DetectionCheckpointer(model, '/opt/ml/checkpoints',
                                              optimizer=optimizer, scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    periodic_checkpointer_spot = PeriodicCheckpointer(
        checkpointer_spot, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
            periodic_checkpointer_spot.step(iteration)
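# A minimal launcher sketch for the variant above, assuming the standard detectron2
# entry-point helpers (default_argument_parser, default_setup, launch) and build_model.
# The spot checkpoint directory '/opt/ml/checkpoints' is whatever the training job mounts
# there; do_test is the evaluation helper referenced by the training loops in this file.
# Illustrative only, not part of the original script.
from detectron2.config import get_cfg
from detectron2.engine import default_argument_parser, default_setup, launch
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer


def main(args):
    # Build and freeze the config, then apply detectron2's default setup (logging, seeds, ...)
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)

    model = build_model(cfg)
    if args.eval_only:
        # Evaluation-only path: load weights and run the test loop
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)
    return do_train(cfg, model, resume=args.resume)


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )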
checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                     optimizer=optimizer, scheduler=scheduler)
start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=False).get(
    "iteration", -1) + 1)
# warm-start from a specific checkpoint on top of the configured weights
ckpt = Checkpointer(model)
ckpt.load("./frcn_attn_0/model_0044999.pth")
max_iter = cfg.SOLVER.MAX_ITER
periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                             cfg.SOLVER.CHECKPOINT_PERIOD,
                                             max_iter=max_iter)
writers = ([
    CommonMetricPrinter(max_iter),
    JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
    TensorboardXWriter(cfg.OUTPUT_DIR),
] if comm.is_main_process() else [])

# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
logger.info("Starting training from iteration {}".format(start_iter))
with EventStorage(start_iter) as storage:
    for data, iteration in zip(data_loader, range(start_iter, max_iter)):
        iteration = iteration + 1
        storage.step()

        loss_dict = model(data)
        losses = sum(loss for loss in loss_dict.values())
        assert torch.isfinite(losses).all(), loss_dict

        loss_dict_reduced = {
            k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
        }
def do_train(cfg, model, resume=False, patience=20):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)
    scheduler2 = ReduceLROnPlateau(optimizer, mode="max")
    # warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=200)
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                         optimizer=optimizer, scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))

    best_ap50 = 0
    best_iteration = 0
    patience_counter = 0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()
            # warmup_scheduler.dampen(iteration)

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                test_results = do_test(cfg, model)
                # scheduler2.step(test_results["bbox"]["AP50"])

                # early stopping: save a checkpoint to disk at every evaluation
                checkpointer.save(f"model_{iteration}")
                # TODO: restore from best model
                if test_results["bbox"]["AP50"] > best_ap50:
                    best_ap50 = test_results["bbox"]["AP50"]
                    best_iteration = iteration
                    # reset patience counter
                    patience_counter = 0
                    logger.info("Patience counter reset.")
                else:
                    patience_counter += 1
                    logger.info(
                        f"Patience counter increased to {patience_counter}, will be terminated at {patience}"
                    )
                    if patience_counter > patience:
                        for writer in writers:
                            writer.write()
                        # restore the best checkpoint
                        checkpointer.load(f"{cfg.OUTPUT_DIR}/model_{best_iteration}.pth")
                        break
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0 or iteration == max_iter - 1):
                for writer in writers:
                    writer.write()
            # periodic_checkpointer.step(iteration)

    checkpointer.save("model_final")
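# The patience bookkeeping in the variant above (track the best AP50, reset the counter
# on improvement, stop once the counter exceeds `patience`) can be factored into a small
# helper. This is an illustrative sketch with a hypothetical name (EarlyStopper), not part
# of the original training script.
class EarlyStopper:
    """Tracks the best value of a metric and counts evaluations without improvement."""

    def __init__(self, patience=20):
        self.patience = patience
        self.best_value = float("-inf")
        self.best_iteration = -1
        self.counter = 0

    def step(self, value, iteration):
        """Record a new metric value; return True when training should stop."""
        if value > self.best_value:
            self.best_value = value
            self.best_iteration = iteration
            self.counter = 0
            return False
        self.counter += 1
        return self.counter > self.patience


# Usage inside the evaluation branch of the loop above (sketch):
#     stopper = EarlyStopper(patience=patience)
#     ...
#     if stopper.step(test_results["bbox"]["AP50"], iteration):
#         checkpointer.load(f"{cfg.OUTPUT_DIR}/model_{stopper.best_iteration}.pth")
#         break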
def do_train(cfg, model, resume=False, evaluate=False):
    """
    Training loop.
    """
    # Build optimizer and scheduler from configuration and model
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # Build checkpointers
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                         optimizer=optimizer, scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    # Build writers
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # Build dataloader
    data_loader = build_classification_train_loader(cfg)

    # Training loop
    validation_losses = []
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        start = time.perf_counter()
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            data_time = time.perf_counter() - start
            iteration = iteration + 1
            storage.step()

            # Compute losses
            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalar("data_time", data_time)
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            # Backward pass and optimizer/scheduler step
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # Validation
            if ((cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0)
                    or (iteration == max_iter)):
                # Evaluate on the validation dataset
                res = do_test(cfg, model, evaluate=evaluate)
                validation = {}
                for k, v in res.items():
                    print(v, flush=True)
                    validation[k] = v['loss_cls']
                # Dump the validation loss into TensorBoard as well
                storage.put_scalars(**validation)
                validation['iteration'] = iteration
                validation_losses.append(validation)

            # Logging / checkpointing
            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            # Try to get an accurate measurement of time
            start = time.perf_counter()

    # Save validation metrics
    if evaluate:
        print(validation_losses, flush=True)
        file_path = os.path.join(cfg.OUTPUT_DIR, "validations_losses.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(validation_losses, f)
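# The validation records saved above are a plain list of dicts, one per evaluation, each
# mapping dataset names to their 'loss_cls' value plus an 'iteration' key. A minimal sketch
# of reading them back and picking the iteration with the lowest validation loss; the
# dataset name "my_val_set" is a placeholder, and the helper name is illustrative.
import os
import torch


def best_validation_iteration(output_dir, dataset_key):
    # Load the list of validation records written by do_train(..., evaluate=True)
    records = torch.load(os.path.join(output_dir, "validations_losses.pth"))
    best = min(records, key=lambda r: r[dataset_key])
    return best["iteration"], best[dataset_key]


# Example:
# it, loss = best_validation_iteration(cfg.OUTPUT_DIR, "my_val_set")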
def do_train(cfg, model, resume=False):
    # start the training
    model.train()
    # build the optimizer and LR scheduler from the cfg
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpointer configuration
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                         optimizer=optimizer, scheduler=scheduler)

    # the initial iteration depends on whether we are resuming from a checkpoint
    if resume == False:
        start_iter = 1
    else:
        start_iter = (checkpointer.resume_or_load(
            cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)

    # total number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # periodic checkpointers (regular and best-model)
    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)
    checkpointer_best = DetectionCheckpointer(model, cfg.OUTPUT_DIR,
                                              optimizer=optimizer, scheduler=scheduler)
    periodic_checkpointer_best = PeriodicCheckpointer(checkpointer_best,
                                                      cfg.SOLVER.CHECKPOINT_PERIOD,
                                                      max_iter=max_iter)

    # writers
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # create the dataloader for the training set defined in cfg
    data_loader = build_detection_train_loader(cfg)

    # log where training starts
    logger.info("Starting training from iteration {}".format(start_iter))

    # recover the best validation loss from a previous session when resuming
    if resume == True:
        print('Obtaining best val from previous session')
        best_loss = np.loadtxt(cfg.OUTPUT_DIR + "/" + "best_validation_loss.txt")
        print('Previous best total val loss is %s' % best_loss)
    else:
        best_loss = 99999999999999999999999999999999999

    # the patience list stores the validation losses seen during training
    patience_list = []
    patience_list.append(best_loss)

    dataset_size = cfg.NUMBER_IMAGES_TRAINING
    print("training set size is %s" % dataset_size)
    iteration_batch_ratio = int(round(float(dataset_size / cfg.SOLVER.IMS_PER_BATCH)))
    print("%s minibatches are considered an entire epoch" % iteration_batch_ratio)

    with EventStorage(start_iter) as storage:
        if resume == True:
            iteration = start_iter
        else:
            start_iter = 1
            iteration = 1
        minibatch = 0
        for data, miniepoch in zip(data_loader,
                                   range(start_iter * iteration_batch_ratio,
                                         max_iter * iteration_batch_ratio)):
            minibatch = minibatch + 1
            if minibatch == iteration_batch_ratio:
                minibatch = 0
                iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            # print(loss_dict)
            losses = sum(loss for loss in loss_dict.values())
            # print(losses)
            assert torch.isfinite(losses).all(), loss_dict
            loss_dict_reduced = {
                k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
            }
            # get the total loss
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if minibatch == 0:
                print("Minibatch %s / %s" % (minibatch, iteration_batch_ratio))
                print("iteration %s / %s" % (iteration, max_iter))
                print('Total losses %s \n' % losses_reduced)
                print(loss_dict_reduced)

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            # test the validation score of the model
            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter
                    and minibatch == 0):
                results, loss_val = do_test(cfg, model)
                patience_list.append(loss_val)
                # Compared to "train_net.py", the test results are not dumped to EventStorage

                if loss_val < best_loss:
                    print('saving best model')
                    best_loss = loss_val
                    array_loss = np.array([best_loss])
                    # save the best model and its validation loss
                    checkpointer_best.save('best_model')
                    np.savetxt(cfg.OUTPUT_DIR + "/" + "best_validation_loss.txt",
                               array_loss, delimiter=',')

                if len(patience_list) > cfg.patience + cfg.warm_up_patience:
                    print('Checking val losses .......')
                    # item obtained (patience) evaluations ago
                    item_patience = patience_list[-cfg.patience]
                    continue_training = False
                    # check whether any more recent val loss has improved on it
                    # (start at 1: index -0 would wrongly point to the first element)
                    for i in range(1, cfg.patience):
                        item_to_check = patience_list[-i]
                        if item_to_check < item_patience:
                            continue_training = True
                    if continue_training == True:
                        print('The val loss has improved')
                    else:
                        print('The val loss has not improved. Stopping training')
                        # print the validation losses
                        print(patience_list)
                        # plot the evolution of the validation loss
                        plt.plot(range(1, len(patience_list) + 1, 1), patience_list)
                        plt.xlabel('iterations')
                        plt.ylabel('validation loss')
                        plt.title('Evolution validation loss: \n min val loss: '
                                  + str(min(patience_list)))
                        # save the plot
                        plt.savefig(os.path.join(cfg.OUTPUT_DIR, 'evolution_val_loss.png'))
                        break

                comm.synchronize()

            # if iteration - start_iter > cfg.TEST.EVAL_PERIOD and (iteration % cfg.TEST.EVAL_PERIOD == 0 or iteration == max_iter):
            #     for writer in writers:
            #         writer.write()
            if minibatch == 1:
                periodic_checkpointer.step(iteration)
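# The early-stopping test in the loop above asks: has any of the more recent validation
# losses improved on the loss recorded `patience` evaluations ago? A standalone sketch of
# that check; the function name is illustrative and the warm-up gate is simplified to a
# plain length check.
def val_loss_improved(patience_list, patience):
    """Return True if any of the last `patience - 1` entries beats the entry `patience` ago."""
    if len(patience_list) <= patience:
        return True  # not enough history yet; keep training
    reference = patience_list[-patience]
    return any(loss < reference for loss in patience_list[-(patience - 1):])


# Example: val_loss_improved([0.9, 0.8, 0.8, 0.8], patience=3) -> False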
def main(args):
    print('_' * 60 + f'\nmain <- {args}')

    if 'setup(args)':
        cfg = get_cfg()
        cfg.merge_from_file(args.config_file)
        cfg.merge_from_list(args.opts)
        cfg.freeze()
        # if you don't like any of the default setup, write your own setup code
        default_setup(cfg, args)
        global CONFIG
        CONFIG = cfg

    if True:  # N_GPU > 0:
        # __________________ For Debug _____________________________
        # mem_stats_df.record('Before-Build-Model')

        if 'build_model(cfg)':
            meta_arch = cfg.MODEL.META_ARCHITECTURE
            model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
            # for param in model.backbone.parameters():
            #     param.requires_grad = False
            model.to(torch.device(cfg.MODEL.DEVICE))

        # __________________ For Debug _____________________________
        # mem_stats_df.record('After-Build-Model')

        if args.eval_only:
            DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
                cfg.MODEL.WEIGHTS, resume=args.resume
            )
            return do_test(cfg, model)

        distributed = comm.get_world_size() > 1
        if distributed:
            model = DistributedDataParallel(
                model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
            )

        if 'do-train':
            dataloader = build_train_dataloader(cfg)

            if N_GPUS > 0:
                cfg, model, resume = cfg, model, args.resume
                model.train()
                optimizer = build_optimizer(cfg, model)
                scheduler = build_lr_scheduler(cfg, optimizer)
                checkpointer = DetectionCheckpointer(
                    model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler,
                )
                # the "model" state_dict is always loaded, whether resuming or not;
                # "iteration" is always read from the loaded checkpoint as well
                start_iter = (
                    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
                )
                max_iter = cfg.SOLVER.MAX_ITER
                # optimizer and scheduler are restored into checkpointer.checkpointables[*] when resume is True
                if resume:
                    optimizer = checkpointer.checkpointables['optimizer']
                    scheduler = checkpointer.checkpointables['scheduler']
                else:
                    start_iter = 0

                periodic_checkpointer = PeriodicCheckpointer(
                    checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
                )
                writers = (
                    [
                        CommonMetricPrinter(max_iter),
                        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                        TensorboardXWriter(cfg.OUTPUT_DIR),
                    ]
                    if comm.is_main_process()
                    else []
                )

                logger.info("Starting training from iteration {}".format(start_iter))
                with EventStorage(start_iter) as storage:
                    for data, itr in zip(dataloader, range(start_iter, max_iter)):
                        iteration = itr + 1
                        storage.step()

                        loss_dict = model(data)
                        losses = sum(loss_dict.values())
                        assert torch.isfinite(losses).all(), loss_dict

                        loss_dict_reduced = {
                            k: v.item() for k, v in comm.reduce_dict(loss_dict).items()
                        }
                        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                        if comm.is_main_process():
                            storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

                        optimizer.zero_grad()
                        losses.backward()
                        optimizer.step()
                        storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
                        scheduler.step()

                        # __________________ Checkpoint / Test / Metrics ___________
                        periodic_checkpointer.step(iteration)

                        if (
                            cfg.TEST.EVAL_PERIOD > 0
                            and iteration % cfg.TEST.EVAL_PERIOD == 0
                            and iteration != max_iter
                        ):
                            do_test(cfg, model)
                            # Compared to "train_net.py", the test results are not dumped to EventStorage
                            comm.synchronize()

                        if iteration - start_iter > 5 and (iteration % 100 == 0 or iteration == max_iter):
                            for writer in writers:
                                writer.write()

                        # __________________ For Debug _____________________________
                        # mem_summary = torch.cuda.memory_summary()
                        # tcp_sock.send(mem_summary.encode('utf-8'))
                        global TIC
                        if TIC is None:
                            TIC = datetime.datetime.now()
                        else:
                            toc = datetime.datetime.now()
                            logger.info('_' * 35 + f'Time Elapsed: {(toc - TIC).total_seconds()} s')
                            TIC = toc