def train(self, config: ConfigurationNode = None):
    """
    Take a configuration node and train the model from it.

    :param config: configuration node; falls back to self.config when None.
    :return: None
    """
    if config is None:
        config = self.config

    # Create a writable timestamp for easier record keeping
    timestamp = datetime.now().isoformat(sep="T", timespec="auto")
    name_timestamp = timestamp.replace(":", "_")

    # Start the mlflow run:
    mlflow.start_run(run_name=name_timestamp)

    # Check for a valid output path, then read the data paths from the config
    assert config.OUTPUT_PATH != ''
    path_output = config.OUTPUT_PATH  # output folder
    path_train = config.DATASET.TRAIN_DATA_PATH  # training data folder
    path_val = config.DATASET.VAL_DATA_PATH  # validation data folder

    # Make the output dir and its parents if they do not exist.
    if not os.path.exists(path_output):
        os.makedirs(path_output)

    # Make the results folder if it does not exist.
    self.results_dir = Path(path_output) / name_timestamp
    if not os.path.exists(self.results_dir):
        os.makedirs(self.results_dir)

    # Make the backup folder if it does not exist.
    self.backup_dir = os.path.join(self.results_dir, 'model_backups')
    if not os.path.exists(self.backup_dir):
        os.makedirs(self.backup_dir)

    writer_tensorboard = SummaryWriter(log_dir=self.results_dir / "logs_tensorflow")

    # Now that the config has been merged with new data along the way, dump a
    # copy of it next to the results for trackability.
    config.dump(stream=open(os.path.join(self.results_dir, f'config{name_timestamp}.yaml'), 'w'))

    # File path to store the state of the model.
    state_fpath = os.path.join(self.results_dir, f'model{name_timestamp}.pt')

    # File path to store the per-epoch performance trace.
    perf_path = os.path.join(self.results_dir, f'trace{name_timestamp}.p')
    perf_trace = []

    # Load the data and create the data loader objects from it.
    data_train = pickle.load(open(path_train, 'rb'))
    data_val = pickle.load(open(path_val, 'rb'))
    self.loader_train = build_data_loader(data_train, config.DATASET, True)
    self.loader_val = build_data_loader(data_val, config.DATASET, False)

    # Build the model using the config dict node
    self.model = build_model(config.MODEL)

    # Enable parallel multi-GPU mode if the config specifies it.
    if config.MODEL.PARALLEL:
        print("Utilizing parallel processing")
        self.model = torch.nn.DataParallel(self.model)

    current_epoch = 0

    # For resuming training (i.e. loading a checkpoint)
    if config.RESUME_PATH != "":
        checkpoint = torch.load(config.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        self.model.load_state_dict(checkpoint["model_state"])
    _ = self.model.cuda()

    # SOLVER EVALUATOR
    cfg_solver = config.MODEL.SOLVER

    # Build the optimizer (shared between train/validation) from the solver
    # portion of the configuration.
    optimizer = build_optimizer(self.model, cfg_solver)

    # Build the evaluator (shared between train/validation) from the solver
    # portion of the configuration.
    evaluator = build_evaluator(cfg_solver)
    evaluator.float().cuda()

    total_epochs = cfg_solver.TOTAL_EPOCHS

    # Main training epoch loop starts here.
    for epoch in range(current_epoch, total_epochs):
        # Train a single epoch
        self.train_epoch(epoch, evaluator, optimizer, perf_path, perf_trace,
                         state_fpath, writer_tensorboard)

    mlflow.end_run()
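# A minimal usage sketch for the method above. The enclosing class name
# (BengaliTrainer) and the config-loading helper are assumptions, not part of
# the original code; the class is assumed to take a configuration node and
# expose the attributes the method reads (self.config, self.results_dir, ...).
#
# from src.config.config import cfg
#
# cfg.merge_from_file('configs/baseline.yaml')  # hypothetical config file
# trainer = BengaliTrainer(config=cfg)          # hypothetical class name
# trainer.train()                               # trains from self.config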
def find_lr(cfg, max_iter=400, init_value=1e-6, final_value=1.0):
    '''
    WIP
    We track the losses for different lr values. Same training loop, but we
    update the lr according to an update step on each batch iteration.
    We apply a smoothing function to the losses for better visualization afterward.
    '''
    # FILES, PATHS
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH

    # DATA LOADER
    train_data = pickle.load(open(train_path, 'rb'))
    val_data = pickle.load(open(val_path, 'rb'))
    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # MODEL
    model = build_model(cfg.MODEL)
    current_epoch = 0
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    _ = model.cuda()

    # SOLVER EVALUATOR
    solver_cfg = cfg.MODEL.SOLVER
    optimizer = build_optimizer(model, solver_cfg)
    evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    total_epochs = solver_cfg.TOTAL_EPOCHS

    # find_lr variables
    number_in_epoch = len(train_loader) - 1
    update_step = (final_value / init_value) ** (2 / number_in_epoch)
    lr = init_value
    optimizer.param_groups[0]["lr"] = lr
    best_loss, batch_num = 0.0, 0
    losses, log_lrs = [], []

    model.train()
    train_itr = iter(train_loader)
    total_err = 0
    total_acc = 0
    for idx, (inputs, labels) in enumerate(train_itr):
        batch_num += 1

        # compute
        input_data = inputs.float().cuda()
        labels = labels.cuda()
        grapheme_logits, vowel_logits, consonant_logits = model(input_data)
        eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)

        # keep track of the loss
        loss = eval_result['loss']

        # Stopping condition: the loss explodes or idx reaches max_iter
        if batch_num > 1 and loss > 4 * best_loss or idx == max_iter:
            losses = [x.item() for x in losses]
            losses = smoothen_by_spline(log_lrs, losses, s=4)
            return log_lrs[10:-5], losses[10:-5]

        # Record the best loss
        if loss < best_loss or batch_num == 1:
            best_loss = loss

        # Store the values
        losses.append(loss)
        log_lrs.append(math.log10(lr))

        # Do the backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        eval_result = {k: eval_result[k].item() for k in eval_result}
        total_err += eval_result['loss']
        total_acc += eval_result['acc']
        if idx % 100 == 0:
            print(idx, eval_result['loss'], eval_result['acc'])

        # update the lr
        lr *= update_step
        optimizer.param_groups[0]["lr"] = lr
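# A minimal usage sketch for find_lr, assuming matplotlib is installed and the
# default config node is importable as in the scripts below. The smoothed
# losses are plotted against log10(lr), so the steep descent just before the
# loss explodes can be picked out by eye.
import matplotlib.pyplot as plt
from src.config.config import cfg

log_lrs, losses = find_lr(cfg, max_iter=400)
plt.plot(log_lrs, losses)
plt.xlabel('log10(lr)')
plt.ylabel('smoothed loss')
plt.show()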
def train(cfg, debug=False):
    #############################
    # Pre-training
    #############################

    # PATHS
    assert cfg.OUTPUT_PATH != ''
    output_path = cfg.OUTPUT_PATH
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH

    # a sample is 1/4 of the train images - i.e. one .parquet file
    train_path_sample = cfg.DATASET.TRAIN_DATA_SAMPLE
    valid_path_sample = cfg.DATASET.VALID_DATA_SAMPLE

    # Create a writable timestamp for easier record keeping
    timestamp = datetime.now().isoformat(sep="T", timespec="auto")
    name_timestamp = timestamp.replace(":", "_")

    # Make the output dir and its parents if they do not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Make the backup folder if it does not exist
    backup_dir = os.path.join(output_path, 'model_backups')
    if not os.path.exists(backup_dir):
        os.mkdir(backup_dir)

    # Make the results folder if it does not exist
    results_dir = os.path.join(output_path, 'results')
    if not os.path.exists(results_dir):
        os.mkdir(results_dir)

    # Initialize Tensorboard
    writer_tensorboard = SummaryWriter(log_dir=os.path.join(results_dir, "logs_tensorflow"))

    # Save the config
    cfg.dump(stream=open(os.path.join(results_dir, f'config_{name_timestamp}.yaml'), 'w'))

    # File path to store the state of the model
    state_fpath = os.path.join(output_path, 'model.pt')

    # Performance path where we'll save our metrics to trace.p
    perf_path = os.path.join(results_dir, 'trace.p')
    perf_trace = []

    if debug:
        # debug: load a smaller training file
        train_data = pickle.load(open(train_path_sample, 'rb'))
        val_data = pickle.load(open(valid_path_sample, 'rb'))
    elif cfg.DATASET.USE_FOLDS_DATA:
        # Folds
        data_path = cfg.DATASET.FOLDS_PATH
        all_data_folds = pickle.load(open(data_path, 'rb'))
        val_fold = cfg.DATASET.VALIDATION_FOLD
        train_data = []
        val_data = []
        for idx, entries in enumerate(all_data_folds):
            if idx == val_fold:
                val_data = entries
            else:
                train_data = train_data + entries
    else:
        train_data = pickle.load(open(train_path, 'rb'))
        val_data = pickle.load(open(val_path, 'rb'))

    # witchcraft: only train on a few classes
    focus_cls = cfg.DATASET.FOCUS_CLASS
    if len(focus_cls) > 0:
        train_data = [x for x in train_data if x[1][0] in focus_cls]
        val_data = [x for x in val_data if x[1][0] in focus_cls]

    # DataLoader
    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # Build the model using the config dict node
    model = build_model(cfg.MODEL)

    # Solver evaluator
    solver_cfg = cfg.MODEL.SOLVER

    # Epochs
    total_epochs = solver_cfg.TOTAL_EPOCHS

    # Loss function
    loss_fn = solver_cfg.LOSS.NAME

    # For weighted focal loss, initialize the last layer bias weights as a constant
    if loss_fn == 'weighted_focal_loss':
        last_layer = model.head.fc_layers[-1]
        for m in last_layer.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.constant_(m.bias, -3.0)

    current_epoch = 0

    # Multi-GPU training
    multi_gpu_training = cfg.MULTI_GPU_TRAINING
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    if multi_gpu_training:
        model = torch.nn.DataParallel(model)
    _ = model.cuda()

    # Optimizer, scheduler, amp
    opti_cfg = solver_cfg.OPTIMIZER
    optimizer = build_optimizer(model, opti_cfg)
    use_amp = solver_cfg.AMP
    # ------ Uncomment if we use the apex library --------
    # if use_amp:
    #     opt_level = 'O1'
    #     model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

    # Build the scheduler
    scheduler_cfg = solver_cfg.SCHEDULER
    scheduler_type = scheduler_cfg.NAME
    scheduler = build_scheduler(optimizer, scheduler_cfg)

    # Resume training with the correct optimizer and scheduler states
    if cfg.RESUME_PATH != "":
        if 'optimizer_state' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state'])
        if 'scheduler_state' in checkpoint and scheduler is not None:
            scheduler.load_state_dict(checkpoint['scheduler_state'])
        # ------ Uncomment if we use the apex library --------
        # if use_amp and 'amp_state' in checkpoint:
        #     amp.load_state_dict(checkpoint['amp_state'])

    # Build the evaluator, with or without Mixup
    mixup_training = solver_cfg.MIXUP_AUGMENT
    if mixup_training:
        mixup_augmenter = MixupAugmenter(solver_cfg.MIXUP)
    evaluator, mixup_evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    if mixup_evaluator is not None:
        mixup_evaluator.float().cuda()

    ##########################################
    # Main training epoch loop starts here
    ##########################################
    s_time = time.time()
    parameters = list(model.parameters())
    for epoch in range(current_epoch, total_epochs):
        model.train()
        if multi_gpu_training:
            model.freeze_bn()
        print('Start epoch', epoch)
        train_itr = iter(train_loader)
        total_err = 0
        total_acc = 0
        for idx, (inputs, labels) in enumerate(train_itr):
            # compute
            input_data = inputs.float().cuda()
            labels = labels.cuda()

            # Use the model to produce the classification
            if mixup_training:
                input_data, labels = mixup_augmenter(input_data, labels)
            grapheme_logits, vowel_logits, consonant_logits = model(input_data)

            # Call the MultiHeadsEval forward function to produce the evaluator results
            if mixup_training:
                eval_result = mixup_evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            else:
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            optimizer.zero_grad()
            loss = eval_result['loss']
            # ------ Uncomment if we use the apex library --------
            # if use_amp:
            #     with amp.scale_loss(loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()

            # get the loss, back-propagate, step
            loss.backward()
            max_grad = torch.max(parameters[-1].grad)
            if not torch.isnan(max_grad):
                optimizer.step()
            else:
                print('NaN in gradient, skipping this step')
                optimizer.zero_grad()

            # tabulate the steps from the evaluation
            eval_result = {k: eval_result[k].item() for k in eval_result}

            # Update the scheduler at this point only if scheduler_type is 'OneCycleLR'
            if scheduler_type == 'OneCycleLR':
                scheduler.step()
            if idx % 100 == 0:
                t_time = time.time()
                print(idx, eval_result['loss'], eval_result['acc'], t_time - s_time)
                s_time = time.time()

        ###############################
        # Send images to Tensorboard
        # -- could also do this outside the loop with xb, yb = next(iter(DL))
        ###############################
        if epoch == 0:
            # Get the std and mean of each channel
            std = torch.FloatTensor(cfg.DATASET.NORMALIZE_STD).view(3, 1, 1)
            m = torch.FloatTensor(cfg.DATASET.NORMALIZE_MEAN).view(3, 1, 1)
            # Un-normalize the images; send mean and std to the gpu for the mixed-up images
            imgs = ((inputs * std) + m) * 255
            imgs_mixup = ((input_data * std.cuda()) + m.cuda()) * 255
            imgs, imgs_mixup = imgs.type(torch.uint8), imgs_mixup.type(torch.uint8)
            img_grid = torchvision.utils.make_grid(imgs)
            img_grid_mixup = torchvision.utils.make_grid(imgs_mixup)
            writer_tensorboard.add_image("images no mixup", img_grid)
            writer_tensorboard.add_image("images with mixup", img_grid_mixup)

        ####################
        # Training metrics
        ####################
        if mixup_training:
            train_result = mixup_evaluator.evalulate_on_cache()
            mixup_evaluator.clear_cache()
        else:
            train_result = evaluator.evalulate_on_cache()

        # Store the training loss, accuracy and kaggle score, and write them to Tensorboard
        train_total_err = train_result['loss']
        writer_tensorboard.add_scalar('Loss/train', train_total_err, global_step=epoch)
        train_total_acc = train_result['acc']
        writer_tensorboard.add_scalar('Accuracy/train', train_total_acc, global_step=epoch)
        train_kaggle_score = train_result['kaggle_score']
        writer_tensorboard.add_scalar('Kaggle_Score/train', train_kaggle_score, global_step=epoch)
        lr = optimizer.param_groups[-1]['lr']
        print("Epoch {0} Training, Loss {1}, Acc {2}".format(epoch, train_total_err, train_total_acc))
        evaluator.clear_cache()

        ###############################
        # Compute the validation error
        ###############################
        model.eval()
        val_itr = iter(val_loader)
        with torch.no_grad():
            for idx, (inputs, labels) in enumerate(val_itr):
                input_data = inputs.float().cuda()
                labels = labels.cuda()
                grapheme_logits, vowel_logits, consonant_logits = model(input_data)
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
                eval_result = {k: eval_result[k].item() for k in eval_result}
                total_err += eval_result['loss']
                total_acc += eval_result['acc']

        val_result = evaluator.evalulate_on_cache()
        val_total_err = val_result['loss']
        val_total_acc = val_result['acc']
        val_kaggle_score = val_result['kaggle_score']
        print("Epoch {0} Eval, Loss {1}, Acc {2}".format(epoch, val_total_err, val_total_acc))
        evaluator.clear_cache()

        # Update the scheduler here if it is not 'OneCycleLR'
        if scheduler is not None and scheduler_type != 'OneCycleLR':
            if scheduler_type == 'reduce_on_plateau':
                scheduler.step(val_total_err)
            else:
                scheduler.step()

        ######################################
        # Saving the model + performance
        ######################################
        print("Saving the model (epoch %d)" % epoch)
        save_state = {
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }
        if scheduler is not None:
            save_state['scheduler_state'] = scheduler.state_dict()
        # ------ Uncomment if we use the apex library --------
        # if use_amp:
        #     save_state['amp_state'] = amp.state_dict()
        torch.save(save_state, state_fpath)

        print("Making a backup (step %d)" % epoch)
        backup_fpath = os.path.join(backup_dir, "model_bak_%06d.pt" % (epoch,))
        torch.save(save_state, backup_fpath)

        # Dump the traces
        perf_trace.append({
            'epoch': epoch,
            'train_err': train_total_err,
            'train_acc': train_total_acc,
            'train_kaggle_score': train_kaggle_score,
            'val_err': val_total_err,
            'val_acc': val_total_acc,
            'val_kaggle_score': val_kaggle_score
        })
        pickle.dump(perf_trace, open(perf_path, 'wb'))

        # Store the full result of the epoch separately
        epoch_result = {
            'epoch': epoch,
            'train_result': train_result,
            'val_result': val_result
        }
        pickle.dump(epoch_result, open(os.path.join(results_dir, 'result_epoch_{0}.p'.format(epoch)), 'wb'))

        # output_path_base = os.path.basename(output_path)
        # os.system('aws s3 sync /root/bengali_data/{0} s3://eaitest1/{1}'.format(output_path_base, output_path_base))
        # os.system('rm -r /root/bengali_data/{0}/model_backups'.format(output_path_base))
        # os.system('mkdir /root/bengali_data/{0}/model_backups'.format(output_path_base))

    # Add the model to Tensorboard to inspect the details of the architecture
    writer_tensorboard.add_graph(model, input_data)
    writer_tensorboard.close()
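# A minimal entry-point sketch for the train function above. The YAML file name
# is hypothetical; merge_from_file is the standard yacs API that the config
# dump calls in this file already rely on.
if __name__ == '__main__':
    from src.config.config import cfg

    cfg.merge_from_file('configs/effnet_mixup.yaml')  # hypothetical config file
    train(cfg, debug=False)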
def find_lr(cfg, max_iter=1400, init_value=1e-6, final_value=1.0):
    '''
    WIP
    We track the losses for different lr values. Same training loop, but we
    update the lr according to an update step on each batch iteration.
    We apply a smoothing function to the losses for better visualization afterward.
    '''
    # FILES, PATHS
    train_path = cfg.DATASET.TRAIN_DATA_PATH

    # DATA LOADER
    train_data = pickle.load(open(train_path, 'rb'))
    train_loader = build_data_loader(train_data, cfg.DATASET, True)

    # MODEL
    model = build_model(cfg.MODEL)
    model.cuda()

    # Solver evaluator
    solver_cfg = cfg.MODEL.SOLVER
    total_epochs = solver_cfg.SCHEDULER.TOTAL_EPOCHS

    # Build the optimizer
    opti_cfg = solver_cfg.OPTIMIZER
    optimizer = build_optimizer(model, opti_cfg)

    # Build the scheduler
    sched_cfg = solver_cfg.SCHEDULER
    scheduler = build_scheduler(optimizer, sched_cfg, steps_per_epoch=int(len(train_loader)), epochs=total_epochs)

    # Build the evaluator, with or without Mixup
    mixup_training = solver_cfg.MIXUP_AUGMENT
    if mixup_training:
        mixup_augmenter = MixupAugmenter(solver_cfg.MIXUP)
    evaluator, mixup_evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    if mixup_evaluator is not None:
        mixup_evaluator.float().cuda()

    # find_lr variables
    number_in_epoch = len(train_loader) - 1
    update_step = (final_value / init_value) ** (2 / number_in_epoch)
    lr = init_value
    optimizer.param_groups[0]["lr"] = lr
    best_loss, batch_num = 0.0, 0
    losses, log_lrs = [], []

    model.train()
    train_itr = iter(train_loader)
    for idx, (inputs, labels) in enumerate(train_itr):
        batch_num += 1

        # compute
        input_data = inputs.float().cuda()
        labels = labels.cuda()

        # Use the model to produce the classification
        if mixup_training:
            input_data, labels = mixup_augmenter(input_data, labels)
        grapheme_logits, vowel_logits, consonant_logits = model(input_data)

        # Call the MultiHeadsEval forward function to produce the evaluator results
        if mixup_training:
            eval_result = mixup_evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
        else:
            eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)

        # get the loss
        loss = eval_result['loss']

        # Stopping condition: the loss explodes or idx reaches max_iter
        if batch_num > 1 and loss > 4 * best_loss or idx == max_iter:
            losses = [x.item() for x in losses]
            losses = smoothen_by_spline(log_lrs, losses, s=4)
            return log_lrs[10:-5], losses[10:-5]

        # Record the best loss
        if loss < best_loss or batch_num == 1:
            best_loss = loss

        # Store the values
        losses.append(loss)
        log_lrs.append(math.log10(lr))

        # Do the backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        eval_result = {k: eval_result[k].item() for k in eval_result}
        if idx % 100 == 0:
            print(idx, eval_result['loss'], eval_result['acc'])

        # update the lr
        lr *= update_step
        optimizer.param_groups[0]["lr"] = lr
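# A sketch of one common way to pick a learning rate from the find_lr output:
# take the point of steepest descent on the smoothed curve. This heuristic is
# an assumption, not something the original code prescribes.
import numpy as np

def suggest_lr(log_lrs, losses):
    # gradient of the smoothed loss w.r.t. log10(lr); the most negative slope
    # marks the fastest-improving learning rate
    steepest = np.argmin(np.gradient(np.array(losses)))
    return 10 ** log_lrs[steepest]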
def train(cfg, debug=False):
    # FILES, PATHS
    assert cfg.OUTPUT_PATH != ''
    output_path = cfg.OUTPUT_PATH
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH
    train_path_sample = cfg.DATASET.TRAIN_DATA_SAMPLE
    valid_path_sample = cfg.DATASET.VALID_DATA_SAMPLE

    if not os.path.exists(output_path):
        os.mkdir(output_path)
    backup_dir = os.path.join(output_path, 'model_backups')
    if not os.path.exists(backup_dir):
        os.mkdir(backup_dir)
    results_dir = os.path.join(output_path, 'results')
    if not os.path.exists(results_dir):
        os.mkdir(results_dir)

    cfg.dump(stream=open(os.path.join(output_path, 'config.yaml'), 'w'))
    state_fpath = os.path.join(output_path, 'model.pt')
    perf_path = os.path.join(results_dir, 'trace.p')
    perf_trace = []

    # DATA LOADER
    if debug:
        # debug: load a smaller training file
        train_data = pickle.load(open(train_path_sample, 'rb'))
        val_data = pickle.load(open(valid_path_sample, 'rb'))
    elif cfg.DATASET.USE_FOLDS_DATA:
        data_path = cfg.DATASET.FOLDS_PATH
        all_data_folds = pickle.load(open(data_path, 'rb'))
        val_fold = cfg.DATASET.VALIDATION_FOLD
        train_data = []
        val_data = []
        for idx, entries in enumerate(all_data_folds):
            if idx == val_fold:
                val_data = entries
            else:
                train_data = train_data + entries
    else:
        train_data = pickle.load(open(train_path, 'rb'))
        val_data = pickle.load(open(val_path, 'rb'))

    # witchcraft: only train on a few classes
    focus_cls = cfg.DATASET.FOCUS_CLASS
    if len(focus_cls) > 0:
        train_data = [x for x in train_data if x[1][0] in focus_cls]
        val_data = [x for x in val_data if x[1][0] in focus_cls]

    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # MODEL
    model = build_model(cfg.MODEL)
    solver_cfg = cfg.MODEL.SOLVER
    total_epochs = solver_cfg.TOTAL_EPOCHS
    loss_fn = solver_cfg.LOSS.NAME

    # For weighted focal loss, initialize the last layer bias weights as a constant
    if loss_fn == 'weighted_focal_loss':
        last_layer = model.head.fc_layers[-1]
        for m in last_layer.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.constant_(m.bias, -3.0)

    current_epoch = 0
    multi_gpu_training = cfg.MULTI_GPU_TRAINING
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    if multi_gpu_training:
        model = torch.nn.DataParallel(model)
    _ = model.cuda()

    # optimizer, scheduler, amp
    opti_cfg = solver_cfg.OPTIMIZER
    optimizer = build_optimizer(model, opti_cfg)
    use_amp = solver_cfg.AMP
    # ------ Uncomment if we use the apex library --------
    # if use_amp:
    #     opt_level = 'O1'
    #     model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    if cfg.RESUME_PATH != "":
        if 'optimizer_state' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state'])
        # ------ Uncomment if we use the apex library --------
        # if use_amp and 'amp_state' in checkpoint:
        #     amp.load_state_dict(checkpoint['amp_state'])

    scheduler_cfg = solver_cfg.SCHEDULER
    scheduler_type = scheduler_cfg.NAME
    scheduler = build_scheduler(optimizer, scheduler_cfg)

    # evaluator
    mixup_training = solver_cfg.MIXUP_AUGMENT
    if mixup_training:
        mixup_augmenter = MixupAugmenter(solver_cfg.MIXUP)
    evaluator, mixup_evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    if mixup_evaluator is not None:
        mixup_evaluator.float().cuda()

    s_time = time.time()
    parameters = list(model.parameters())
    for epoch in range(current_epoch, total_epochs):
        model.train()
        if multi_gpu_training:
            model.freeze_bn()
        print('Start epoch', epoch)
        train_itr = iter(train_loader)
        total_err = 0
        total_acc = 0
        for idx, (inputs, labels) in enumerate(train_itr):
            # compute
            input_data = inputs.float().cuda()
            labels = labels.cuda()
            if mixup_training:
                input_data, labels = mixup_augmenter(input_data, labels)
            grapheme_logits, vowel_logits, consonant_logits = model(input_data)
            if mixup_training:
                eval_result = mixup_evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            else:
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            optimizer.zero_grad()
            loss = eval_result['loss']
            # ------ Uncomment if we use the apex library --------
            # if use_amp:
            #     with amp.scale_loss(loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()
            loss.backward()
            max_grad = torch.max(parameters[-1].grad)
            if not torch.isnan(max_grad):
                optimizer.step()
            else:
                print('NaN in gradient, skipping this step')
                optimizer.zero_grad()
            eval_result = {k: eval_result[k].item() for k in eval_result}
            if idx % 100 == 0:
                t_time = time.time()
                print(idx, eval_result['loss'], eval_result['acc'], t_time - s_time)
                s_time = time.time()

        if mixup_training:
            train_result = mixup_evaluator.evalulate_on_cache()
            mixup_evaluator.clear_cache()
        else:
            train_result = evaluator.evalulate_on_cache()
        train_total_err = train_result['loss']
        train_total_acc = train_result['acc']
        train_kaggle_score = train_result['kaggle_score']
        print("Epoch {0} Training, Loss {1}, Acc {2}".format(epoch, train_total_err, train_total_acc))
        evaluator.clear_cache()

        # compute the validation error
        model.eval()
        val_itr = iter(val_loader)
        with torch.no_grad():
            for idx, (inputs, labels) in enumerate(val_itr):
                input_data = inputs.float().cuda()
                labels = labels.cuda()
                grapheme_logits, vowel_logits, consonant_logits = model(input_data)
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
                eval_result = {k: eval_result[k].item() for k in eval_result}
                total_err += eval_result['loss']
                total_acc += eval_result['acc']

        val_result = evaluator.evalulate_on_cache()
        val_total_err = val_result['loss']
        val_total_acc = val_result['acc']
        val_kaggle_score = val_result['kaggle_score']
        print("Epoch {0} Eval, Loss {1}, Acc {2}".format(epoch, val_total_err, val_total_acc))
        evaluator.clear_cache()

        if scheduler is not None:
            if scheduler_type == 'reduce_on_plateau':
                scheduler.step(val_total_err)
            else:
                scheduler.step()

        print("Saving the model (epoch %d)" % epoch)
        save_state = {
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }
        if scheduler is not None:
            save_state['scheduler_state'] = scheduler.state_dict()
        # ------ Uncomment if we use the apex library --------
        # if use_amp:
        #     save_state['amp_state'] = amp.state_dict()
        torch.save(save_state, state_fpath)

        print("Making a backup (step %d)" % epoch)
        backup_fpath = os.path.join(backup_dir, "model_bak_%06d.pt" % (epoch,))
        torch.save(save_state, backup_fpath)

        perf_trace.append({
            'epoch': epoch,
            'train_err': train_total_err,
            'train_acc': train_total_acc,
            'train_kaggle_score': train_kaggle_score,
            'val_err': val_total_err,
            'val_acc': val_total_acc,
            'val_kaggle_score': val_kaggle_score
        })
        pickle.dump(perf_trace, open(perf_path, 'wb'))

        # store the full result of the epoch separately
        epoch_result = {
            'epoch': epoch,
            'train_result': train_result,
            'val_result': val_result
        }
        pickle.dump(epoch_result, open(os.path.join(results_dir, 'result_epoch_{0}.p'.format(epoch)), 'wb'))
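# A sketch of how a run might be resumed from one of the backups written above.
# The checkpoint keys (epoch, model_state, optimizer_state, scheduler_state)
# match the save_state dict assembled in the training loop; the file names are
# hypothetical.
if __name__ == '__main__':
    from src.config.config import cfg

    cfg.merge_from_file('configs/baseline.yaml')  # hypothetical config file
    cfg.RESUME_PATH = 'output/model_backups/model_bak_000004.pt'  # hypothetical backup
    train(cfg)  # picks up at checkpoint['epoch'] and restores the optimizer state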
def train(cfg):
    # FILES, PATHS
    assert cfg.OUTPUT_PATH != ''
    output_path = cfg.OUTPUT_PATH
    train_path = cfg.DATASET.TRAIN_DATA_PATH
    val_path = cfg.DATASET.VAL_DATA_PATH

    if not os.path.exists(output_path):
        os.mkdir(output_path)
    backup_dir = os.path.join(output_path, 'model_backups')
    if not os.path.exists(backup_dir):
        os.mkdir(backup_dir)

    state_fpath = os.path.join(output_path, 'model.pt')
    perf_path = os.path.join(output_path, 'trace.json')
    perf_trace = []

    # DATA LOADER
    train_data = pickle.load(open(train_path, 'rb'))
    val_data = pickle.load(open(val_path, 'rb'))
    train_loader = build_data_loader(train_data, cfg.DATASET, True)
    val_loader = build_data_loader(val_data, cfg.DATASET, False)

    # MODEL
    model = build_model(cfg.MODEL)
    current_epoch = 0
    if cfg.RESUME_PATH != "":
        checkpoint = torch.load(cfg.RESUME_PATH, map_location='cpu')
        current_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint["model_state"])
    _ = model.cuda()

    # SOLVER EVALUATOR
    solver_cfg = cfg.MODEL.SOLVER
    optimizer = build_optimizer(model, solver_cfg)
    evaluator = build_evaluator(solver_cfg)
    evaluator.float().cuda()
    total_epochs = solver_cfg.TOTAL_EPOCHS

    for epoch in range(current_epoch, total_epochs):
        model.train()
        print('Start epoch', epoch)
        train_itr = iter(train_loader)
        total_err = 0
        total_acc = 0
        for idx, (inputs, labels) in enumerate(train_itr):
            # compute
            input_data = inputs.float().cuda()
            labels = labels.cuda()
            grapheme_logits, vowel_logits, consonant_logits = model(input_data)
            eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
            optimizer.zero_grad()
            eval_result['loss'].backward()
            optimizer.step()
            eval_result = {k: eval_result[k].item() for k in eval_result}
            total_err += eval_result['loss']
            total_acc += eval_result['acc']
            if idx % 100 == 0:
                print(idx, eval_result['loss'], eval_result['acc'])

        train_total_err = total_err / (1 + idx)
        train_total_acc = total_acc / (1 + idx)
        print("Epoch {0} Training, Loss {1}, Acc {2}".format(epoch, train_total_err, train_total_acc))

        # compute the validation error
        model.eval()
        val_itr = iter(val_loader)
        total_err = 0
        total_acc = 0
        with torch.no_grad():
            for idx, (inputs, labels) in enumerate(val_itr):
                input_data = inputs.float().cuda()
                labels = labels.cuda()
                grapheme_logits, vowel_logits, consonant_logits = model(input_data)
                eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
                eval_result = {k: eval_result[k].item() for k in eval_result}
                total_err += eval_result['loss']
                total_acc += eval_result['acc']
                # print(total_err / (1 + idx), total_acc / (1 + idx))

        val_total_err = total_err / (1 + idx)
        val_total_acc = total_acc / (1 + idx)
        print("Epoch {0} Eval, Loss {1}, Acc {2}".format(epoch, val_total_err, val_total_acc))

        print("Saving the model (epoch %d)" % epoch)
        torch.save({
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, state_fpath)

        print("Making a backup (step %d)" % epoch)
        backup_fpath = os.path.join(backup_dir, "model_bak_%06d.pt" % (epoch,))
        torch.save({
            "epoch": epoch + 1,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, backup_fpath)

        perf_trace.append({
            'epoch': epoch,
            'train_err': train_total_err,
            'train_acc': train_total_acc,
            'val_err': val_total_err,
            'val_acc': val_total_acc
        })
        json.dump(perf_trace, open(perf_path, 'w'))
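# A small sketch for inspecting the trace.json written above: load the
# per-epoch records and report the epoch with the lowest validation error.
# Only keys the training loop actually writes are read here; the output
# directory is assumed to match cfg.OUTPUT_PATH.
import json
import os

def best_epoch(output_path):
    perf_trace = json.load(open(os.path.join(output_path, 'trace.json')))
    best = min(perf_trace, key=lambda rec: rec['val_err'])
    print("Best epoch {epoch}: val_err={val_err}, val_acc={val_acc}".format(**best))
    return best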
import pickle

from src.data.bengali_data import build_data_loader
from src.modeling.solver.optimizer import build_optimizer
from src.modeling.solver.evaluation import build_evaluator
from src.config.config import cfg
# NOTE: build_model must also be imported from its module in this repo;
# the exact path is not shown in the original snippet.

# FILES, PATHS
assert cfg.OUTPUT_PATH != ''
output_path = cfg.OUTPUT_PATH
train_path = cfg.DATASET.TRAIN_DATA_0

# DATA LOADER
train_data = pickle.load(open(train_path, 'rb'))
train_loader = build_data_loader(train_data, cfg.DATASET, True)

# MODEL
model = build_model(cfg.MODEL)
model.cuda()

# SOLVER EVALUATOR
solver_cfg = cfg.MODEL.SOLVER
optimizer = build_optimizer(model, solver_cfg)
evaluator = build_evaluator(solver_cfg)
evaluator.float().cuda()
total_epochs = solver_cfg.TOTAL_EPOCHS

model.train()
print('Start training')
train_itr = iter(train_loader)
total_err = 0
total_acc = 0
for idx, (inputs, labels) in enumerate(train_itr):
    # compute the forward pass, evaluate, back-propagate, and step,
    # following the same loop body as the train functions above
    input_data = inputs.float().cuda()
    labels = labels.cuda()
    grapheme_logits, vowel_logits, consonant_logits = model(input_data)
    eval_result = evaluator(grapheme_logits, vowel_logits, consonant_logits, labels)
    optimizer.zero_grad()
    eval_result['loss'].backward()
    optimizer.step()
    eval_result = {k: eval_result[k].item() for k in eval_result}
    total_err += eval_result['loss']
    total_acc += eval_result['acc']
    if idx % 100 == 0:
        print(idx, eval_result['loss'], eval_result['acc'])