def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # learning rate schedulers for the different model variants
    if params.model_version == "resnet18":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn":
        # cnn runs always use < 100 epochs, so this schedule never fires;
        # it is kept only so the scheduler.step() call below stays uniform
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)
    else:
        raise ValueError("Unknown model_version: {}".format(params.model_version))

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # step the scheduler after the optimizer updates (required ordering since PyTorch 1.1)
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
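# The snippet above (and several below) calls utils.save_checkpoint / utils.load_checkpoint
# without defining them. A minimal sketch of what such helpers typically look like, matching
# the call sites (a state dict with 'state_dict' and 'optim_dict' keys, a checkpoint
# directory, an is_best flag). The utils module itself is not shown here, so treat this as
# an illustrative assumption, not the original implementation:
import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint):
    """Save state to checkpoint/last.pth.tar; copy it to best.pth.tar when is_best."""
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))

def load_checkpoint(checkpoint_path, model, optimizer=None):
    """Load model (and optionally optimizer) state from a checkpoint file."""
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint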
def main():
    init_platform()
    logger.info('Experiment name: %s', default.exp_name)
    logger.info('Initializing ...')
    if default.mode in ('train', 'infer'):
        train_loader, val_loaders, train_sampler = init_data()
    elif default.mode == 'demo':
        init_demo()
    model, criterions, optimizer = init_model()
    if default.mode == 'infer':
        evaluate(val_loaders, model)
        return
    elif default.mode == 'demo':
        demo_fun(model)
        return

    # mode == 'train'
    global accs, best_acc1
    if default.validate_at_begin:
        acc_all = evaluate(val_loaders, model)
        accs[default.begin_epoch] = acc_all[0][config.TEST.CRITERION]
        best_acc1 = acc_all[0][config.TEST.CRITERION]
        logger.info('iter %d: %.4f' % (default.begin_epoch, accs[default.begin_epoch]))

    for epoch in range(default.begin_epoch, default.epochs):
        if default.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterions, optimizer, epoch)
        fn = save_checkpoint(epoch, model, optimizer, last_bkup=True)

        # evaluate on validation set
        acc_all = evaluate(val_loaders, model)
        accs[epoch + 1] = acc_all[0][config.TEST.CRITERION]
        for key in sorted(accs.keys()):
            print('iter %d: %.4f' % (key, accs[key]))

        # remember best acc and save checkpoint
        is_best = accs[epoch + 1] > best_acc1
        best_acc1 = max(accs[epoch + 1], best_acc1)
        if is_best or not default.keep_best_model:
            new_fn = checkpoint_name(epoch + 1, False)
            os.rename(fn, new_fn)
            if default.keep_best_model:
                if os.path.exists(default.best_model_path):
                    os.remove(default.best_model_path)
                default.best_model_path = new_fn
def train_net(args): torch.manual_seed(7) np.random.seed(7) checkpoint = args.checkpoint start_epoch = 0 best_acc = 0 writer = SummaryWriter() epochs_since_improvement = 0 # Initialize / load checkpoint if checkpoint is None: model = CarRecognitionModel() model = nn.DataParallel(model) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: checkpoint = torch.load(checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] model = checkpoint['model'] optimizer = checkpoint['optimizer'] logger = get_logger() # Move to GPU, if available model = model.to(device) # Loss function criterion = nn.CrossEntropyLoss() # Custom dataloaders train_dataset = CarRecognitionDataset('train') train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=num_workers) valid_dataset = CarRecognitionDataset('valid') valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False, num_workers=num_workers) # Epochs for epoch in range(start_epoch, args.end_epoch): if epochs_since_improvement > 0 and epochs_since_improvement % patience == 0: adjust_learning_rate(optimizer, shrink_factor=0.1) lr = get_learning_rate(optimizer) logger.info('Learning rate: ' + str(lr)) writer.add_scalar('model/learning_rate', lr, epoch) # One epoch's training train_loss, train_acc = train(train_loader=train_loader, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch, logger=logger) writer.add_scalar('model/train_loss', train_loss, epoch) writer.add_scalar('model/train_accuracy', train_acc, epoch) # One epoch's validation valid_loss, valid_acc = valid(valid_loader=valid_loader, model=model, criterion=criterion, logger=logger) writer.add_scalar('model/valid_loss', valid_loss, epoch) writer.add_scalar('model/valid_accuracy', valid_acc, epoch) # Check if there was an improvement is_best = valid_acc > best_acc best_acc = max(valid_acc, best_acc) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 # Save checkpoint save_checkpoint(epoch, epochs_since_improvement, model, optimizer, best_acc, is_best)
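# train_net above relies on adjust_learning_rate / get_learning_rate helpers that live
# elsewhere in its repo. A plausible minimal version matching the call sites (shrink the
# LR by a fixed factor once validation stalls); an assumption, not the original source:
def adjust_learning_rate(optimizer, shrink_factor):
    """Multiply the learning rate of every param group by shrink_factor."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor

def get_learning_rate(optimizer):
    """Return the learning rate of the first param group."""
    return optimizer.param_groups[0]['lr']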
def main(rank, args):
    # Distributed setup
    if args.distributed:
        setup_distributed(rank, args.world_size)

    not_main_rank = args.distributed and rank != 0

    logging.info("Start time: %s", datetime.now())

    # Explicitly set seed to make sure models created in separate processes
    # start from same random weights and biases
    torch.manual_seed(args.seed)

    # Empty CUDA cache
    torch.cuda.empty_cache()

    # Change backend for flac files
    torchaudio.set_audio_backend("soundfile")

    # Transforms
    melkwargs = {
        "n_fft": args.win_length,
        "n_mels": args.n_bins,
        "hop_length": args.hop_length,
    }

    sample_rate_original = 16000

    if args.type == "mfcc":
        transforms = torch.nn.Sequential(
            torchaudio.transforms.MFCC(
                sample_rate=sample_rate_original,
                n_mfcc=args.n_bins,
                melkwargs=melkwargs,
            ),
        )
        num_features = args.n_bins
    elif args.type == "waveform":
        transforms = torch.nn.Sequential(UnsqueezeFirst())
        num_features = 1
    else:
        raise ValueError("Model type not supported")

    if args.normalize:
        transforms = torch.nn.Sequential(transforms, Normalize())

    augmentations = torch.nn.Sequential()
    if args.freq_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.FrequencyMasking(freq_mask_param=args.freq_mask),
        )
    if args.time_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask),
        )

    # Text preprocessing
    char_blank = "*"
    char_space = " "
    char_apostrophe = "'"
    labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
    language_model = LanguageModel(labels, char_blank, char_space)

    # Dataset
    training, validation = split_process_librispeech(
        [args.dataset_train, args.dataset_valid],
        [transforms, transforms],
        language_model,
        root=args.dataset_root,
        folder_in_archive=args.dataset_folder_in_archive,
    )

    # Decoder
    if args.decoder == "greedy":
        decoder = GreedyDecoder()
    else:
        raise ValueError("Selected decoder not supported")

    # Model
    model = Wav2Letter(
        num_classes=language_model.length,
        input_type=args.type,
        num_features=num_features,
    )

    if args.jit:
        model = torch.jit.script(model)

    if args.distributed:
        n = torch.cuda.device_count() // args.world_size
        devices = list(range(rank * n, (rank + 1) * n))
        model = model.to(devices[0])
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices)
    else:
        devices = ["cuda" if torch.cuda.is_available() else "cpu"]
        model = model.to(devices[0], non_blocking=True)
        model = torch.nn.DataParallel(model)

    n = count_parameters(model)
    logging.info("Number of parameters: %s", n)

    # Optimizer. Note: Adam and AdamW do not accept a momentum argument
    # (passing one raises TypeError), so momentum is only used for SGD.
    if args.optimizer == "adadelta":
        optimizer = Adadelta(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
            eps=args.eps,
            rho=args.rho,
        )
    elif args.optimizer == "sgd":
        optimizer = SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adam":
        optimizer = Adam(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adamw":
        optimizer = AdamW(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )
    else:
        raise ValueError("Selected optimizer not supported")

    if args.scheduler == "exponential":
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == "reduceonplateau":
        scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3)
    else:
        raise ValueError("Selected scheduler not supported")

    criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank],
                                 zero_infinity=False)

    # Data Loader
    collate_fn_train = collate_factory(model_length_function, augmentations)
    collate_fn_valid = collate_factory(model_length_function)

    loader_training_params = {
        "num_workers": args.workers,
        "pin_memory": True,
        "shuffle": True,
        "drop_last": True,
    }
    loader_validation_params = loader_training_params.copy()
    loader_validation_params["shuffle"] = False

    loader_training = DataLoader(
        training,
        batch_size=args.batch_size,
        collate_fn=collate_fn_train,
        **loader_training_params,
    )
    loader_validation = DataLoader(
        validation,
        batch_size=args.batch_size,
        collate_fn=collate_fn_valid,
        **loader_validation_params,
    )

    # Setup checkpoint
    best_loss = 1.0

    load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint)

    if args.distributed:
        torch.distributed.barrier()

    if load_checkpoint:
        logging.info("Checkpoint: loading %s", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)

        args.start_epoch = checkpoint["epoch"]
        best_loss = checkpoint["best_loss"]

        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])

        logging.info("Checkpoint: loaded '%s' at epoch %s", args.checkpoint,
                     checkpoint["epoch"])
    else:
        logging.info("Checkpoint: not found")
        save_checkpoint(
            {
                "epoch": args.start_epoch,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            False,
            args.checkpoint,
            not_main_rank,
        )

    if args.distributed:
        torch.distributed.barrier()

    torch.autograd.set_detect_anomaly(False)

    for epoch in range(args.start_epoch, args.epochs):
        logging.info("Epoch: %s", epoch)

        train_one_epoch(
            model,
            criterion,
            optimizer,
            scheduler,
            loader_training,
            decoder,
            language_model,
            devices[0],
            epoch,
            args.clip_grad,
            not_main_rank,
            not args.reduce_lr_valid,
        )

        loss = evaluate(
            model,
            criterion,
            loader_validation,
            decoder,
            language_model,
            devices[0],
            epoch,
            not_main_rank,
        )

        if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(loss)

        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            is_best,
            args.checkpoint,
            not_main_rank,
        )

    logging.info("End time: %s", datetime.now())

    if args.distributed:
        torch.distributed.destroy_process_group()
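# count_parameters is referenced in the Wav2Letter pipeline above but not defined in this
# excerpt. The conventional one-liner, counting only trainable parameters (an assumed
# implementation, consistent with the logged "Number of parameters"):
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)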
try:
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train()
        if 't0' in optimizer.param_groups[0]:
            # swap in the ASGD averaged weights ('ax') for evaluation
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]['ax'].clone()

            val_loss2 = evaluate(val_data)
            logging('-' * 89)
            logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss2, math.exp(val_loss2)))
            logging('-' * 89)

            if val_loss2 < stored_loss:
                save_checkpoint(model, optimizer, args.save, finetune=True)
                logging('Saving Averaged!')
                stored_loss = val_loss2

            # restore the raw (non-averaged) weights
            for prm in model.parameters():
                prm.data = tmp[prm].clone()

            if (len(best_val_loss) > args.nonmono and
                    val_loss2 > min(best_val_loss[:-args.nonmono])):
                logging('Done!')
                break
            optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
                                         lambd=0., weight_decay=args.wdecay)
            # optimizer.param_groups[0]['lr'] /= 2.
            best_val_loss.append(val_loss2)
except KeyboardInterrupt:
    logging('-' * 89)
if 't0' in optimizer.param_groups[0]: tmp = {} for prm in model.parameters(): tmp[prm] = prm.data.clone() prm.data = optimizer.state[prm]['ax'].clone() val_loss2 = evaluate(val_data) logging('-' * 89) logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2))) logging('-' * 89) val_perp_list.append(math.exp(val_loss2)) if val_loss2 < stored_loss: save_checkpoint(model, optimizer, args.save) logging('Saving Averaged!') stored_loss = val_loss2 for prm in model.parameters(): prm.data = tmp[prm].clone() else: val_loss = evaluate(val_data, eval_batch_size) logging('-' * 89) logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss))) logging('-' * 89) val_perp_list.append(math.exp(val_loss))
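# Both AWD-LSTM style snippets above evaluate with the ASGD averaged weights (the 'ax'
# entry in optimizer.state) and then restore the raw weights by hand. That swap can be
# isolated in a small context manager; a sketch under the same assumptions as the code
# above (every parameter has an 'ax' entry once ASGD averaging has started):
from contextlib import contextmanager

@contextmanager
def averaged_weights(model, optimizer):
    """Temporarily load the ASGD averaged parameters ('ax') into the model."""
    backup = {}
    for prm in model.parameters():
        backup[prm] = prm.data.clone()
        prm.data = optimizer.state[prm]['ax'].clone()
    try:
        yield
    finally:
        for prm in model.parameters():
            prm.data = backup[prm].clone()

# usage:
#   with averaged_weights(model, optimizer):
#       val_loss2 = evaluate(val_data)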
for epoch in range(start_epoch, start_epoch+50): time_ep = time.time() lr = args.lr_set utils.adjust_learning_rate(optimizer, lr) train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer) test_res = utils.eval(loaders['test'], model, criterion) if train_res['loss']<train_res_swa['loss'] and test_res['loss']>test_res_swa['loss']: print('find',file=f_out) print('find') utils.save_checkpoint( args.dir, epoch + 1, state_dict=model.state_dict(), optimizer=optimizer.state_dict() ) time_ep = time.time() - time_ep values = [epoch + 1, lr, train_res['loss'], train_res['accuracy'], test_res['loss'], test_res['accuracy'], time_ep] table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') if epoch % 40 == 0: table = table.split('\n') table = '\n'.join([table[1]] + table) else: table = table.split('\n')[2] print(table, file = f_out) print(table)
ensemble_size = 0 predictions_sum = np.zeros((len(loaders['test'].dataset), num_classes)) columns = [ 'ep', 'lr', 'tr_loss', 'tr_acc', 'te_nll', 'te_acc', 'ens_acc', 'time' ] if args.regularizer is None: regularizer = None elif args.regularizer == 'MSE2': regularizer = regularization.TwoModelsMSE(model, args.reg_wd).reg utils.save_checkpoint(args.dir, start_epoch, name='fge', model_state=model.state_dict(), optimizer_state=optimizer.state_dict()) for epoch in range(args.epochs): time_ep = time.time() lr_schedule = utils.cyclic_learning_rate(epoch, args.cycle, args.lr_1, args.lr_2) if args.weighted_samples is None: train_res = utils.train(loaders['train'], model, optimizer, criterion, lr_schedule=lr_schedule, regularizer=regularizer) else:
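# The FGE-style loop above calls utils.cyclic_learning_rate(epoch, args.cycle, args.lr_1,
# args.lr_2) but the schedule itself is not shown. The usual choice is a triangular
# schedule that interpolates linearly between the two rates within each cycle. A sketch,
# assuming the returned callable receives the fraction of the current epoch completed in
# [0, 1); not necessarily the repo's exact code:
def cyclic_learning_rate(epoch, cycle, alpha_1, alpha_2):
    def schedule(iter_frac):
        # position within the current cycle, in [0, 1)
        t = ((epoch % cycle) + iter_frac) / cycle
        if t < 0.5:
            # descend from alpha_1 to alpha_2 in the first half of the cycle
            return alpha_1 * (1.0 - 2.0 * t) + alpha_2 * 2.0 * t
        # climb back from alpha_2 to alpha_1 in the second half
        return alpha_1 * (2.0 * t - 1.0) + alpha_2 * (2.0 - 2.0 * t)
    return schedule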
def main(args, logger): # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv') trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl') trn_df['is_original'] = 1 # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv') # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv') # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv') # clean texts # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer']) # load additional tokens # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin: # additional_tokens = pickle.load(fin) gkf = GroupKFold(n_splits=5).split( X=trn_df.question_body, groups=trn_df.question_body_le, ) histories = { 'trn_loss': {}, 'val_loss': {}, 'val_metric': {}, 'val_metric_raws': {}, } loaded_fold = -1 loaded_epoch = -1 if args.checkpoint: histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint) fold_best_metrics = [] fold_best_metrics_raws = [] for fold, (trn_idx, val_idx) in enumerate(gkf): if fold < loaded_fold: fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax( histories["val_metric"][fold])]) continue sel_log( f' --------------------------- start fold {fold} --------------------------- ', logger) fold_trn_df = trn_df.iloc[trn_idx] # .query('is_original == 1') fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1) # use only original row fold_val_df = trn_df.iloc[val_idx].query('is_original == 1') fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1) if args.debug: fold_trn_df = fold_trn_df.sample(100, random_state=71) fold_val_df = fold_val_df.sample(100, random_state=71) temp = pd.Series( list( itertools.chain.from_iterable( fold_trn_df.question_title.apply(lambda x: x.split(' ')) + fold_trn_df.question_body.apply(lambda x: x.split(' ')) + fold_trn_df.answer.apply(lambda x: x.split(' ')))) ).value_counts() tokens = temp[temp >= 10].index.tolist() # tokens = [] tokens = [ 'CAT_TECHNOLOGY'.casefold(), 'CAT_STACKOVERFLOW'.casefold(), 'CAT_CULTURE'.casefold(), 'CAT_SCIENCE'.casefold(), 'CAT_LIFE_ARTS'.casefold(), ] # + additional_tokens # fold_trn_df = pd.concat([fold_trn_df, opt_pseudo_df, half_opt_pseudo_df], axis=0) trn_dataset = QUESTDataset( df=fold_trn_df, mode='train', tokens=tokens, augment=[], tokenizer_type=TOKENIZER_TYPE, pretrained_model_name_or_path=TOKENIZER_PRETRAIN, do_lower_case=DO_LOWER_CASE, LABEL_COL=LABEL_COL, t_max_len=T_MAX_LEN, q_max_len=Q_MAX_LEN, a_max_len=A_MAX_LEN, tqa_mode=TQA_MODE, TBSEP='[TBSEP]', pos_id_type='arange', MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, ) # update token trn_sampler = RandomSampler(data_source=trn_dataset) trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=True, pin_memory=True) val_dataset = QUESTDataset( df=fold_val_df, mode='valid', tokens=tokens, augment=[], tokenizer_type=TOKENIZER_TYPE, pretrained_model_name_or_path=TOKENIZER_PRETRAIN, do_lower_case=DO_LOWER_CASE, LABEL_COL=LABEL_COL, t_max_len=T_MAX_LEN, q_max_len=Q_MAX_LEN, a_max_len=A_MAX_LEN, tqa_mode=TQA_MODE, TBSEP='[TBSEP]', pos_id_type='arange', MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, ) val_sampler = RandomSampler(data_source=val_dataset) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, 
sampler=val_sampler, num_workers=os.cpu_count(), worker_init_fn=lambda x: np.random.seed(), drop_last=False, pin_memory=True) fobj = BCEWithLogitsLoss() state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict() model = BertModelForBinaryMultiLabelClassifier( num_labels=len(LABEL_COL), config_path=MODEL_CONFIG_PATH, state_dict=state_dict, token_size=len(trn_dataset.tokenizer), MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN, cat_last_layer_num=1, do_ratio=0.2, ) optimizer = optim.Adam(model.parameters(), lr=3e-5) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCH, eta_min=1e-5) # load checkpoint model, optim, scheduler if args.checkpoint and fold == loaded_fold: load_checkpoint(args.checkpoint, model, optimizer, scheduler) for epoch in tqdm(list(range(MAX_EPOCH))): if fold <= loaded_fold and epoch <= loaded_epoch: continue if epoch < 1: model.freeze_unfreeze_bert(freeze=True, logger=logger) else: model.freeze_unfreeze_bert(freeze=False, logger=logger) model = DataParallel(model) model = model.to(DEVICE) trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE) val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test( model, fobj, val_loader, DEVICE, mode='valid') scheduler.step() if fold in histories['trn_loss']: histories['trn_loss'][fold].append(trn_loss) else: histories['trn_loss'][fold] = [ trn_loss, ] if fold in histories['val_loss']: histories['val_loss'][fold].append(val_loss) else: histories['val_loss'][fold] = [ val_loss, ] if fold in histories['val_metric']: histories['val_metric'][fold].append(val_metric) else: histories['val_metric'][fold] = [ val_metric, ] if fold in histories['val_metric_raws']: histories['val_metric_raws'][fold].append(val_metric_raws) else: histories['val_metric_raws'][fold] = [ val_metric_raws, ] logging_val_metric_raws = '' for val_metric_raw in val_metric_raws: logging_val_metric_raws += f'{float(val_metric_raw):.4f}, ' sel_log( f'fold : {fold} -- epoch : {epoch} -- ' f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- ' f'val_metric : {float(val_metric):.4f} -- ' f'val_metric_raws : {logging_val_metric_raws}', logger) model = model.to('cpu') model = model.module save_checkpoint( f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model, optimizer, scheduler, histories, val_y_preds, val_y_trues, val_qa_ids, fold, epoch, val_loss, val_metric, ) fold_best_metrics.append(np.max(histories["val_metric"][fold])) fold_best_metrics_raws.append( histories["val_metric_raws"][fold][np.argmax( histories["val_metric"][fold])]) save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer, clean=False) del model # calc training stats fold_best_metric_mean = np.mean(fold_best_metrics) fold_best_metric_std = np.std(fold_best_metrics) fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}' sel_log(fold_stats, logger) send_line_notification(fold_stats) fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0) fold_raw_stats = '' for metric_stats_raw in fold_best_metrics_raws_mean: fold_raw_stats += f'{float(metric_stats_raw):.4f},' sel_log(fold_raw_stats, logger) send_line_notification(fold_raw_stats) sel_log('now saving best checkpoints...', logger)
def train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params,
                       model_dir, restore_file=None):
    """Train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    # lower validation loss is better, so start from +inf
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(1, params.epoch_num + 1):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

        # Compute number of batches in one epoch
        params.train_steps = params.train_size // params.batch_size
        params.val_steps = params.val_size // params.batch_size

        # data iterator for training
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=True)
        # Train for one epoch on training set
        train(model, train_data_iterator, optimizer, scheduler, params)

        # data iterators for evaluation
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=False)
        val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)

        # Evaluate for one epoch on training set and validation set
        params.eval_steps = params.train_steps
        train_metrics = evaluate(model, train_data_iterator, params, mark='Train')
        params.eval_steps = params.val_steps
        val_metrics = evaluate(model, val_data_iterator, params, mark='Val')
        print("val metrics:", val_metrics)

        val_loss = val_metrics['loss']
        improvement = best_val_loss - val_loss  # positive when the loss went down

        # Save weights of the network
        model_to_save = model.module if hasattr(model, 'module') else model  # unwrap DataParallel
        optimizer_to_save = optimizer.optimizer if args.fp16 else optimizer
        # model_dir = os.path.join(model_dir, "weightedA")
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
                'optim_dict': optimizer_to_save.state_dict()
            },
            is_best=improvement > 0,
            checkpoint=model_dir)

        if improvement > 0:
            logging.info("- Found new best loss")
            best_val_loss = val_loss
            if improvement < params.patience:
                # improved, but not by enough to reset the patience counter
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging best val loss
        if (patience_counter >= params.patience_num and epoch > params.min_epoch_num) \
                or epoch == params.epoch_num:
            logging.info("Best val loss: {:05.2f}".format(best_val_loss))
            break
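# The in-line patience bookkeeping above is easy to get wrong (the original even tracked
# "higher loss is better"). An equivalent self-contained early-stopping helper,
# lower-is-better, offered as an illustrative sketch rather than the repo's own utility:
class EarlyStopping:
    def __init__(self, patience, min_delta=0.0):
        self.patience = patience      # epochs to wait without sufficient improvement
        self.min_delta = min_delta    # minimum decrease that counts as improvement
        self.best = float('inf')
        self.counter = 0

    def step(self, val_loss):
        """Record one validation loss; return True when training should stop."""
        if val_loss < self.best - self.min_delta:
            self.best = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience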
def train(train_loader, val_multi_loader, model, criterion, optimizer, lr_scheduler, start_iter, tb_logger): global best_loss batch_time = AverageMeter(config.print_freq) fw_time = AverageMeter(config.print_freq) bp_time = AverageMeter(config.print_freq) sy_time = AverageMeter(config.print_freq) step_time = AverageMeter(config.print_freq) data_time = AverageMeter(config.print_freq) losses = AverageMeter(config.print_freq) top1 = AverageMeter(config.print_freq) top2 = AverageMeter(config.print_freq) # switch to train mode model.train() world_size = dist.get_world_size() rank = dist.get_rank() logger = logging.getLogger('global_logger') end = time.time() for i, (input, target) in enumerate(train_loader): curr_step = start_iter + i lr_scheduler.step(curr_step) current_lr = lr_scheduler.get_lr()[0] # measure data loading time data_time.update(time.time() - end) # transfer input to gpu target = target.cuda() input = input.cuda() # forward output = model(input) loss = criterion(output, target) / world_size # measure accuracy and record loss prec1, prec2 = accuracy(output, target, topk=(1, 2)) reduced_loss = loss.clone() reduced_prec1 = prec1.clone() / world_size reduced_prec2 = prec2.clone() / world_size dist.all_reduce(reduced_loss) dist.all_reduce(reduced_prec1) dist.all_reduce(reduced_prec2) losses.update(reduced_loss.item()) top1.update(reduced_prec1.item()) top2.update(reduced_prec2.item()) # backward optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) if curr_step % config.print_freq == 0 and rank == 0: total_batch_size = world_size * args.batch_size epoch = (curr_step * total_batch_size) // len(train_loader.dataset) tb_logger.add_scalar('loss_train', losses.avg, curr_step) tb_logger.add_scalar('acc1_train', top1.avg, curr_step) tb_logger.add_scalar('acc2_train', top2.avg, curr_step) tb_logger.add_scalar('lr', current_lr, curr_step) logger.info('Iter: [{0}/{1}]\t' 'Epoch: {2}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'LR {lr:.4f}'.format(curr_step, len(train_loader), epoch, batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, lr=current_lr)) if curr_step > 0 and curr_step % config.val_freq == 0: if not args.no_val: total_loss = 0 for dataset_idx in range(len(val_multi_loader)): val_loss, prec1, prec2 = validate( dataset_idx, val_multi_loader[dataset_idx], model, criterion, tb_logger, curr_step=curr_step, save_softmax=True) total_loss += val_loss # average loss over multiple validation sets if len(val_multi_loader) > 0: loss = total_loss / len(val_multi_loader) else: loss = 1e9 if rank == 0: # remember best video logloss recorded at rank 0 and save checkpoint is_best = loss < best_loss best_loss = min(loss, best_loss) save_checkpoint( { 'step': curr_step, 'arch': config.model.arch, 'state_dict': model.state_dict(), 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), }, is_best, args.save_path_dated + '/ckpt') end = time.time()
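# The distributed loop above instantiates AverageMeter(config.print_freq), i.e. a meter
# that averages over the most recent N updates rather than the whole run. One plausible
# implementation (assumed; the repo's own class is not shown in this excerpt):
from collections import deque

class AverageMeter:
    def __init__(self, window_size=0):
        self.window_size = window_size  # 0 means average over everything seen
        self.reset()

    def reset(self):
        self.history = deque(maxlen=self.window_size or None)
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val):
        self.val = val
        if self.window_size > 0:
            # windowed average over the last window_size values
            self.history.append(val)
            self.avg = sum(self.history) / len(self.history)
        else:
            # running average over all values
            self.sum += val
            self.count += 1
            self.avg = self.sum / self.count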
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_fn,
                       metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
def main(): global best_acc, use_apex, mean, std, scale args = parse_args() args.mean, args.std, args.scale, args.use_apex = mean, std, scale, use_apex args.is_master = args.local_rank == 0 if args.deterministic: cudnn.deterministic = True torch.manual_seed(0) random.seed(0) np.random.seed(0) args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 and args.use_apex if args.is_master: print("opt_level = {}".format(args.opt_level)) print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32)) print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale)) print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version())) print(f"Use Apex: {args.use_apex}") print(f"Distributed Training Enabled: {args.distributed}") args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() # Scale learning rate based on global batch size # args.lr *= args.batch_size * args.world_size / 256 if args.use_apex: assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." # create model model = models.ResNet18(args.num_patches, args.num_angles) if args.sync_bn: import apex print("using apex synced BN") model = apex.parallel.convert_syncbn_model(model) model = model.cuda() optimiser = Ranger(model.parameters(), lr=args.lr) criterion = nn.CrossEntropyLoss().cuda() # Initialize Amp. Amp accepts either values or strings for the optional override arguments, # for convenient interoperation with argparse. if args.use_apex: model, optimiser = amp.initialize( model, optimiser, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) # For distributed training, wrap the model with apex.parallel.DistributedDataParallel. # This must be done AFTER the call to amp.initialize. If model = DDP(model) is called # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks. 
if args.distributed: model = DDP(model, delay_allreduce=True) else: model = nn.DataParallel(model) # Optionally resume from a checkpoint if args.resume: # Use a local scope to avoid dangling references def resume(): global best_acc if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_acc = checkpoint['best_acc'] args.poisson_rate = checkpoint["poisson_rate"] model.load_state_dict(checkpoint['state_dict']) optimiser.load_state_dict(checkpoint['optimiser']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) resume() if args.do_ssl: stl_unlabeled = datasets.STL10(root=args.data, split='unlabeled', download=args.download) indices = list(range(len(stl_unlabeled))) train_indices = indices[:int(len(indices) * 0.9)] val_indices = indices[int(len(indices) * 0.9):] train_dataset = SSLTrainDataset(Subset(stl_unlabeled, train_indices), args.num_patches, args.num_angles, args.poisson_rate) val_dataset = SSLValDataset(Subset(stl_unlabeled, val_indices), args.num_patches, args.num_angles) train_sampler = None val_sampler = None if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=fast_collate) if args.evaluate: rot_val_loss, rot_val_acc, perm_val_loss, perm_val_acc = apex_validate( val_loader, model, criterion, args) if args.is_master: utils.logger.info( f"Rot Val Loss = {rot_val_loss}, Rot Val Accuracy = {rot_val_acc}" ) utils.logger.info( f"Perm Val Loss = {perm_val_loss}, Perm Val Accuracy = {perm_val_acc}" ) return # Create dir to save model and command-line args if args.is_master: model_dir = time.ctime().replace(" ", "_").replace(":", "_") model_dir = os.path.join("models", model_dir) os.makedirs(model_dir, exist_ok=True) with open(os.path.join(model_dir, "args.json"), "w") as f: json.dump(args.__dict__, f, indent=2) writer = SummaryWriter() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) # train for one epoch rot_train_loss, rot_train_acc, perm_train_loss, perm_train_acc = apex_train( train_loader, model, criterion, optimiser, args, epoch) # evaluate on validation set rot_val_loss, rot_val_acc, perm_val_loss, perm_val_acc = apex_validate( val_loader, model, criterion, args) if (epoch + 1) % args.learn_prd == 0: args.poisson_rate += 1 train_loader.dataset.set_poisson_rate(args.poisson_rate) # remember best Acc and save checkpoint if args.is_master: is_best = perm_val_acc > best_acc best_acc = max(perm_val_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_acc': best_acc, 'optimiser': optimiser.state_dict(), "poisson_rate": args.poisson_rate }, is_best, model_dir) writer.add_scalars("Rot_Loss", { "train_loss": rot_train_loss, "val_loss": rot_val_loss }, epoch) writer.add_scalars("Perm_Loss", { "train_loss": perm_train_loss, "val_loss": perm_val_loss }, epoch) 
writer.add_scalars("Rot_Accuracy", { "train_acc": rot_train_acc, "val_acc": rot_val_acc }, epoch) writer.add_scalars("Perm_Accuracy", { "train_acc": perm_train_acc, "val_acc": perm_val_acc }, epoch) writer.add_scalar("Poisson_Rate", train_loader.dataset.pdist.rate, epoch)
def main(): hostname = socket.gethostname() setup_logging(os.path.join(args.results_dir, 'log_{}.txt'.format(hostname))) logging.info("running arguments: %s", args) best_gpu = setup_gpus() torch.cuda.set_device(best_gpu) torch.backends.cudnn.benchmark = True train_transform = get_transform(args.dataset, 'train') train_data = get_dataset(args.dataset, args.train_split, train_transform) train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_transform = get_transform(args.dataset, 'val') val_data = get_dataset(args.dataset, 'val', val_transform) val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) bit_width_list = list(map(int, args.bit_width_list.split(','))) bit_width_list.sort() model = models.__dict__[args.model](bit_width_list, train_data.num_classes).cuda() lr_decay = list(map(int, args.lr_decay.split(','))) optimizer = get_optimizer_config(model, args.optimizer, args.lr, args.weight_decay) lr_scheduler = None best_prec1 = None if args.resume and args.resume != 'None': if os.path.isdir(args.resume): args.resume = os.path.join(args.resume, 'model_best.pth.tar') if os.path.isfile(args.resume): checkpoint = torch.load(args.resume, map_location='cuda:{}'.format(best_gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler = get_lr_scheduler(args.optimizer, optimizer, lr_decay, checkpoint['epoch']) logging.info("loaded resume checkpoint '%s' (epoch %s)", args.resume, checkpoint['epoch']) else: raise ValueError('Pretrained model path error!') elif args.pretrain and args.pretrain != 'None': if os.path.isdir(args.pretrain): args.pretrain = os.path.join(args.pretrain, 'model_best.pth.tar') if os.path.isfile(args.pretrain): checkpoint = torch.load(args.pretrain, map_location='cuda:{}'.format(best_gpu)) model.load_state_dict(checkpoint['state_dict'], strict=False) logging.info("loaded pretrain checkpoint '%s' (epoch %s)", args.pretrain, checkpoint['epoch']) else: raise ValueError('Pretrained model path error!') if lr_scheduler is None: lr_scheduler = get_lr_scheduler(args.optimizer, optimizer, lr_decay) num_parameters = sum([l.nelement() for l in model.parameters()]) logging.info("number of parameters: %d", num_parameters) criterion = nn.CrossEntropyLoss().cuda() criterion_soft = CrossEntropyLossSoft().cuda() sum_writer = SummaryWriter(args.results_dir + '/summary') for epoch in range(args.start_epoch, args.epochs): model.train() train_loss, train_prec1, train_prec5 = forward(train_loader, model, criterion, criterion_soft, epoch, True, optimizer, sum_writer) model.eval() val_loss, val_prec1, val_prec5 = forward(val_loader, model, criterion, criterion_soft, epoch, False) if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): lr_scheduler.step(val_loss) else: lr_scheduler.step() if best_prec1 is None: is_best = True best_prec1 = val_prec1[-1] else: is_best = val_prec1[-1] > best_prec1 best_prec1 = max(val_prec1[-1], best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'model': args.model, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict() }, is_best, path=args.results_dir + '/ckpt') if sum_writer is not None: sum_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=epoch) for bw, tl, tp1, tp5, vl, vp1, vp5 in 
zip(bit_width_list, train_loss, train_prec1, train_prec5, val_loss, val_prec1, val_prec5): sum_writer.add_scalar('train_loss_{}'.format(bw), tl, global_step=epoch) sum_writer.add_scalar('train_prec_1_{}'.format(bw), tp1, global_step=epoch) sum_writer.add_scalar('train_prec_5_{}'.format(bw), tp5, global_step=epoch) sum_writer.add_scalar('val_loss_{}'.format(bw), vl, global_step=epoch) sum_writer.add_scalar('val_prec_1_{}'.format(bw), vp1, global_step=epoch) sum_writer.add_scalar('val_prec_5_{}'.format(bw), vp5, global_step=epoch) logging.info('Epoch {}: \ntrain loss {:.2f}, train prec1 {:.2f}, train prec5 {:.2f}\n' ' val loss {:.2f}, val prec1 {:.2f}, val prec5 {:.2f}'.format( epoch, train_loss[-1], train_prec1[-1], train_prec5[-1], val_loss[-1], val_prec1[-1], val_prec5[-1]))
swa_state_dict = checkpoint['swa_state_dict'] if swa_state_dict is not None: swa_model.load_state_dict(swa_state_dict) swa_n_ckpt = checkpoint['swa_n'] if swa_n_ckpt is not None: swa_n = swa_n_ckpt columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time'] if args.swa: columns = columns[:-1] + ['swa_te_loss', 'swa_te_acc'] + columns[-1:] swa_res = {'loss': None, 'accuracy': None} utils.save_checkpoint( args.dir, start_epoch, state_dict=model.state_dict(), swa_state_dict=swa_model.state_dict() if args.swa else None, swa_n=swa_n if args.swa else None, optimizer=optimizer.state_dict() ) for epoch in range(start_epoch, args.epochs): time_ep = time.time() lr = schedule(epoch) utils.adjust_learning_rate(optimizer, lr) train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer) if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: test_res = utils.eval(loaders['test'], model, criterion) else: test_res = {'loss': None, 'accuracy': None}
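# The SWA snippet above loads and saves swa_state_dict/swa_n, but the averaging update
# itself happens in elided code (conventionally utils.moving_average(swa_model, model,
# 1.0 / (swa_n + 1)) after each collected model). The standard update is a parameter-wise
# convex combination of the running average and the new weights; a sketch following the
# SWA reference code, offered as an assumption about the utils module used here:
def moving_average(net1, net2, alpha=1.0):
    """net1 <- (1 - alpha) * net1 + alpha * net2, parameter-wise."""
    for param1, param2 in zip(net1.parameters(), net2.parameters()):
        param1.data *= (1.0 - alpha)
        param1.data += param2.data * alpha

# usage after each SWA collection point:
#   moving_average(swa_model, model, 1.0 / (swa_n + 1))
#   swa_n += 1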
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics,
                       params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params, num_steps)

        # Evaluate for one epoch on validation set
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, params, num_steps)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # periodically plot the training loss curve (train_loss is expected to be
        # collected at module level by train())
        if epoch % 100 == 0:
            plt.plot(train_loss)
            plt.savefig(str(epoch) + " epoch.jpg")
def train(train_epoch, phase='train'):
    global global_step

    lr_decay(global_step)
    print("epoch %3d with lr=%.02e" % (train_epoch, get_lr()))

    # train/eval mode and gradient tracking follow the phase
    ssrn.train(phase == 'train')
    torch.set_grad_enabled(phase == 'train')

    data_loader = train_data_loader if phase == 'train' else valid_data_loader

    it = 0
    running_loss = 0.0
    running_l1_loss = 0.0

    pbar = tqdm(data_loader, unit="audios", unit_scale=data_loader.batch_size,
                disable=hp.disable_progress_bar)
    for batch in pbar:
        M, S = batch['mags'], batch['mels']
        M = M.permute(0, 2, 1)  # TODO: because of pre processing
        S = S.permute(0, 2, 1)  # TODO: because of pre processing
        M.requires_grad = False
        M = M.cuda()
        S = S.cuda()

        Z_logit, Z = ssrn(S)

        l1_loss = F.l1_loss(Z, M)
        loss = l1_loss

        if phase == 'train':
            lr_decay(global_step)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

        it += 1

        loss = loss.item()
        l1_loss = l1_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss

        if phase == 'train':
            # update the progress bar
            pbar.set_postfix({'l1': "%.05f" % (running_l1_loss / it)})

            logger.log_step(phase, global_step, {'loss_l1': l1_loss},
                            {'mags-true': M[:1, :, :],
                             'mags-pred': Z[:1, :, :],
                             'mels': S[:1, :, :]})

            if global_step % 5000 == 0:
                # checkpoint at every 5000th step
                save_checkpoint(logger.logdir, train_epoch, global_step, ssrn, optimizer)

    epoch_loss = running_loss / it
    epoch_l1_loss = running_l1_loss / it
    logger.log_epoch(phase, global_step, {'loss_l1': epoch_l1_loss})

    return epoch_loss
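# lr_decay(global_step) above is called but not defined in this excerpt. DC-TTS/SSRN
# training scripts commonly use a Noam-style warmup-then-decay schedule; a sketch of that
# shape, where the base rate hp.lr, the warmup constant, and the reliance on the
# module-level optimizer are all assumptions carried over from the snippet above:
def lr_decay(step, warmup_steps=4000):
    # linear warmup for warmup_steps, then inverse-sqrt decay (step + 1 avoids
    # division by zero on the very first call)
    new_lr = hp.lr * warmup_steps ** 0.5 * min((step + 1) * warmup_steps ** -1.5,
                                               (step + 1) ** -0.5)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr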
def main():
    global best_acc

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    # data
    transformations = get_transforms(input_size=args.image_size, test_size=args.image_size)
    train_set = data_gen.Dataset(root=args.train_txt_path, transform=transformations['val_train'])
    train_loader = data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
    val_set = data_gen.ValDataset(root=args.val_txt_path, transform=transformations['val_test'])
    val_loader = data.DataLoader(val_set, batch_size=args.batch_size, shuffle=False)

    # model
    model = make_model(args)
    if use_cuda:
        model.cuda()

    # define loss function and optimizer
    if use_cuda:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = get_optimizer(model, args)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                           factor=0.2, patience=5,
                                                           verbose=False)

    # load checkpoint
    start_epoch = args.start_epoch
    # if args.resume:
    #     print("===> Resuming from checkpoint")
    #     assert os.path.isfile(args.resume), 'Error: no checkpoint directory found'
    #     args.checkpoint = os.path.dirname(args.resume)  # strip the filename, keep the directory
    #     checkpoint = torch.load(args.resume)
    #     best_acc = checkpoint['best_acc']
    #     start_epoch = checkpoint['epoch']
    #     model.module.load_state_dict(checkpoint['state_dict'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])

    # train
    for epoch in range(start_epoch, args.epochs):
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs,
                                             optimizer.param_groups[0]['lr']))
        train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda)
        test_loss, val_acc = val(val_loader, model, criterion, epoch, use_cuda)

        scheduler.step(test_loss)

        print(f'train_loss:{train_loss}\t val_loss:{test_loss}\t '
              f'train_acc:{train_acc}\t val_acc:{val_acc}')

        # save model
        is_best = val_acc >= best_acc
        best_acc = max(val_acc, best_acc)
        save_checkpoint(
            {
                'fold': 0,
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'train_acc': train_acc,
                'acc': val_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            single=True,
            checkpoint=args.checkpoint)

    print("best acc = ", best_acc)
def main(): logger.info("Logger is set - training start") # set default gpu device id torch.cuda.set_device(config.gpus[0]) # set seed np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.benchmark = True # get data with meta info input_size, input_channels, n_classes, train_data, valid_data = utils.get_data( config.dataset, config.data_path, config.cutout_length, validation=True) criterion = nn.CrossEntropyLoss().to(device) use_aux = config.aux_weight > 0.0 if config.dataset in utils.LARGE_DATASETS: model = AugmentCNNImageNet( input_size, input_channels, config.init_channels, n_classes, config.layers, use_aux, config.genotype, ) else: model = AugmentCNN( input_size, input_channels, config.init_channels, n_classes, config.layers, use_aux, config.genotype, ) model = nn.DataParallel(model, device_ids=config.gpus).to(device) # model size mb_params = utils.param_size(model) logger.info("Model size = {:.3f} MB".format(mb_params)) # weights optimizer optimizer = torch.optim.SGD( model.parameters(), config.lr, momentum=config.momentum, weight_decay=config.weight_decay, ) train_loader = torch.utils.data.DataLoader( train_data, batch_size=config.batch_size, shuffle=True, num_workers=config.workers, pin_memory=True, ) valid_loader = torch.utils.data.DataLoader( valid_data, batch_size=config.batch_size, shuffle=False, num_workers=config.workers, pin_memory=True, ) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, config.epochs) best_top1 = 0.0 # training loop for epoch in range(config.epochs): lr_scheduler.step() drop_prob = config.drop_path_prob * epoch / config.epochs model.module.drop_path_prob(drop_prob) # training train(train_loader, model, optimizer, criterion, epoch) # validation cur_step = (epoch + 1) * len(train_loader) top1 = validate(valid_loader, model, criterion, epoch, cur_step) # save if best_top1 < top1: best_top1 = top1 is_best = True else: is_best = False utils.save_checkpoint(model, config.path, is_best) print("") logger.info("Final best Prec@1 = {:.4%} for job {}".format( best_top1, config.name))
best_acc = 0 for epoch in range(config['num_epochs']): train_loss = train(model, data_loader=train_loader, criterion=criterion, optimizer=optimizer, epoch=epoch, to_log=path['log']) test_loss, acc = test(model, test_loader=test_loader, criterion=criterion, to_log=path['log']) if acc >= best_acc: best_acc = acc save_checkpoint(model.state_dict(), is_best=True, checkpoint=path['dir']) else: save_checkpoint(model.state_dict(), is_best=False, checkpoint=path['dir']) lr_scheduler.step() metrics_dic['loss'].append(test_loss) metrics_dic['precision'].append(acc) # print best acc after training write_log("<<<<< Best Accuracy = {:.2f} >>>>>".format(best_acc), path['log'])
def main() -> None: global best_loss args = parser.parse_args() if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True warnings.warn('You have chosen to seed training. ' 'This will turn on the CUDNN deterministic setting, ' 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') start_epoch = 0 vcf_reader = VCFReader(args.train_data, args.classification_map, args.chromosome, args.class_hierarchy) vcf_writer = vcf_reader.get_vcf_writer() train_dataset, validation_dataset = vcf_reader.get_datasets( args.validation_split) train_sampler = BatchByLabelRandomSampler(args.batch_size, train_dataset.labels) train_loader = DataLoader(train_dataset, batch_sampler=train_sampler) if args.validation_split != 0: validation_sampler = BatchByLabelRandomSampler( args.batch_size, validation_dataset.labels) validation_loader = DataLoader(validation_dataset, batch_sampler=validation_sampler) kwargs = { 'total_size': vcf_reader.positions.shape[0], 'window_size': args.window_size, 'num_layers': args.layers, 'num_classes': len(vcf_reader.label_encoder.classes_), 'num_super_classes': len(vcf_reader.super_label_encoder.classes_) } model = WindowedMLP(**kwargs) model.to(get_device(args)) optimizer = AdamW(model.parameters(), lr=args.learning_rate) ####### if args.resume_path is not None: if os.path.isfile(args.resume_path): print("=> loading checkpoint '{}'".format(args.resume_path)) checkpoint = torch.load(args.resume_path) if kwargs != checkpoint['model_kwargs']: raise ValueError( 'The checkpoint\'s kwargs don\'t match the ones used to initialize the model' ) if vcf_reader.snps.shape[0] != checkpoint['vcf_writer'].snps.shape[ 0]: raise ValueError( 'The data on which the checkpoint was trained had a different number of snp positions' ) start_epoch = checkpoint['epoch'] best_loss = checkpoint['best_loss'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume_path, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) ############# if args.validate: validate(validation_loader, model, nn.functional.binary_cross_entropy_with_logits, len(vcf_reader.label_encoder.classes_), len(vcf_reader.super_label_encoder.classes_), vcf_reader.maf, args) return for epoch in range(start_epoch, args.epochs + start_epoch): loss = train(train_loader, model, nn.functional.binary_cross_entropy_with_logits, optimizer, len(vcf_reader.label_encoder.classes_), len(vcf_reader.super_label_encoder.classes_), vcf_reader.maf, epoch, args) if epoch % args.save_freq == 0 or epoch == args.epochs + start_epoch - 1: if args.validation_split != 0: validation_loss = validate( validation_loader, model, nn.functional.binary_cross_entropy_with_logits, len(vcf_reader.label_encoder.classes_), len(vcf_reader.super_label_encoder.classes_), vcf_reader.maf, args) is_best = validation_loss < best_loss best_loss = min(validation_loss, best_loss) else: is_best = loss < best_loss best_loss = min(loss, best_loss) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'model_kwargs': kwargs, 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), 'vcf_writer': vcf_writer, 'label_encoder': vcf_reader.label_encoder, 'super_label_encoder': vcf_reader.super_label_encoder, 'maf': vcf_reader.maf }, is_best, args.chromosome, args.model_name, args.model_dir)
def main():
    global args, best_result, output_directory, train_csv, test_csv

    # Data loading code
    print("=> creating data loaders...")
    # valdir = os.path.join('..', 'data', args.data, 'val')
    # valdir = "/home/titan-nano/Documents/DLProject/data/rgbd/val/img"
    data_dir = '/p300/dataset'
    train_dir = os.path.join(data_dir, 'data', args.data, 'train')
    val_dir = os.path.join(data_dir, 'data', args.data, 'val')

    if args.data == 'nyudepthv2':
        from dataloaders.nyu import NYUDataset
        train_dataset = NYUDataset(train_dir, split='train', modality=args.modality)
        val_dataset = NYUDataset(val_dir, split='val', modality=args.modality)
    elif args.data == 'rgbd':
        from dataloaders.sist import RGBDDataset
        train_dataset = RGBDDataset(train_dir, split='train', modality=args.modality)
        val_dataset = RGBDDataset(val_dir, split='val', modality=args.modality)
    else:
        raise RuntimeError('Dataset not found.')

    # batch size is set to 1 for both loaders in this fine-tuning script
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False,
                                               num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False,
                                             num_workers=args.workers, pin_memory=True)
    print("=> data loaders created.")

    ############################## Resume Mode ##############################
    # loading pretrained model
    print("=> loading model '{}'".format(args.evaluate))
    args.start_epoch = 0
    checkpoint = torch.load(args.evaluate)
    if type(checkpoint) is dict:
        model = checkpoint['model']
        print("=> loaded best model (epoch {})".format(checkpoint['epoch']))
    else:
        model = checkpoint

    ############################## Training Setting ##############################
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # define loss function (criterion)
    criterion = None
    if args.criterion == 'l2':
        criterion = criteria.MaskedMSELoss().cuda()
    elif args.criterion == 'l1':
        criterion = criteria.MaskedL1Loss().cuda()

    output_directory = os.path.dirname(args.evaluate)
    best_txt = os.path.join(output_directory, 'best.txt')
    best_model = model  # fallback so the final save below is always defined

    ############################## Training ##############################
    for epoch in range(args.epochs):
        utils.adjust_learning_rate(optimizer, epoch, args.lr)
        train(train_loader, model, criterion, optimizer, epoch)
        result, img_merge = validate(val_loader, model, epoch)

        # remember best rmse and save checkpoint
        is_best = result.rmse < best_result.rmse
        if is_best:
            best_result = result
            best_model = model
            with open(best_txt, 'w') as txtfile:
                txtfile.write(
                    "epoch={}\nmse={:.3f}\nrmse={:.3f}\nabsrel={:.3f}\nlg10={:.3f}\n"
                    "mae={:.3f}\ndelta1={:.3f}\nt_gpu={:.4f}\n".format(
                        epoch, result.mse, result.rmse, result.absrel, result.lg10,
                        result.mae, result.delta1, result.gpu_time))
            if img_merge is not None:
                img_filename = output_directory + '/comparison_best.png'
                utils.save_image(img_merge, img_filename)

        utils.save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'arch': args.arch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, epoch, output_directory)

    # save loss file (history_loss is expected to be populated by train() at module level)
    loss_file = np.array(history_loss)
    np.savetxt(output_directory + '/loss.txt', loss_file)

    torch.save(best_model.state_dict(), output_directory + '/best_model.pkl')
def train_net(args): torch.manual_seed(7) np.random.seed(7) checkpoint = args.checkpoint start_epoch = 0 best_acc = float('-inf') writer = SummaryWriter() epochs_since_improvement = 0 # Initialize / load checkpoint if checkpoint is None: model = MobileFaceNet() metric_fc = ArcMarginModel(args) optimizer = torch.optim.SGD([{ 'params': model.conv1.parameters() }, { 'params': model.dw_conv.parameters() }, { 'params': model.features.parameters() }, { 'params': model.conv2.parameters() }, { 'params': model.gdconv.parameters() }, { 'params': model.conv3.parameters(), 'weight_decay': 4e-4 }, { 'params': model.bn.parameters() }, { 'params': metric_fc.parameters() }], lr=args.lr, momentum=args.mom, weight_decay=args.weight_decay, nesterov=True) model = nn.DataParallel(model) metric_fc = nn.DataParallel(metric_fc) else: checkpoint = torch.load(checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] model = checkpoint['model'] metric_fc = checkpoint['metric_fc'] optimizer = checkpoint['optimizer'] logger = get_logger() # Move to GPU, if available model = model.to(device) metric_fc = metric_fc.to(device) # Loss function if args.focal_loss: criterion = FocalLoss(gamma=args.gamma).to(device) else: criterion = nn.CrossEntropyLoss().to(device) # Custom dataloaders train_dataset = ArcFaceDataset('train') train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4) scheduler = MultiStepLR(optimizer, milestones=[5, 10, 15, 20], gamma=0.1) # Epochs for epoch in range(start_epoch, args.end_epoch): # One epoch's training train_loss, train_acc = train(train_loader=train_loader, model=model, metric_fc=metric_fc, criterion=criterion, optimizer=optimizer, epoch=epoch, logger=logger) lr = optimizer.param_groups[0]['lr'] print('\nLearning rate={}\n'.format(lr)) writer.add_scalar('model/train_loss', train_loss, epoch) writer.add_scalar('model/train_acc', train_acc, epoch) writer.add_scalar('model/learning_rate', lr, epoch) # One epoch's validation megaface_acc = megaface_test(model) writer.add_scalar('model/megaface_accuracy', megaface_acc, epoch) # Check if there was an improvement is_best = megaface_acc > best_acc best_acc = max(megaface_acc, best_acc) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 # Save checkpoint save_checkpoint(epoch, epochs_since_improvement, model, metric_fc, optimizer, best_acc, is_best) scheduler.step(epoch)
def train_net(args): torch.manual_seed(7) np.random.seed(7) checkpoint = args.checkpoint start_epoch = 0 best_acc = 0 writer = SummaryWriter() epochs_since_improvement = 0 # Initialize / load checkpoint if checkpoint is None: if args.network == 'r18': model = resnet18(args) elif args.network == 'r34': model = resnet34(args) elif args.network == 'r50': model = resnet50(args) elif args.network == 'r101': model = resnet101(args) elif args.network == 'r152': model = resnet152(args) elif args.network == 'mobile': model = MobileNet(1.0) else: model = resnet_face18(args.use_se) model = nn.DataParallel(model) metric_fc = ArcMarginModel(args) metric_fc = nn.DataParallel(metric_fc) if args.optimizer == 'sgd': optimizer = torch.optim.SGD([{ 'params': model.parameters() }, { 'params': metric_fc.parameters() }], lr=args.lr, momentum=args.mom, weight_decay=args.weight_decay) else: optimizer = torch.optim.Adam([{ 'params': model.parameters() }, { 'params': metric_fc.parameters() }], lr=args.lr, weight_decay=args.weight_decay) else: checkpoint = torch.load(checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] model = checkpoint['model'] metric_fc = checkpoint['metric_fc'] optimizer = checkpoint['optimizer'] logger = get_logger() # Move to GPU, if available model = model.to(device) metric_fc = metric_fc.to(device) # Loss function if args.focal_loss: criterion = FocalLoss(gamma=args.gamma).to(device) else: criterion = nn.CrossEntropyLoss().to(device) # Custom dataloaders train_dataset = ArcFaceDataset('train') train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1) # Epochs for epoch in range(start_epoch, args.end_epoch): scheduler.step() if args.full_log: lfw_acc, threshold = lfw_test(model) writer.add_scalar('LFW_Accuracy', lfw_acc, epoch) full_log(epoch) start = datetime.now() # One epoch's training train_loss, train_top5_accs = train(train_loader=train_loader, model=model, metric_fc=metric_fc, criterion=criterion, optimizer=optimizer, epoch=epoch, logger=logger) writer.add_scalar('Train_Loss', train_loss, epoch) writer.add_scalar('Train_Top5_Accuracy', train_top5_accs, epoch) end = datetime.now() delta = end - start print('{} seconds'.format(delta.seconds)) # One epoch's validation lfw_acc, threshold = lfw_test(model) writer.add_scalar('LFW_Accuracy', lfw_acc, epoch) # Check if there was an improvement is_best = lfw_acc > best_acc best_acc = max(lfw_acc, best_acc) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 # Save checkpoint save_checkpoint(epoch, epochs_since_improvement, model, metric_fc, optimizer, best_acc, is_best)
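# ArcMarginModel is imported from elsewhere in the repo. For reference, a
# sketch of the standard additive-angular-margin (ArcFace) head it presumably
# resembles; in_features/out_features and the s, m defaults are assumptions:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ArcMarginHead(nn.Module):
    def __init__(self, in_features, out_features, s=64.0, m=0.5):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.s = s  # feature scale
        self.m = m  # additive angular margin

    def forward(self, x, label):
        # cosine similarity between normalized features and class centers
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        theta = torch.acos(cosine.clamp(-1.0 + 1e-7, 1.0 - 1e-7))
        target = torch.cos(theta + self.m)  # margin applied to the true class
        one_hot = F.one_hot(label, num_classes=cosine.size(1)).float()
        return self.s * (one_hot * target + (1.0 - one_hot) * cosine)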
def train(model, loss_fn, optimizer, history, trainset, valset, config):
    """
    Trains the model by optimizing with respect to the given loss
    function using the given optimizer.

    Args:
        model: torch.nn.Module
            Defines the model.
        loss_fn: torch.nn.Module
            Defines the loss function.
        optimizer: torch.optim.optimizer
            Defines the optimizer.
        history: dict
            Contains histories of desired run metrics.
        trainset: torch.utils.data.Dataset
            Contains the training data.
        valset: torch.utils.data.Dataset
            Contains the validation data.
        config: dict
            Configures the training loop. Contains the following keys:

            batch_size: int
                The number of examples to process per batch.
                Default value is 20.
            start_epoch: int
                The epoch to start on for training.
                Default value is 1.
            num_epochs: int
                How many epochs to train the model.
                Default value is 20.
            log_every: int
                How often to save model checkpoints, in epochs. To turn
                off checkpointing, set this value to 0. Default value is 5.
            plot_every: int
                How often to generate plots, in epochs. To turn off
                plotting, set this value to 0. Default value is 5.
            num_workers: int
                How many workers to assign to the DataLoader.
                Default value is 4.
            checkpoint_dir: str
                Directory in which to store checkpoints and plots.
                Default value is "checkpoints".
            gamma: float
                Decay factor for the exponential LR scheduler.
                Default value is 0.1.
            verbose: boolean
                Whether or not to print results to console during
                training. Progress bar is still included.
                Default value is False.

    Returns:
        model: a torch.nn.Module defining the trained model.
    """
    # Get keyword parameter values
    batch_size = config.get("batch_size", 20)
    start_epoch = config.get("start_epoch", 1)
    num_epochs = config.get("num_epochs", 20)
    log_every = config.get("log_every", 5)
    plot_every = config.get("plot_every", 5)
    num_workers = config.get("num_workers", 4)
    checkpoint_dir = config.get("checkpoint_dir", "checkpoints")
    verbose = config.get("verbose", False)
    gamma = config.get("gamma", 0.1)

    # Learning rate scheduler
    scheduler = ExponentialLR(optimizer, gamma=gamma)

    # Use the f1 score to determine best checkpoint
    best_val_f1 = 0

    # Training loop
    for epoch in tqdm(range(start_epoch, num_epochs + 1), desc="Epochs",
                      position=0):
        # Process training dataset
        model, train_results, train_cm = \
            process_batches(model, trainset, loss_fn, batch_size, num_workers,
                            desc="train", optimizer=optimizer,
                            is_training=True)
        if verbose:
            tqdm.write(
                PROGRESS_MSG.format(train_results["accuracy"],
                                    train_results["precision"],
                                    train_results["recall"],
                                    train_results["f1"]))

        # Process validation dataset
        val_results, val_cm = \
            process_batches(model, valset, loss_fn, batch_size, num_workers,
                            desc="val", optimizer=optimizer,
                            is_training=False)
        if verbose:
            tqdm.write(
                PROGRESS_MSG.format(val_results["accuracy"],
                                    val_results["precision"],
                                    val_results["recall"],
                                    val_results["f1"]))

        # Take step for LR scheduler
        scheduler.step()

        # Update run history
        for name, val in train_results.items():
            history["train_{}".format(name)].append(val)
        for name, val in val_results.items():
            history["val_{}".format(name)].append(val)

        # Update best checkpoint
        if val_results["f1"] > best_val_f1:
            if verbose:
                tqdm.write("New best checkpoint!")
            best_val_f1 = val_results["f1"]
            filepath = os.path.join(checkpoint_dir, "best_checkpoint")
            save_checkpoint(model, optimizer, history, epoch + 1, filepath)

        # Save checkpoint
        if log_every != 0 and epoch % log_every == 0:
            if verbose:
                tqdm.write("Saving checkpoint...")
            filename = "checkpoint_epoch_{}".format(epoch)
            filepath = os.path.join(checkpoint_dir, filename)
            save_checkpoint(model, optimizer, history, epoch + 1, filepath)

        # Generate plots
        if plot_every != 0 and epoch % plot_every == 0:
            if verbose:
                tqdm.write("Generating plots...")
            generate_plots(history, checkpoint_dir)

    return model
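# Hypothetical usage of train(); model, loss_fn, optimizer, trainset and
# valset are assumed to be constructed elsewhere:
from collections import defaultdict

config = {
    "batch_size": 64,            # override the default of 20
    "num_epochs": 20,
    "log_every": 5,              # checkpoint every 5 epochs
    "plot_every": 0,             # disable plotting
    "checkpoint_dir": "checkpoints",
    "verbose": True,
}
history = defaultdict(list)      # "train_*" / "val_*" metric lists get appended
model = train(model, loss_fn, optimizer, history, trainset, valset, config)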
def train(train_epoch, phase='train'):
    global global_step

    lr_decay(global_step)
    print("epoch %3d with lr=%.02e" % (train_epoch, get_lr()))

    if phase == 'train':
        text2mel.train()
    else:
        text2mel.eval()
    torch.set_grad_enabled(phase == 'train')

    data_loader = train_data_loader if phase == 'train' else valid_data_loader

    it = 0
    running_loss = 0.0
    running_l1_loss = 0.0
    running_att_loss = 0.0

    pbar = tqdm(data_loader, unit="audios",
                unit_scale=data_loader.batch_size,
                disable=hp.disable_progress_bar)
    for batch in pbar:
        L, S, gates = batch['texts'], batch['mels'], batch['mel_gates']
        S = S.permute(0, 2, 1)  # TODO: because of pre processing

        B, N = L.size()  # batch size and text count
        _, n_mels, T = S.size()  # number of melspectrogram bins and time

        assert gates.size(0) == B  # TODO: later remove
        assert gates.size(1) == T

        S_shifted = torch.cat((S[:, :, 1:], torch.zeros(B, n_mels, 1)), 2)

        S.requires_grad = False
        S_shifted.requires_grad = False
        gates.requires_grad = False

        def W_nt(_, n, t, g=0.2):
            return 1.0 - np.exp(-((n / float(N) - t / float(T)) ** 2) /
                                (2 * g ** 2))

        W = np.fromfunction(W_nt, (B, N, T), dtype=np.float32)
        W = torch.from_numpy(W)

        L = L.cuda()
        S = S.cuda()
        S_shifted = S_shifted.cuda()
        W = W.cuda()
        gates = gates.cuda()

        Y_logit, Y, A = text2mel(L, S)

        l1_loss = F.l1_loss(Y, S_shifted)
        masks = gates.reshape(B, 1, T).float()
        att_loss = (A * W * masks).mean()
        loss = l1_loss + att_loss

        if phase == 'train':
            lr_decay(global_step)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

        it += 1

        loss, l1_loss, att_loss = loss.item(), l1_loss.item(), att_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss
        running_att_loss += att_loss

        if phase == 'train':
            # update the progress bar
            pbar.set_postfix({
                'l1': "%.05f" % (running_l1_loss / it),
                'att': "%.05f" % (running_att_loss / it)
            })
            logger.log_step(phase, global_step,
                            {'loss_l1': l1_loss, 'loss_att': att_loss},
                            {'mels-true': S[:1, :, :],
                             'mels-pred': Y[:1, :, :],
                             'attention': A[:1, :, :]})
            if global_step % 5000 == 0:
                # checkpoint at every 5000th step
                save_checkpoint(logger.logdir, train_epoch, global_step,
                                text2mel, optimizer)

    epoch_loss = running_loss / it
    epoch_l1_loss = running_l1_loss / it
    epoch_att_loss = running_att_loss / it
    logger.log_epoch(phase, global_step,
                     {'loss_l1': epoch_l1_loss, 'loss_att': epoch_att_loss})

    return epoch_loss
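# The W matrix above is a guided-attention weight (as in DC-TTS-style models):
# it is near 0 on the diagonal n/N == t/T and approaches 1 away from it, so
# att_loss penalizes attention mass that strays off the diagonal. A standalone
# check with small N and T:
import numpy as np

N, T, g = 5, 8, 0.2
n = np.arange(N, dtype=np.float32)[:, None]
t = np.arange(T, dtype=np.float32)[None, :]
W = 1.0 - np.exp(-((n / N - t / T) ** 2) / (2 * g ** 2))
print(np.round(W, 2))  # near-zero along the diagonal, close to 1 in the corners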
def main():
    global n_iter
    args = parser.parse_args()
    save_path = save_path_formatter(args, parser)
    args.save_path = 'checkpoints' / save_path
    print('=> will save everything to {}'.format(args.save_path))
    args.save_path.makedirs_p()
    torch.manual_seed(args.seed)

    training_writer = SummaryWriter(args.save_path)
    output_writers = []
    if args.log_output:
        for i in range(3):
            output_writers.append(
                SummaryWriter(args.save_path / 'valid' / str(i)))

    # Data loading code
    normalize = custom_transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                            std=[0.5, 0.5, 0.5])
    train_transform = custom_transforms.Compose([
        custom_transforms.RandomScaleCrop(),
        custom_transforms.ArrayToTensor(), normalize
    ])
    valid_transform = custom_transforms.Compose(
        [custom_transforms.ArrayToTensor(), normalize])

    print("=> fetching scenes in '{}'".format(args.data))
    train_set = SequenceFolder(args.data, transform=train_transform,
                               seed=args.seed, ttype=args.ttype)
    val_set = SequenceFolder(args.data, transform=valid_transform,
                             seed=args.seed, ttype=args.ttype2)
    print('{} samples found in {} train scenes'.format(
        len(train_set), len(train_set.scenes)))
    print('{} samples found in {} valid scenes'.format(
        len(val_set), len(val_set.scenes)))
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.epoch_size == 0:
        args.epoch_size = len(train_loader)

    # create model
    print("=> creating model")
    dpsnet = PSNet(args.nlabel, args.mindepth).cuda()

    if args.pretrained_dps:
        print("=> using pre-trained weights for DPSNet")
        weights = torch.load(args.pretrained_dps)
        dpsnet.load_state_dict(weights['state_dict'])
    else:
        dpsnet.init_weights()

    cudnn.benchmark = True
    dpsnet = torch.nn.DataParallel(dpsnet)

    print('=> setting adam solver')
    parameters = chain(dpsnet.parameters())
    optimizer = torch.optim.Adam(parameters, args.lr,
                                 betas=(args.momentum, args.beta),
                                 weight_decay=args.weight_decay)

    with open(args.save_path / args.log_summary, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss', 'validation_loss'])

    with open(args.save_path / args.log_full, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss'])

    for epoch in range(args.epochs):
        adjust_learning_rate(args, optimizer, epoch)

        # train for one epoch
        train_loss = train(args, train_loader, dpsnet, optimizer,
                           args.epoch_size, training_writer)

        errors, error_names = validate_with_gt(args, val_loader, dpsnet,
                                               epoch, output_writers)
        error_string = ', '.join('{} : {:.3f}'.format(name, error)
                                 for name, error in zip(error_names, errors))
        for error, name in zip(errors, error_names):
            training_writer.add_scalar(name, error, epoch)

        # Choose the most relevant error to measure your model's performance;
        # be careful, some measures are to be maximized (such as a1, a2, a3).
        decisive_error = errors[0]

        save_checkpoint(args.save_path, {
            'epoch': epoch + 1,
            'state_dict': dpsnet.module.state_dict()
        }, epoch)

        with open(args.save_path / args.log_summary, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow([train_loss, decisive_error])
def run(args): start_epoch = 1 best_loss = 1e+9 # logs args.logdir = get_logdir(args) logger = get_logger(os.path.join(args.logdir, 'main.log')) logger.info(args) writer = SummaryWriter(args.logdir) # data train_set = MovingMNIST(root='./data', train=True, download=True) valid_set = MovingMNIST(root='./data', train=False, download=True, split=args.test_size) train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=args.n_workers, shuffle=True) valid_loader = DataLoader(valid_set, batch_size=args.batch_size, num_workers=args.n_workers, shuffle=False) # network model = models.__dict__[args.model](args=args) model = nn.DataParallel(model) args.device = torch.device( 'cuda:0' if torch.cuda.is_available() else 'cpu') model = model.to(args.device) # training criterion = get_loss_fn(args) optimizer = get_optimizer(model, args) scheduler = get_scheduler(optimizer, args) if args.resume: if os.path.isfile(args.resume): checkpoint = torch.load(args.resume) start_epoch = checkpoint['epoch'] + 1 best_loss = checkpoint['best/{}'.format(args.loss)] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger.info('Loaded checkpoint {} (epoch {})'.format( args.resume, start_epoch - 1)) else: raise IOError('No such file {}'.format(args.resume)) for epoch_i in range(start_epoch, args.epochs + 1): training = train(train_loader, model, criterion, optimizer, logger=logger, args=args) validation = validate(valid_loader, model, criterion, logger=logger, args=args) writer.add_scalar('Train/{}'.format(args.loss), training[args.loss], epoch_i) writer.add_scalar('Valid/{}'.format(args.loss), validation[args.loss], epoch_i) writer.add_image('Train/Predict', _get_images(training['output'], args), epoch_i) writer.add_image('Train/Target', _get_images(training['target'], args), epoch_i) writer.add_image('Valid/Predict', _get_images(validation['output'], args), epoch_i) writer.add_image('Valid/Target', _get_images(validation['target'], args), epoch_i) message = '[{}] Epoch {} Train/{} {:.4f} Valid/{} {:.4f} ' message = message.format( args.expid, epoch_i, args.loss, training[args.loss], args.loss, validation[args.loss], ) is_best = validation[args.loss] < best_loss if is_best: best_loss = validation[args.loss] message += '(Best)' save_checkpoint( { 'epoch': epoch_i, 'state_dict': model.state_dict(), 'valid/{}'.format(args.loss): validation[args.loss], 'best/{}'.format(args.loss): best_loss, 'optimizer': optimizer.state_dict(), }, is_best, args.logdir) if scheduler is not None: scheduler.step(epoch=epoch_i) logger.debug('Scheduler stepped.') for param_group in optimizer.param_groups: logger.debug(param_group['lr']) logger.info(message)
def main(conf):
    warnings.filterwarnings("ignore")
    best_score = 0.
    val_score = 0
    val_loss = 0
    epoch_start = 0

    # dataloader
    train_loader, val_loader = get_dataloader(conf)

    # model
    model = networks.get_model(conf)
    model = nn.DataParallel(model).cuda()
    if conf.weightfile is not None:
        wmodel = networks.get_model(conf)
        wmodel = nn.DataParallel(wmodel).cuda()
        checkpoint_dict = load_checkpoint(wmodel, conf.weightfile)
        if 'best_score' in checkpoint_dict:
            print('best score: {}'.format(checkpoint_dict['best_score']))
    else:
        wmodel = model

    # training setting
    criterion, optimizer, scheduler = get_train_setting(model, conf)

    # training and evaluate process for each epoch
    train, validate = get_proc(conf)

    if conf.resume:
        checkpoint_dict = load_checkpoint(model, conf.resume)
        epoch_start = checkpoint_dict['epoch']
        if 'best_score' in checkpoint_dict:
            best_score = checkpoint_dict['best_score']
            print('best score: {}'.format(best_score))
        print('Resuming training process from epoch {}...'.format(epoch_start))
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
        scheduler.load_state_dict(checkpoint_dict['scheduler'])
        print('Resuming lr scheduler')
        print(checkpoint_dict['scheduler'])

    if conf.evaluate:
        print(validate(val_loader, model, criterion, conf))
        return

    detach_epoch = conf.epochs + 1
    if 'detach_epoch' in conf:
        detach_epoch = conf.detach_epoch
    start_eval = 0
    if 'start_eval' in conf:
        start_eval = conf.start_eval

    ## ------main loop-----
    for epoch in range(epoch_start, conf.epochs):
        lr = optimizer.param_groups[0]['lr']
        logging.info("Epoch: [{} | {}] LR: {}".format(epoch + 1, conf.epochs,
                                                      lr))

        if epoch == detach_epoch:
            model.module.set_detach(False)

        tmp_loss = train(train_loader, model, criterion, optimizer, conf,
                         wmodel)
        logging.info('Epoch: {} train_loss: {}'.format(epoch + 1, tmp_loss))
        scheduler.step()

        if epoch > start_eval:
            with torch.no_grad():
                val_score, val_loss, mscore, ascore = validate(
                    val_loader, model, criterion, conf)
                comscore = val_score
                if 'midlevel' in conf and conf.midlevel:
                    comscore = ascore
                is_best = comscore > best_score
                best_score = max(comscore, best_score)
                logging.info(
                    'Epoch: {} loss: {:.4f}, gs: {:.4f}, bs: {:.4f}, '
                    'ms: {:.4f}, as: {:.4f}'.format(epoch + 1, val_loss,
                                                    val_score, best_score,
                                                    mscore, ascore))
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.module.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'best_score': best_score
                    }, is_best, outdir=conf['outdir'])

    print('Best val acc: {}'.format(best_score))
    return 0
def train_and_evaluate_kd(model, teacher_model, train_dataloader, val_dataloader, optimizer, loss_fn_kd, metrics, params, model_dir, restore_file=None): """Train the model and evaluate every epoch. Args: model: (torch.nn.Module) the neural network params: (Params) hyperparameters model_dir: (string) directory containing config, weights and log restore_file: (string) - file to restore (without its extension .pth.tar) """ # reload weights from restore_file if specified if restore_file is not None: restore_path = os.path.join(args.model_dir, args.restore_file + '.pth.tar') logging.info("Restoring parameters from {}".format(restore_path)) utils.load_checkpoint(restore_path, model, optimizer) best_val_acc = 0.0 # Tensorboard logger setup # board_logger = utils.Board_Logger(os.path.join(model_dir, 'board_logs')) # fetch teacher outputs using teacher_model under eval() mode loading_start = time.time() teacher_model.eval() teacher_outputs = fetch_teacher_outputs(teacher_model, train_dataloader, params) elapsed_time = math.ceil(time.time() - loading_start) logging.info("- Finished computing teacher outputs after {} secs..".format( elapsed_time)) # learning rate schedulers for different models: if params.model_version == "resnet18_distill": scheduler = StepLR(optimizer, step_size=150, gamma=0.1) # for cnn models, num_epoch is always < 100, so it's intentionally not using scheduler here elif params.model_version == "cnn_distill": scheduler = StepLR(optimizer, step_size=100, gamma=0.2) for epoch in range(params.num_epochs): scheduler.step() # Run one epoch logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs)) # compute number of batches in one epoch (one full pass over the training set) train_kd(model, teacher_outputs, optimizer, loss_fn_kd, train_dataloader, metrics, params) # Evaluate for one epoch on validation set val_metrics = evaluate_kd(model, val_dataloader, metrics, params) val_acc = val_metrics['accuracy'] is_best = val_acc >= best_val_acc # Save weights utils.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optim_dict': optimizer.state_dict() }, is_best=is_best, checkpoint=model_dir) # If best_eval, best_save_path if is_best: logging.info("- Found new best accuracy") best_val_acc = val_acc # Save best val metrics in a json file in the model directory best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json") utils.save_dict_to_json(val_metrics, best_json_path) # Save latest val metrics in a json file in the model directory last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json") utils.save_dict_to_json(val_metrics, last_json_path)
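# loss_fn_kd is passed in from outside this excerpt. A sketch of the standard
# distillation loss (Hinton et al., 2015) it most likely resembles -- the
# repo's actual implementation may differ. Assumes params carries alpha and
# temperature hyperparameters:
import torch.nn as nn
import torch.nn.functional as F

def loss_fn_kd(outputs, labels, teacher_outputs, params):
    T = params.temperature
    alpha = params.alpha
    # soften both distributions; the T^2 factor keeps gradient magnitudes
    # comparable across temperatures
    soft = nn.KLDivLoss(reduction='batchmean')(
        F.log_softmax(outputs / T, dim=1),
        F.softmax(teacher_outputs / T, dim=1)) * (alpha * T * T)
    hard = F.cross_entropy(outputs, labels) * (1.0 - alpha)
    return soft + hard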
def main():
    # parse arg and start experiment
    global args
    best_ap = -1.
    best_iter = 0

    args = parser.parse_args()
    args.config_of_data = config.datasets[args.data]
    # args.num_classes = config.datasets[args.data]['num_classes']
    if configure is None:
        args.tensorboard = False
        print(Fore.RED +
              'WARNING: you don\'t have tensorboard_logger installed' +
              Fore.RESET)

    # optionally resume from a checkpoint
    if args.resume:
        if args.resume and os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_args = checkpoint['args']
            print('Old args:')
            print(old_args)
            # set args based on checkpoint
            if args.start_iter <= 0:
                args.start_iter = checkpoint['iter'] + 1
            best_iter = args.start_iter - 1
            best_ap = checkpoint['best_ap']
            for name in arch_resume_names:
                if name in vars(args) and name in vars(old_args):
                    setattr(args, name, getattr(old_args, name))
            model = get_model(**vars(args))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (iter {})"
                  .format(args.resume, checkpoint['iter']))
        else:
            print("=> no checkpoint found at '{}'".format(
                Fore.RED + args.resume + Fore.RESET), file=sys.stderr)
            return
    else:
        # create model
        print("=> creating model '{}'".format(args.arch))
        model = get_model(**vars(args))

    # cudnn.benchmark = True
    cudnn.enabled = False

    # create dataloader
    if args.evaluate == 'val':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('val'), **vars(args))
        validate(val_loader, model, best_iter)
        return
    elif args.evaluate == 'test':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('test'), **vars(args))
        validate(test_loader, model, best_iter)
        return
    else:
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('train', 'val'), **vars(args))

    # define optimizer
    optimizer = get_optimizer(model, args)

    # check if the folder exists
    if os.path.exists(args.save):
        print(Fore.RED + args.save + Fore.RESET + ' already exists!',
              file=sys.stderr)
        if not args.force:
            ans = input('Do you want to overwrite it? [y/N]:')
            if ans not in ('y', 'Y', 'yes', 'Yes'):
                sys.exit(1)
        print('remove existing ' + args.save)
        shutil.rmtree(args.save)
    os.makedirs(args.save)
    print('create folder: ' + Fore.GREEN + args.save + Fore.RESET)

    # copy code to save folder
    if args.save.find('debug') < 0:
        shutil.copytree('.', os.path.join(args.save, 'src'), symlinks=True,
                        ignore=shutil.ignore_patterns(
                            '*.pyc', '__pycache__', '*.path.tar', '*.pth',
                            '*.ipynb', '.*', 'data', 'save', 'save_backup'))

    # set up logging
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*args):
        print(*args)
        print(*args, file=f_log)

    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log, flush=True)
    # log_print('model:')
    # log_print(model)
    # log_print('optimizer:')
    # log_print(vars(optimizer))
    log_print('# of params:',
              str(sum([p.numel() for p in model.parameters()])))
    torch.save(args, os.path.join(args.save, 'args.pth'))
    scores = ['iter\tlr\ttrain_loss\tval_ap']
    if args.tensorboard:
        configure(args.save, flush_secs=5)

    for i in range(args.start_iter, args.niters + 1, args.eval_freq):
        lr = optimizer.param_groups[0]['lr']  # current learning rate for the score log
        # if args.tensorboard:
        #     log_value('lr', lr, i)

        # train for args.eval_freq iterations
        train_loss = train(train_loader, model, optimizer, i, args.eval_freq)
        i += args.eval_freq - 1

        # evaluate on validation set
        val_ap = validate(val_loader, model, i)

        # save scores to a tsv file; rewrite the whole file to prevent
        # accidental deletion
        scores.append(('{}\t{}' + '\t{:.4f}' * 2)
                      .format(i, lr, train_loss, val_ap))
        with open(os.path.join(args.save, 'scores.tsv'), 'w') as f:
            print('\n'.join(scores), file=f)

        # remember best val_ap and save checkpoint
        is_best = val_ap > best_ap
        if is_best:
            best_ap = val_ap
            best_iter = i
            print(Fore.GREEN + 'Best val_ap {}'.format(best_ap) + Fore.RESET)
        save_checkpoint({
            'args': args,
            'iter': i,
            'best_iter': best_iter,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_ap': best_ap,
        }, is_best, args.save)
        if not is_best and i - best_iter >= args.patience > 0:
            break

    print('Best val_ap: {:.4f} at iter {}'.format(best_ap, best_iter))
# best_val_acc, counter, patience, model and optimizer are assumed to be
# initialized earlier in the script
filepath = os.path.join(args.model_dir, 'val_best_weights.json')
if os.path.exists(filepath):
    with open(filepath) as f:
        data = json.load(f)
    best_val_acc = data['accuracy']

for epoch in range(args.max_epochs):
    train(train_set, train_set2, model, args, 'train')
    val_acc = val(val_set, val_set2, model, args, 'val')
    val_metrics = {'accuracy': val_acc}

    is_best = val_acc >= best_val_acc
    utils.save_checkpoint({'epoch': epoch + 1,
                           'state_dict': model.state_dict(),
                           'optim_dict': optimizer.state_dict()},
                          is_best=is_best,
                          checkpoint=args.model_dir)
    if is_best:
        logging.info('- Found new best accuracy')
        counter = 0  # reset counter
        best_val_acc = val_acc
        best_json_path = os.path.join(args.model_dir, 'val_best_weights.json')
        utils.save_dict_to_json(val_metrics, best_json_path)
    else:
        counter += 1
        if counter > patience:
            logging.info('- No improvement in a while, stopping training...')
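# The same patience logic, factored into a small reusable helper -- a sketch,
# not part of the original code:
class EarlyStopping:
    def __init__(self, patience=10):
        self.patience = patience
        self.counter = 0
        self.best = float('-inf')

    def step(self, metric):
        """Record a new metric value; return True when training should stop."""
        if metric >= self.best:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
        return self.counter > self.patience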
train() if 't0' in optimizer.param_groups[0]: tmp = {} for prm in model.parameters(): tmp[prm] = prm.data.clone() prm.data = optimizer.state[prm]['ax'].clone() val_loss2 = evaluate(val_data) logging('-' * 89) logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2))) logging('-' * 89) if val_loss2 < stored_loss: save_checkpoint(model, optimizer, args.save) logging('Saving Averaged!') stored_loss = val_loss2 for prm in model.parameters(): prm.data = tmp[prm].clone() else: val_loss = evaluate(val_data, eval_batch_size) logging('-' * 89) logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss))) logging('-' * 89) if val_loss < stored_loss:
def train_model(model, dataloaders, criterion, optimizer, start_epoch,
                num_epochs=args.epochs):
    '''
    Train model.

    Args:
        model: the model to train
        dataloaders: dict of dataloaders, {'train': ..., 'val': ...}
        criterion: loss function
        optimizer: optimizer for training
        start_epoch: epoch to resume from
        num_epochs: number of epochs to train

    Returns:
        Best model and val_acc_history.
    '''
    since = time.time()
    val_acc_history = []
    lr = args.lr

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    learning_rate_decay_start = args.lr_decay_start
    learning_rate_decay_every = args.lr_decay_every
    learning_rate_decay_rate = args.lr_decay_rate

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print("-" * 10)

        if epoch > learning_rate_decay_start and learning_rate_decay_every > 0:
            frac = (epoch - learning_rate_decay_start) // learning_rate_decay_every
            decay_factor = learning_rate_decay_rate ** frac
            current_lr = lr * decay_factor
            set_lr(optimizer, current_lr)
            print("Learning rate: ", current_lr)

        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                t = inputs.size(0)
                if phase == "val":
                    # ten-crop evaluation: fold the crops into the batch dim
                    bs, ncrops, c, h, w = inputs.shape
                    inputs = inputs.view(-1, c, h, w)  # (bs*ncrops, c, h, w)
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    if phase == "val":
                        # average the predictions over the crops
                        outputs = outputs.view(bs, ncrops, -1).mean(1)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    if phase == 'train':
                        loss.backward()
                        clip_gradient(optimizer, 0.1)
                        optimizer.step()

                running_loss += loss.item() * t
                running_corrects += torch.sum(preds == labels.data)

            # dataloader_length is assumed to be a module-level dict of
            # dataset sizes keyed by phase
            epoch_loss = running_loss / dataloader_length[phase]
            epoch_acc = running_corrects.double() / dataloader_length[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                save_checkpoint(epoch, best_model_wts, optimizer)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model, val_acc_history
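# The validation branch above expects batches shaped (bs, ncrops, c, h, w).
# A torchvision pipeline that produces them; the resize/crop sizes and the
# omitted normalization are assumptions:
import torch
from torchvision import transforms

eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.TenCrop(224),  # 4 corners + center, plus their mirrored versions
    transforms.Lambda(lambda crops: torch.stack(
        [transforms.ToTensor()(crop) for crop in crops])),  # (ncrops, c, h, w)
])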
"EM": np.round(valid_em / n_samples, 2), "F1": np.round(valid_f1 / n_samples, 2), "epoch": epoch + 1 }) print("Valid loss of the model at epoch {} is: {}".format( epoch + 1, np.round(valid_losses / len(valid_dataloader), 2))) print("Valid EM of the model at epoch {} is: {}".format( epoch + 1, np.round(valid_em / n_samples, 2))) print("Valid F1 of the model at epoch {} is: {}".format( epoch + 1, np.round(valid_f1 / n_samples, 2))) # save last model weights save_checkpoint( { "epoch": epoch + 1 + epoch_checkpoint, "state_dict": model.state_dict(), "best_valid_loss": np.round(valid_losses / len(valid_dataloader), 2) }, True, os.path.join(experiment_path, "model_last_checkpoint.pkl")) # save model with best validation error is_best = bool( np.round(valid_losses / len(valid_dataloader), 2) < best_valid_loss) best_valid_loss = min(np.round(valid_losses / len(valid_dataloader), 2), best_valid_loss) save_checkpoint( { "epoch": epoch + 1 + epoch_checkpoint, "state_dict": model.state_dict(), "best_valid_loss": best_valid_loss }, is_best, os.path.join(experiment_path, "model.pkl"))
def save(self, prefix, epoch): arg_params = {} for name, tensor in self.arg_dict.items(): if is_param_name(name): arg_params[name] = tensor save_checkpoint(prefix, epoch, arg_params)
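# is_param_name is defined elsewhere. A plausible sketch for an MXNet-style
# setup, where the data and label entries of arg_dict are inputs rather than
# learnable parameters (the exact naming convention is an assumption):
def is_param_name(name):
    return not (name == 'data' or name.endswith('label'))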