def main():
    args = parse_args()
    n, prefix = args['n_epochs'], args['prefix']
    bs, img_sz = args['batch_size'], args['image_size']
    prefix += '_' if prefix else ''
    bunch = create_data_bunch(bs, img_sz, args['train_size'],
                              args['valid_size'], use_cache=args['use_cache'])
    train_sz = len(bunch.train_dl) / bunch.c
    valid_sz = len(bunch.valid_dl) / bunch.c
    learn = create_cnn(bunch, args['network'])
    learn.metrics = [accuracy, error_rate]
    if args['continue']:
        log.info('Continue training using cached data')
    log.info('Epochs: %d', args['n_epochs'])
    log.info('Model: %s', args['network_name'])
    log.info('# of classes: %d', bunch.c)
    log.info('Train size (per class): %d', train_sz)
    log.info('Valid size (per class): %d', valid_sz)
    if args['continue']:
        cbs = [SaveModelCallback(learn, name='bestmodel_continue')]
        try:
            learn.load(f'{prefix}final_224')
        except Exception as e:
            log.error('Cannot restore model')
            log.error(e)
            sys.exit(1)
        learn.unfreeze()
        learn.fit_one_cycle(n, callbacks=cbs, max_lr=slice(3e-5, 3e-5))
        learn.save(f'{prefix}continued_224')
    else:
        cbs = [SaveModelCallback(learn)]
        learn.fit_one_cycle(1)
        learn.save(f'{prefix}one_224')
        learn.unfreeze()
        learn.freeze_to(-2)
        learn.fit_one_cycle(n - 2, max_lr=slice(1e-4, 1e-3))
        learn.save(f'{prefix}unfreeze_224')
        learn.unfreeze()
        learn.fit_one_cycle(1, callbacks=cbs, max_lr=slice(10e-5, 5e-5))
        learn.save(f'{prefix}final_224')
    log.info('Done!')
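# Hedged sketch (not in the original snippet): the usual script entry point for
# running main(); parse_args, create_data_bunch, create_cnn, and log are assumed
# to be defined elsewhere in the same module.
if __name__ == '__main__':
    main()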
def train_model(model, epochs, lr, wd, module_string, ct, path):
    plt.close('all')
    learn = basic_train.Learner(data=db, model=model, loss_func=loss_func,
                                wd=wd, callback_fns=ActivationStats,
                                bn_wd=bn_wd, true_wd=true_wd)
    start = time.perf_counter()
    if ct:
        learn.load(path)
        print('Model loaded: ', path)
    learn.fit_one_cycle(
        epochs,
        max_lr=lr,
        wd=wd,
        callbacks=[
            SaveModelCallback(learn,
                              every='improvement',
                              monitor='valid_loss',
                              name='best_%s_bs%s_lr%.0e_wd%.0e' %
                              (module_string, bs, lr, wd))
        ])
    end = time.perf_counter()
    delta_t = end - start
    return learn, delta_t
def _get_callbacks(self, name):
    """Retrieve callbacks to be used for training.

    Args:
        name (str): Name of training stage (used to save files).
            csv saved in: f'saved/model_csv/{exp_name}_{name}.csv'
            model weights saved in: f'saved/model_weights/{exp_name}_{name}.pth'

    Returns:
        List of Callbacks.
    """
    callbacks = []
    # Logs metrics for each training stage
    callbacks.append(CSVLogger(
        learn=self.learn,
        append=False,
        filename=os.path.join(CSV_FOLDER, f'{self.exp_name}_{name}')
    ))
    # Saves the best model weights
    callbacks.append(SaveModelCallback(
        learn=self.learn,
        # Also loads best model weights at the end of training
        every='improvement',
        name=f'{self.exp_name}_{name}',
    ))
    return callbacks
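# Hedged usage sketch (assumptions, not from the original class): how a staged
# fit might consume _get_callbacks from within the owning class; the method name
# train_stage and the epoch/lr values are hypothetical.
#     def train_stage(self, name, epochs=5, lr=1e-3):
#         self.learn.fit_one_cycle(epochs, max_lr=lr,
#                                  callbacks=self._get_callbacks(name))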
def train_lm(path, filename, model='AWD_LSTM', epochs=8,
             pretrained_fnames=None, preds=True):
    # get data after running preprocess
    print(f'loading data from {path}/{filename};')
    data_lm = load_data(path, filename, bs=64, bptt=70)
    # change config if XL
    if model == 'XL':
        config = tfmerXL_lm_config.copy()
        config['mem_len'] = 150
        config['output_p'] = 0.1
        config['embed_p'] = 0.1
        config['ff_p'] = 0.1
        config['resid_p'] = 0.1
        config['d_inner'] = 1024
        config['d_model'] = 128
    else:
        config = None
    # load pretrained weights
    if pretrained_fnames:
        pretrained_fnames = pretrained_fnames.split(',')
    learn = language_model_learner(data_lm, models[model], config=config,
                                   pretrained=False,
                                   pretrained_fnames=pretrained_fnames)
    print(f'training lm model {model}; pretrained from {pretrained_fnames};')
    # early stopping and saving at every epoch
    cb = [SaveModelCallback(learn), EarlyStoppingCallback(learn)]
    if pretrained_fnames:
        # layered training
        print('training lm model head;')
        learn.fit_one_cycle(1, 3e-3, moms=(0.8, 0.7))
        print(f'saving lm model head to {path}/{filename}_head;')
        learn.save(filename + '_head')
        learn.unfreeze()
    print(f'training for {epochs} epochs')
    learn.fit_one_cycle(epochs, 3e-4, moms=(0.8, 0.7), callbacks=cb)
    print(f'saving model to {path}/{filename}_finetuned')
    learn.save(filename + '_finetuned')
    # generate outputs from validation set
    if preds:
        print(f'generating predictions and saving to {path}/{filename}_preds.txt;')
        get_valid_preds(learn, data_lm, filename + '_' + model + '_preds.txt')
def multi_train(get_learn, epoch_len, epochs, opts, lrs, checkpoints,
                tb_log_root, autoSave=True):
    '''
    Training can be resumed from a checkpoint; to keep training continuous,
    the lr must be set manually to the value used when the checkpoint was saved.
    '''
    # clean the tensorboard log dir
    if os.path.exists(tb_log_root):
        shutil.rmtree(tb_log_root)
    os.mkdir(tb_log_root)
    if not os.path.exists('./run_log/'):
        os.mkdir('./run_log/')
    txtlog = open('./run_log/log.txt', mode='w')
    for i, (opt, lr, checkpoint) in enumerate(zip(opts, lrs, checkpoints)):
        # create a learner
        learn = get_learn()
        # set optimizer
        learn.opt_func = opt
        # load checkpoint
        if checkpoint is not None:
            with open(checkpoint, 'rb') as f:
                learn.load(f)
        # record this run in the txt log
        csv_log_dir = 'csv_log/'
        if not os.path.exists(learn.path / csv_log_dir):
            os.mkdir(learn.path / csv_log_dir)
        csv_fname = csv_log_dir + f'run_{i}'
        txt_write(txtlog, i, opt, lr, learn.path, csv_fname)
        callbacks = []
        # csv logger callback
        csvLog = CSVLogger(learn, filename=csv_fname)
        callbacks += [csvLog]
        if autoSave:
            # save-model callback (named saveCb to avoid shadowing the autoSave flag)
            saveCb = SaveModelCallback(learn, monitor='valid_loss', mode='min',
                                       every='improvement', name=f'run_{i}')
            callbacks += [saveCb]
        # tensorboard callback
        tbCb = get_tbCb(learn, tb_log_root + f'run_{i}')
        callbacks += [tbCb]
        # train
        fit(learn=learn, epoch_len=epoch_len, epochs=epochs, lr=lr,
            callbacks=callbacks)
    txtlog.close()
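# Hedged usage sketch (hypothetical values): multi_train expects parallel lists
# of optimizers, learning rates, and checkpoints, one entry per run; the first
# run below starts from scratch, the second resumes from a saved checkpoint.
# multi_train(get_learn,
#             epoch_len=1000, epochs=5,
#             opts=[optim.Adam, optim.SGD],
#             lrs=[1e-3, 1e-4],
#             checkpoints=[None, 'models/run_0.pth'],
#             tb_log_root='./tb_log/')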
def main2(data_dir, output_dir, epochs):
    crate_dir(output_dir)
    em_sz, nh, nl = 400, 1150, 3
    wd = 1e-7
    bptt = 70
    opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
    bs = 32
    lr = 1e-3
    last_epoch = get_last_epoch(path.join(data_dir, 'models'))
    print('\033[1;34m', 'Loading data', '\033[0;0m')
    data = load_data(data_dir, 'data_save.pkl')
    model = language_model_learner(data, text.models.AWD_LSTM, drop_mult=0.5,
                                   metrics=[accuracy])
    try:
        print('\033[1;34m', 'Loading checkpoint', '\033[0;0m')
        model.load("last")
        print('\033[0;32m', 'Loaded last checkpoint', '\033[0;0m')
    except FileNotFoundError:
        print('\033[1;31m', 'No checkpoint found', '\033[0;0m')
    model.fit(epochs,
              lr=slice(lr / 2.6, lr),
              wd=1e-7,
              callbacks=[
                  SaveModelCallback(model, every='epoch', monitor='accuracy',
                                    name=f'check_{last_epoch}')
              ])
    print('\033[0;32m', 'Saving model', '\033[0;0m')
    model.save("last")
    model.export("model.pkl")
def create_callbacks(learn):
    return [
        EarlyStoppingCallback(learn, patience=3),
        SaveModelCallback(learn),
        CSVLogger(learn)
    ]
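# A self-contained sketch (assumptions, not from the original repo) showing
# create_callbacks wired into a fastai v1 Learner on synthetic tensor data;
# demo_run and the toy regression problem are illustrative only.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from fastai.basic_data import DataBunch
from fastai.basic_train import Learner
import fastai.train  # noqa: F401 - attaches fit_one_cycle to Learner
from fastai.callbacks import EarlyStoppingCallback, SaveModelCallback, CSVLogger

def demo_run():
    # synthetic regression problem: y = X @ w with a fixed random weight vector
    x = torch.randn(256, 8)
    y = x @ torch.randn(8, 1)
    train_ds = TensorDataset(x[:200], y[:200])
    valid_ds = TensorDataset(x[200:], y[200:])
    data = DataBunch(DataLoader(train_ds, batch_size=32, shuffle=True),
                     DataLoader(valid_ds, batch_size=32))
    learn = Learner(data, nn.Linear(8, 1), loss_func=nn.MSELoss())
    # SaveModelCallback keeps the best 'valid_loss' weights, CSVLogger writes
    # history.csv, and EarlyStoppingCallback stops after 3 epochs without improvement
    learn.fit_one_cycle(5, max_lr=1e-2, callbacks=create_callbacks(learn))
    return learn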
def train(train_dataset: torch.utils.data.Dataset,
          test_dataset: torch.utils.data.Dataset,
          training_config: dict = train_config,
          global_config: dict = global_config):
    """
    Template training routine. Takes a training and a test dataset wrapped
    as torch.utils.data.Dataset type and two corresponding generic configs
    for both global path settings and training settings.
    Returns the fitted fastai.train.Learner object which can be used to
    assess the resulting metrics and error curves etc.
    """
    for path in global_config.values():
        create_dirs(path)

    # wrap datasets with Dataloader classes
    train_loader = torch.utils.data.DataLoader(
        train_dataset, **training_config["DATA_LOADER_CONFIG"])
    test_loader = torch.utils.data.DataLoader(
        test_dataset, **training_config["DATA_LOADER_CONFIG"])
    databunch = DataBunch(train_loader, test_loader)

    # instantiate model and learner
    if training_config["WEIGHTS"] is None:
        model = training_config["MODEL"](**training_config["MODEL_CONFIG"])
    else:
        model = load_model(training_config["MODEL"],
                           training_config["MODEL_CONFIG"],
                           training_config["WEIGHTS"],
                           training_config["DEVICE"])
    learner = Learner(databunch,
                      model,
                      metrics=training_config["METRICS"],
                      path=global_config["ROOT_PATH"],
                      model_dir=global_config["WEIGHT_DIR"],
                      loss_func=training_config["LOSS"])

    # model name & paths
    name = "_".join([training_config["DATE"], training_config["SESSION_NAME"]])
    modelpath = os.path.join(global_config["WEIGHT_DIR"], name)

    if training_config["MIXED_PRECISION"]:
        learner.to_fp16()

    learner.save(modelpath)

    torch.backends.cudnn.benchmark = True

    cbs = [
        SaveModelCallback(learner),
        LearnerTensorboardWriter(
            learner,
            Path(os.path.join(global_config["LOG_DIR"]), "tensorboardx"),
            name),
        TerminateOnNaNCallback()
    ]

    # perform training iteration
    try:
        if training_config["ONE_CYCLE"]:
            learner.fit_one_cycle(training_config["EPOCHS"],
                                  max_lr=training_config["LR"],
                                  callbacks=cbs)
        else:
            learner.fit(training_config["EPOCHS"],
                        lr=training_config["LR"],
                        callbacks=cbs)
    # save model files on interrupt
    except KeyboardInterrupt:
        learner.save(modelpath)
        raise

    learner.save(modelpath)
    val_loss = min(learner.recorder.val_losses)
    val_metrics = learner.recorder.metrics

    # log using the logging tool
    logger = log.Log(training_config, run_name=training_config['SESSION_NAME'])
    logger.log_metric('Validation Loss', val_loss)
    logger.log_metrics(val_metrics)
    logger.end_run()

    # write csv log file
    log_content = training_config.copy()
    log_content["VAL_LOSS"] = val_loss
    log_content["VAL_METRICS"] = val_metrics
    log_path = os.path.join(global_config["LOG_DIR"], training_config["LOGFILE"])
    write_log(log_path, log_content)

    return learner, log_content, name
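# Hedged usage sketch (hypothetical dataset class): calling the template train()
# routine with torch datasets; the default train_config / global_config
# dictionaries referenced above supply the model, loss, paths, and schedule.
# train_ds = MySegmentationDataset(split='train')
# test_ds = MySegmentationDataset(split='test')
# learner, log_content, name = train(train_ds, test_ds)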
print(learn.callback_fns)

# --- TRAINING ---
if config['FINETUNE']:
    stage0_logger = pd.read_csv(
        learn.path / 'logs_{}fold.csv'.format(config['FOLD_NUMBER']))
    best_epoch = stage0_logger['metric_tot'].idxmax()
    learn.load('{0}{1}/models/{1}_{3}_{2}'.format(config['PATH_WEIGHTS'],
                                                  config['MODEL_NAME'],
                                                  best_epoch, 'stage0'))
    learn.unfreeze()

checkpoint_callback = SaveModelCallback(learn,
                                        name=config['MODEL_NAME'] + '_stage1',
                                        every='epoch',
                                        monitor='valid_loss')
# reduce lr by factor after patience epochs
reduce_lr_callback = ReduceLROnPlateauCallback(learn,
                                               monitor='metric_tot',
                                               factor=0.5,
                                               patience=5,
                                               min_lr=1e-6)
logger = CSVLogger(learn, 'logs_{}fold'.format(config['FOLD_NUMBER']))

learn.fit(40,
          lr=1e-2 / 10.,
          wd=0.,
          callbacks=[checkpoint_callback, reduce_lr_callback, logger])
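# Hedged sketch of the config keys this fragment reads; the values shown are
# hypothetical placeholders, only the key names come from the code above.
# config = {
#     'FINETUNE': True,
#     'FOLD_NUMBER': 0,
#     'PATH_WEIGHTS': './weights/',
#     'MODEL_NAME': 'resnet34_seg',
# }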