from fastai.callbacks import (EarlyStoppingCallback, GeneralScheduler,
                              SaveModelCallback, TrainingPhase)


def train_model(learn, lr=0.001, lr_decay=0.8, batch_size=512, n_epochs=20, model_name='fastai_'):
    # Build one TrainingPhase per epoch, holding the learning rate at lr * lr_decay**i.
    n = len(learn.data.train_dl)
    phases = [TrainingPhase(n).schedule_hp('lr', lr * lr_decay ** i) for i in range(n_epochs)]
    sched = GeneralScheduler(learn, phases)
    learn.callbacks.append(sched)
    learn.fit(n_epochs,
              callbacks=[SaveModelCallback(learn, name=model_name),
                         EarlyStoppingCallback(learn, min_delta=0.001, patience=5)])
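# Usage sketch for train_model above (illustrative, not part of the original snippet):
# `data` is assumed to be an existing fastai v1 DataBunch built elsewhere; the
# architecture and model name below are placeholders.
from fastai.vision import cnn_learner, models, accuracy

learn = cnn_learner(data, models.resnet34, metrics=[accuracy])
train_model(learn, lr=1e-3, lr_decay=0.8, n_epochs=20, model_name='resnet34_decay')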
def fit(self, learner: Learner, weight_decay):
    if self.early_stop:
        learner.callbacks.append(
            EarlyStoppingCallback(learner, patience=self.early_stop.patience))
    fit_one_cycle(learner, cyc_len=self.cyc_len, tot_epochs=self.max_epochs,
                  max_lr=self.max_lr, wd=weight_decay)
def fit(self, epochs=10, lr=slice(1e-4, 3e-3), one_cycle=True,
        early_stopping=False, checkpoint=True, **kwargs):
    """
    Train the model for the specified number of epochs and using the
    specified learning rates.

    =====================   ===========================================
    **Argument**            **Description**
    ---------------------   -------------------------------------------
    epochs                  Required integer. Number of cycles of training
                            on the data. Increase it if underfitting.
    ---------------------   -------------------------------------------
    lr                      Required float or slice of floats. Learning rate
                            to be used for training the model. Select from
                            the `lr_find` plot.
    ---------------------   -------------------------------------------
    one_cycle               Optional boolean. Parameter to select 1cycle
                            learning rate schedule. If set to `False`, no
                            learning rate schedule is used.
    ---------------------   -------------------------------------------
    early_stopping          Optional boolean. Parameter to add early stopping.
                            If set to `True`, training will stop if validation
                            loss stops improving for 5 epochs.
    ---------------------   -------------------------------------------
    checkpoint              Optional boolean. Parameter to save the best model
                            during training. If set to `True`, the best model
                            based on validation loss will be saved during
                            training.
    =====================   ===========================================
    """
    callbacks = kwargs.pop('callbacks', [])
    if early_stopping:
        callbacks.append(
            EarlyStoppingCallback(learn=self.learn, monitor='val_loss',
                                  min_delta=0.01, patience=5))
    if checkpoint:
        callbacks.append(
            SaveModelCallback(self, monitor='val_loss', every='improvement',
                              name='checkpoint'))
    if one_cycle:
        self.learn.fit_one_cycle(epochs, lr, callbacks=callbacks, **kwargs)
    else:
        self.learn.fit(epochs, lr, callbacks=callbacks, **kwargs)
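# Illustrative call of the fit() method above (a sketch, not from the original source):
# `model` stands for whichever wrapper object defines this fit() and holds self.learn.
model.fit(epochs=20, lr=slice(1e-4, 3e-3), one_cycle=True,
          early_stopping=True, checkpoint=True)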
def fastai_random_data_run_with_callback(iris_data, fit_variant, manual_run, callback, patience):
    # pylint: disable=unused-argument
    mlflow.fastai.autolog()
    model = fastai_model(iris_data)
    callbacks = []
    if callback == "early":
        callback = EarlyStoppingCallback(learn=model, patience=patience, min_delta=MIN_DELTA)
        callbacks.append(callback)
    if fit_variant == "fit_one_cycle":
        model.fit_one_cycle(NUM_EPOCHS, callbacks=callbacks)
    else:
        model.fit(NUM_EPOCHS, callbacks=callbacks)
    client = mlflow.tracking.MlflowClient()
    return model, client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)
def fastai_random_data_run_with_callback(iris_data, fit_variant, manual_run, callback, patience):
    mlflow.fastai.autolog()
    callbacks = []
    if callback == 'early':
        # min_delta is set as such to guarantee early stopping
        callbacks.append(lambda learn: EarlyStoppingCallback(
            learn, patience=patience, min_delta=MIN_DELTA))
    model = fastai_model(iris_data, callback_fns=callbacks)
    if fit_variant == 'fit_one_cycle':
        model.fit_one_cycle(NUM_EPOCHS)
    else:
        model.fit(NUM_EPOCHS)
    client = mlflow.tracking.MlflowClient()
    return model, client.get_run(
        client.list_run_infos(experiment_id='0')[0].run_id)
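# The snippet above registers the callback through `callback_fns` (callables that take the
# Learner and return a callback) instead of passing ready-made instances to fit(). A minimal
# sketch of the same pattern on a plain fastai v1 Learner (`learn` assumed to exist already):
from functools import partial
from fastai.callbacks import EarlyStoppingCallback

learn.callback_fns.append(partial(EarlyStoppingCallback, patience=2, min_delta=0.001))
learn.fit(10)  # the early-stopping callback is re-created from the partial on every fit() call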
def get_callbacks(learner, mod_name, early_stop=True, patience=5, min_delta=0.01, monitor='accuracy'):
    callbacks = [
        SaveModelCallback(learner, every='improvement',
                          name=f'{mod_name}-opt_accuracy', monitor='accuracy'),
        SaveModelCallback(learner, every='improvement',
                          name=f'{mod_name}-opt_val_loss'),
        WandbCallback(learner, monitor=monitor, input_type='images', log='all')
    ]
    if early_stop:
        callbacks.append(
            EarlyStoppingCallback(learner, patience=patience,
                                  min_delta=min_delta, monitor=monitor))
    return callbacks
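# Hedged usage sketch for get_callbacks above: `learn` is assumed to be a fastai v1 Learner
# whose metrics include accuracy (so monitor='accuracy' resolves); the run name is illustrative.
cbs = get_callbacks(learn, mod_name='resnet50_run', early_stop=True, patience=5)
learn.fit_one_cycle(20, max_lr=1e-3, callbacks=cbs)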
def train(sacred_conf):
    valid_fold = sacred_conf.fold
    image_size = sacred_conf.image_size
    conf = sacred_conf
    # class_cnt = 5
    backbone_name = conf.backbone
    unfreeze = True  # conf.unfreeze if 'unfreeze' in conf else False
    epoch = 50
    assert int(valid_fold) <= 4
    # batch_id = str(round(time.time()))

    df = pd.read_csv('./input/train.csv', names=['file_name', 'label'])
    df['fold'] = df.file_name % 5
    df['file_name'] = df.file_name.astype('str') + '.jpg'
    # if class_cnt <= 2:
    #     df.label = np.where(df.label >= 1, 1, 0)

    data = (ImageList.from_df(df, './input/train/')
            .split_by_idx(df.loc[df.fold == valid_fold].index)
            # .split_from_df('label')
            .label_from_df()
            # .add_test_folder('./input/test')
            .transform(get_transforms(), size=image_size)
            .databunch(bs=16)).normalize(imagenet_stats)
    test_data = ImageList.from_folder(path="./input/test")
    data.add_test(test_data)

    if ',' in backbone_name or isinstance(backbone_name, list):
        learn = get_ens_learn(sacred_conf, data)
        print(learn.model)
    elif conf.model_type == 'raw':
        learn = get_cus_learner(sacred_conf, data)
    else:
        learn = get_fastai_learn(sacred_conf, data)

    model_name = backbone_name
    # checkpoint_name = f'{model_name}_f{valid_fold}'
    callbacks = [
        EarlyStoppingCallback(learn, monitor='accuracy', min_delta=1e-5, patience=5),
        # SaveModelCallback(learn, monitor='accuracy', name=checkpoint_name, every='improvement'),
        Recorder_scared(ex, learn),
    ]
    print(f'=====Fold:{valid_fold}, Total epoch:{epoch}, type#{conf.model_type}, '
          f'lock#{conf.lock_layer}, model:{model_name}, image:{image_size} =========')
    learn.fit_one_cycle(epoch, callbacks=callbacks)

    oof_val = get_oof_df(learn, DatasetType.Valid)
    oof_test = get_oof_df(learn, DatasetType.Test)

    os.makedirs('./output/stacking/', exist_ok=True)
    import socket
    host_name = socket.gethostname()
    val_len = len(learn.data.valid_ds.items)
    train_len = len(learn.data.train_ds.items)

    from sklearn.metrics import accuracy_score
    best_score = accuracy_score(oof_val.iloc[:, :-1].idxmax(axis=1), oof_val.iloc[:, -1])

    conf_name_base = backbone_name
    oof_file = (f'./output/stacking/{version}_{host_name[:5]}_s{best_score:6.5f}_'
                f'{conf_name_base}_{conf.model_type}_f{valid_fold}_val{val_len}_trn{train_len}.h5')
    print(f'Stacking file saved to: {oof_file}')
    save_stack_feature(oof_val, oof_test, oof_file)
                  metrics=[METRIC], device=DEVICE, model_dir=LOGGING_FOLDER)

if HS_MODEL is not None:
    learn.model.load_state_dict(torch.load(HS_MODEL)['model'])

set_BN_momentum(learn.model, batch_size=BATCH_SIZE)
learn.clip_grad(1.)

# callbacks
csv_logger = CSVLogger(learn=learn, filename=f'{LOGGING_FOLDER}/fit_trace', append=True)
early_stopping = EarlyStoppingCallback(learn=learn, monitor='dice', patience=PATIENCE)
save_model = SaveModelCallback(learn=learn, monitor='dice', name='best_model')
acc_grad = AccumulateStep(learn, 64 // BATCH_SIZE)

# # find optimal LR
# learn.lr_find(stop_div=True, num_it=100)
# learn.recorder.plot(suggestion=True)
# opt_lr = learn.recorder.min_grad_lr
# print(f'Initial optimal lr: {opt_lr}')

if TRAIN_MODE:
    if HS_MODEL is None:
data = (ImageList.from_csv(path, 'train_fastai_format.csv', folder='preprocessed/224/train')
        .split_by_rand_pct(seed=42)
        .label_from_df(label_delim=' ')
        .transform(tfms, size=(sz, sz))
        .add_test(str(path) + '/preprocessed/224/test/' + test_fns)
        .databunch(bs=bs, num_workers=8)
        .normalize(imagenet_stats))
model = EfficientNet.from_pretrained('efficientnet-b4', num_classes=6)
learn = Learner(data, model, metrics=[accuracy_thresh], model_dir=path / 'models/eff_net').to_fp16()
learn.unfreeze()
learn.load(pretrained_model)
learn.fit_one_cycle(10, lr, callbacks=[
    EarlyStoppingCallback(learn, min_delta=0.001, patience=3),
    SaveModelCallback(learn, every='epoch', name='effb0-224')
])
def fit(self, epochs=10, lr=None, one_cycle=True, early_stopping=False,
        checkpoint=True, tensorboard=False, **kwargs):
    """
    Train the model for the specified number of epochs and using the
    specified learning rates.

    =====================   ===========================================
    **Argument**            **Description**
    ---------------------   -------------------------------------------
    epochs                  Required integer. Number of cycles of training
                            on the data. Increase it if underfitting.
    ---------------------   -------------------------------------------
    lr                      Optional float or slice of floats. Learning rate
                            to be used for training the model. If ``lr=None``,
                            an optimal learning rate is automatically deduced
                            for training the model.
    ---------------------   -------------------------------------------
    one_cycle               Optional boolean. Parameter to select 1cycle
                            learning rate schedule. If set to `False`, no
                            learning rate schedule is used.
    ---------------------   -------------------------------------------
    early_stopping          Optional boolean. Parameter to add early stopping.
                            If set to `True`, training will stop if validation
                            loss stops improving for 5 epochs.
    ---------------------   -------------------------------------------
    checkpoint              Optional boolean. Parameter to save the best model
                            during training. If set to `True`, the best model
                            based on validation loss will be saved during
                            training.
    ---------------------   -------------------------------------------
    tensorboard             Optional boolean. Parameter to write the training
                            log. If set to `True`, the log will be saved at
                            <dataset-path>/training_log, which can be
                            visualized in TensorBoard. Requires tensorboardx
                            version 1.7 (experimental support). The default
                            value is `False`.
    =====================   ===========================================
    """
    self._check_requisites()

    if lr is None:
        print('Finding optimum learning rate.')
        lr = self.lr_find(allow_plot=False)
        lr = slice(lr / 10, lr)
    self._learning_rate = lr

    if arcgis.env.verbose:
        logger.info('Fitting the model.')

    if getattr(self, '_backend', 'pytorch') == 'tensorflow':
        checkpoint = False

    callbacks = kwargs.pop('callbacks', [])
    if early_stopping:
        callbacks.append(
            EarlyStoppingCallback(learn=self.learn, monitor='valid_loss',
                                  min_delta=0.01, patience=5))
    if checkpoint:
        from datetime import datetime
        now = datetime.now()
        callbacks.append(
            SaveModelCallback(self, monitor='valid_loss', every='improvement',
                              name=now.strftime("checkpoint_%Y-%m-%d_%H-%M-%S")))

    # If tensorboardX is installed, write a log with a timestamp as its name.
    if tensorboard and HAS_TENSORBOARDX:
        training_id = time.strftime("log_%Y-%m-%d_%H-%M-%S")
        log_path = Path(os.path.dirname(self._data.path)) / 'training_log'
        callbacks.append(
            LearnerTensorboardWriter(learn=self.learn, base_dir=log_path, name=training_id))
        hostname = socket.gethostname()
        print("Monitor training using Tensorboard using the following command: "
              "'tensorboard --host={} --logdir={}'".format(hostname, log_path))
    # Send out a warning if tensorboardX is not installed.
    elif tensorboard:
        warn("Install tensorboardX 1.7 'pip install tensorboardx==1.7' to write training log")

    if one_cycle:
        self.learn.fit_one_cycle(epochs, lr, callbacks=callbacks, **kwargs)
    else:
        self.learn.fit(epochs, lr, callbacks=callbacks, **kwargs)
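# Illustrative call of this fit() variant (a sketch; `model` is whichever arcgis.learn-style
# wrapper defines the method): lr=None triggers lr_find, and tensorboard=True writes a
# training log if tensorboardX is installed.
model.fit(epochs=15, lr=None, early_stopping=True, checkpoint=True, tensorboard=True)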
def __init__(self, data_path: str = 'lang_model', emb_sz: int = 800, qrnn: bool = False,
             bidir: bool = False, n_layers: int = 4, n_hid: int = 2500, bs: int = 104,
             bptt: int = 67, lr: float = 0.0013, wd: float = .012, one_cycle: bool = True,
             cycle_len: int = 1) -> None:
    """
    Instantiate an AWD_LSTM language model with hyper-parameters.

    data_path: str
        path where the databunch is loaded from
    emb_sz: int
        size of word embeddings
    qrnn: bool
        whether or not to use QRNN (requires cuDNN)
    bidir: bool
        if the RNN should be bi-directional
    n_layers: int
        number of layers in the language model
    n_hid: int
        number of hidden units in the model
    lr: float
        learning rate
    bptt: int
        back-propagation-through-time; max sequence length through which
        gradients will be accumulated
    bs: int
        batch size

    The hyper-parameters are stored in a fastai dict called
    `fastai.text.models.awd_lstm_lm_config`:

        {'emb_sz': 400, 'n_hid': 1150, 'n_layers': 3, 'pad_token': 1, 'qrnn': False,
         'bidir': False, 'output_p': 0.1, 'hidden_p': 0.15, 'input_p': 0.25,
         'embed_p': 0.02, 'weight_p': 0.2, 'tie_weights': True, 'out_bias': True}
    """
    self.lr, self.wd, self.one_cycle, self.cycle_len = lr, wd, one_cycle, cycle_len
    awd_lstm_lm_config.update(
        dict(emb_sz=emb_sz, qrnn=qrnn, bidir=bidir, n_layers=n_layers, n_hid=n_hid))

    # log params
    wb_handle = wandb.init(config=awd_lstm_lm_config)
    wandb.config.update({'data_path': str(data_path), 'bs': bs, 'bptt': bptt, 'lr': lr})
    self.csv_name = 'history_' + wb_handle.name
    wandb.config.update({'csvlog_save_path': self.csv_name})

    # instantiate databunch
    self.data_lm = load_data(data_path, bs=bs, bptt=bptt)

    # instantiate language model
    self.learn = language_model_learner(data=self.data_lm, arch=AWD_LSTM, pretrained=False,
                                        model_dir=Path('models_' + wb_handle.name),
                                        config=awd_lstm_lm_config)
    self.full_model_path = str(self.learn.path / self.learn.model_dir)
    wandb.config.update({'model_save_path': self.full_model_path})

    # prepare callbacks
    escb = EarlyStoppingCallback(learn=self.learn, patience=2)
    smcb = SaveModelCallback(learn=self.learn, name='best_' + wb_handle.name)
    rpcb = ReduceLROnPlateauCallback(learn=self.learn, patience=1)
    csvcb = CSVLogger(learn=self.learn, filename=self.csv_name)
    wb = wandbCallback(self.learn)
    self.callbacks = [escb, smcb, rpcb, csvcb, wb]

    self.fit()
                  base_arch=ENCODER, pretrained=PRETRAINED, is_se_resnext=IS_SE_RESNEXT,
                  metrics=[METRIC], device=DEVICE, model_dir=LOGGING_FOLDER)

set_BN_momentum(learn.model, batch_size=BATCH_SIZE)
learn.clip_grad(1.)

# callbacks
csv_logger = CSVLogger(learn=learn, filename=f'{LOGGING_FOLDER}/fit_trace', append=True)
early_stopping = EarlyStoppingCallback(learn=learn, monitor='accuracy', patience=PATIENCE)
save_model = SaveModelCallback(learn=learn, monitor='accuracy', name='best_model')
acc_grad = AccumulateStep(learn, 64 // BATCH_SIZE)

opt_lr = 0.001

# fit with frozen
learn.fit_one_cycle(
    cyc_len=3, max_lr=opt_lr,
    callbacks=[acc_grad, csv_logger, early_stopping, save_model])

# fit entire model with saving on the best epoch
def train(valid_fold, conf_name):
    with open(f'./configs/{conf_name}.yaml') as f:
        conf = edict(yaml.load(f))
    class_cnt = conf.class_cnt
    backbone_name = conf.backbone
    unfreeze = True  # conf.unfreeze if 'unfreeze' in conf else False
    epoch = 50
    assert int(valid_fold) <= 4
    # batch_id = str(round(time.time()))

    backbone = get_backbone(backbone_name)

    df = pd.read_csv('./input/train.csv', names=['file_name', 'label'])
    df['fold'] = df.file_name % 5
    df['file_name'] = df.file_name.astype('str') + '.jpg'
    # if class_cnt <= 2:
    #     df.label = np.where(df.label >= 1, 1, 0)

    data = (ImageList.from_df(df, './input/train/')
            .split_by_idx(df.loc[df.fold == valid_fold].index)
            .label_from_df(cols='label', label_cls=FloatList)
            .transform(get_transforms(), size=200)
            .databunch(bs=16)).normalize(imagenet_stats)
    test_data = ImageList.from_folder(path="./input/test")
    data.add_test(test_data)
    # data.show_batch(rows=3, figsize=(15, 15))

    # head = create_head(nf, nc, lin_ftrs, ps=ps, concat_pool=concat_pool, bn_final=bn_final)
    learn = cnn_learner(data, backbone, metrics=[root_mean_squared_error],
                        loss_func=nn.MSELoss(), custom_head=None)
    print(learn.model)

    checkpoint_name = f'{backbone()._get_name()}_rf{valid_fold}'
    callbacks = [
        EarlyStoppingCallback(learn, monitor='root_mean_squared_error',
                              min_delta=1e-5, patience=5),
        SaveModelCallback(learn, monitor='root_mean_squared_error',
                          name=checkpoint_name, every='improvement'),
    ]
    print(f'=====Fold:{valid_fold}, Total epoch:{epoch}, {conf_name}, backbone:{backbone_name}=========')

    if unfreeze:
        learn.freeze_to(-2)
    learn.fit_one_cycle(epoch, callbacks=callbacks)

    oof_val = get_oof_df(learn, DatasetType.Valid)
    oof_test = get_oof_df(learn, DatasetType.Test)

    os.makedirs('./output/stacking/', exist_ok=True)
    import socket
    host_name = socket.gethostname()
    val_len = len(learn.data.valid_ds.items)
    train_len = len(learn.data.train_ds.items)

    from sklearn.metrics import accuracy_score
    best_score = accuracy_score(oof_val.iloc[:, 0].astype(int), oof_val.iloc[:, -1].astype(int))

    oof_file = (f'./output/stacking/{version}_{host_name[:5]}_s{best_score:6.5f}_'
                f'{conf_name}_f{valid_fold}_val{val_len}_trn{train_len}.h5')
    print(f'Stacking file saved to: {oof_file}')
    save_stack_feature(oof_val, oof_test, oof_file)
                loss_func=CRITERION, metrics=[METRIC], opt_func=OPTIMIZER, wd=WD)

set_BN_momentum(learn.model, n_acc=N_ACC)
learn.clip_grad(1.)
learn.model = convert_model(learn.model)
learn.model = nn.DataParallel(learn.model).to(DEVICE)

# init callbacks
csv_logger = CSVLogger(learn=learn, filename=f'{LOGGING_FOLDER}/fit_trace', append=True)
early_stopping = EarlyStoppingCallback(learn=learn, monitor='valid_loss', patience=PATIENCE)
save_model = SaveModelCallback(learn=learn, monitor='valid_loss', name='inter_model', every='epoch')
acc_grad = AccumulateStep(learn, N_ACC)

# fit one cycle
learn.fit_one_cycle(
    cyc_len=NUM_EPOCHS,
    max_lr=LEARNING_RATE,
    div_factor=DIV_FACTOR,
    final_div=DIV_FACTOR,
    annealing_func=ANNEALING,
    start_epoch=START_EPOCH,
from fastai.callbacks import EarlyStoppingCallback


def cb_estop(learner, patience=5, min_delta=0.01, monitor='accuracy'):
    # Convenience wrapper that builds an early-stopping callback for the given Learner.
    return EarlyStoppingCallback(learner, patience=patience,
                                 min_delta=min_delta, monitor=monitor)
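# Minimal usage sketch for cb_estop (assumes `learn` is a fastai v1 Learner tracking an
# accuracy metric so monitor='accuracy' is available; purely illustrative):
learn.fit_one_cycle(30, max_lr=3e-3, callbacks=[cb_estop(learn, patience=3)])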