Example #1
def main():
    args = parse_args()
    n, prefix = args['n_epochs'], args['prefix']
    bs, img_sz = args['batch_size'], args['image_size']
    prefix += '_' if prefix else ''

    bunch = create_data_bunch(bs,
                              img_sz,
                              args['train_size'],
                              args['valid_size'],
                              use_cache=args['use_cache'])
    train_sz = len(bunch.train_dl) // bunch.c
    valid_sz = len(bunch.valid_dl) // bunch.c
    learn = create_cnn(bunch, args['network'])
    learn.metrics = [accuracy, error_rate]

    if args['continue']:
        log.info('Continue training using cached data')

    log.info('Epochs: %d', args['n_epochs'])
    log.info('Model: %s', args['network_name'])
    log.info('# of classes: %d', bunch.c)
    log.info('Train size (per class): %d', train_sz)
    log.info('Valid size (per class): %d', valid_sz)

    if args['continue']:
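        # resume: restore the previously saved weights and keep training the
        # whole network, checkpointing whenever valid_loss improves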
        cbs = [SaveModelCallback(learn, name='bestmodel_continue')]

        try:
            learn.load(f'{prefix}final_224')
        except Exception as e:
            log.error('Cannot restore model')
            log.error(e)
            sys.exit(1)

        learn.unfreeze()
        learn.fit_one_cycle(n, callbacks=cbs, max_lr=slice(3e-5, 3e-5))
        learn.save(f'{prefix}continued_224')

    else:
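        # fresh run: fastai's staged fine-tuning (train the head first, then
        # progressively unfreeze with discriminative learning rates)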
        cbs = [SaveModelCallback(learn)]

        learn.fit_one_cycle(1)
        learn.save(f'{prefix}one_224')

        learn.freeze_to(-2)
        learn.fit_one_cycle(n - 2, max_lr=slice(1e-4, 1e-3))
        learn.save(f'{prefix}unfreeze_224')

        learn.unfreeze()
        learn.fit_one_cycle(1, callbacks=cbs, max_lr=slice(10e-5, 5e-5))
        learn.save(f'{prefix}final_224')

    log.info('Done!')
Example #2
def train_model(model, epochs, lr, wd, module_string, ct, path):
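    # NOTE: db, loss_func, bs, bn_wd and true_wd are assumed to be
    # module-level globals defined elsewhere in the original script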
    plt.close('all')
    learn = basic_train.Learner(data=db,
                                model=model,
                                loss_func=loss_func,
                                wd=wd,
                                callback_fns=[ActivationStats],
                                bn_wd=bn_wd,
                                true_wd=true_wd)
    start = time.perf_counter()
    if ct:
        learn.load(path)
        print('Model loaded: ', path)
    learn.fit_one_cycle(
        epochs,
        max_lr=lr,
        wd=wd,
        callbacks=[
            SaveModelCallback(learn,
                              every='improvement',
                              monitor='valid_loss',
                              name='best_%s_bs%s_lr%.0e_wd%.0e' %
                              (module_string, bs, lr, wd))
        ])
    end = time.perf_counter()
    delta_t = end - start
    return learn, delta_t
Example #3
    def _get_callbacks(self, name):
        """Retrieve callbacks to be used for training.

        Args:
            name (str): Name of training stage (used to save files).
                csv saved in: f'saved/model_csv/{exp_name}_{name}.csv'
                model weights saved in: f'saved/model_weights/{exp_name}_{name}.pth'

        Returns:
            List of Callbacks.
        """
        callbacks = []

        # Logs metrics for each training stage
        callbacks.append(CSVLogger(
            learn=self.learn,
            append=False,
            filename=os.path.join(CSV_FOLDER, f'{self.exp_name}_{name}')
        ))

        # Saves the best model weights
        callbacks.append(SaveModelCallback(
            learn=self.learn,
            # Also loads best model weights at the end of training
            every='improvement',
            name=f'{self.exp_name}_{name}',
        ))

        return callbacks
Example #4
def train_lm(path,
             filename,
             model='AWD_LSTM',
             epochs=8,
             pretrained_fnames=None,
             preds=True):

    #get data after running preprocess
    print(f'loading data from {path}/{filename};')
    data_lm = load_data(path, filename, bs=64, bptt=70)

    #change config if XL
    if model == 'XL':
        config = tfmerXL_lm_config.copy()
        config['mem_len'] = 150
        config['output_p'] = 0.1
        config['embed_p'] = 0.1
        config['ff_p'] = 0.1
        config['resid_p'] = 0.1
        config['d_inner'] = 1024
        config['d_model'] = 128
    else:
        config = None

    #load pretrained weights
    if pretrained_fnames: pretrained_fnames = pretrained_fnames.split(',')
    learn = language_model_learner(data_lm,
                                   models[model],
                                   config=config,
                                   pretrained=False,
                                   pretrained_fnames=pretrained_fnames)
    print(f'training lm model {model}; pretrained from {pretrained_fnames};')

    #save the best model and stop early when valid_loss stops improving
    cb = [SaveModelCallback(learn), EarlyStoppingCallback(learn)]

    if pretrained_fnames:
        #layered training
        print(f'training lm model head;')
        learn.fit_one_cycle(1, 3e-3, moms=(0.8, 0.7))
        print(f'saving lm model head to {path}/{filename}_head;')
        learn.save(filename + '_head')
        learn.unfreeze()

    print(f'training for {epochs} epochs')
    learn.fit_one_cycle(epochs, 3e-4, moms=(0.8, 0.7), callbacks=cb)
    print(f'saving model to {path}/{filename}_finetuned')
    learn.save(filename + '_finetuned')

    #generate outputs from validation set
    if preds:
        preds_fname = f'{filename}_{model}_preds.txt'
        print(f'generating predictions and saving to {path}/{preds_fname};')
        get_valid_preds(learn, data_lm, preds_fname)
Example #5
def multi_train(get_learn, epoch_len, epochs, opts, lrs, checkpoints, tb_log_root, autoSave=True):
    '''
    Training can resume from a checkpoint; to keep the run continuous, the lr
    must be set manually to match the value used when the checkpoint was saved.
    '''
    # clean out the tensorboard log dir
    if os.path.exists(tb_log_root): shutil.rmtree(tb_log_root)
    os.mkdir(tb_log_root)

    if not os.path.exists('./run_log/'): os.mkdir('./run_log/')
    txtlog = open('./run_log/log.txt', mode='w')
    for i, (opt, lr, checkpoint) in enumerate(zip(opts, lrs, checkpoints)):
        # create a learner
        learn = get_learn()

        # set optimizer
        learn.opt_func = opt

        # load checkpoint
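        # (fastai v1's Learner.load also accepts an open binary stream,
        # bypassing the default models/ directory lookup)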
        if checkpoint is not None:
            with open(checkpoint, 'rb') as f:
                learn.load(f)

        # record this run in the txt log
        csv_log_dir = 'csv_log/'
        if not os.path.exists(learn.path/csv_log_dir): os.mkdir(learn.path/csv_log_dir)
        csv_fname = csv_log_dir + f'run_{i}'
        txt_write(txtlog,i,opt,lr,learn.path,csv_fname)

        callbacks = []
        # get csvlogger callback
        csvLog = CSVLogger(learn, filename=csv_fname)
        callbacks += [csvLog]

        if autoSave:
            # save-model callback: checkpoint on each valid_loss improvement
            saveCb = SaveModelCallback(learn, monitor='valid_loss', mode='min', every='improvement', name=f'run_{i}')
            callbacks += [saveCb]

        # get tensorboard callback
        tbCb = get_tbCb(learn, tb_log_root + f'run_{i}')
        callbacks += [tbCb]

        # train
        fit(learn=learn, epoch_len=epoch_len, epochs=epochs, lr=lr, callbacks=callbacks)

    txtlog.close()
Example #6
def main2(data_dir, output_dir, epochs):

    create_dir(output_dir)

    em_sz, nh, nl = 400, 1150, 3
    wd = 1e-7
    bptt = 70
    opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
    bs = 32
    lr = 1e-3

    last_epoch = get_last_epoch(path.join(data_dir, 'models'))

    print('\033[1;34m', 'Loading data', '\033[0;0m')
    data = load_data(data_dir, 'data_save.pkl')
    model = language_model_learner(data,
                                   text.models.AWD_LSTM,
                                   drop_mult=0.5,
                                   metrics=[accuracy])

    try:
        print('\033[1;34m', 'Loading checkpoint', '\033[0;0m')
        model.load("last")
        print('\033[0;32m', 'Loaded last checkpoint', '\033[0;0m')
    except FileNotFoundError:
        print('\033[1;31m', 'No checkpoint found', '\033[0;0m')

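    # every='epoch' makes SaveModelCallback write a numbered checkpoint
    # (f'{name}_{epoch}') after each epoch rather than tracking only the best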
    model.fit(epochs,
              lr=slice(lr / 2.6, lr),
              wd=wd,
              callbacks=[
                  SaveModelCallback(model,
                                    every='epoch',
                                    monitor='accuracy',
                                    name=f'check_{last_epoch}')
              ])

    print('\033[0;32m', 'Saving model', '\033[0;0m')
    model.save("last")
    model.export("model.pkl")
Example #7
def create_callbacks(learn):
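    # fastai defaults: EarlyStoppingCallback and SaveModelCallback both track
    # valid_loss; CSVLogger writes metrics to learn.path/'history.csv'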
    return [
        EarlyStoppingCallback(learn, patience=3),
        SaveModelCallback(learn),
        CSVLogger(learn)]
Example #8
def train(train_dataset: torch.utils.data.Dataset,
          test_dataset: torch.utils.data.Dataset,
          training_config: dict = train_config,
          global_config: dict = global_config):
    """
    Template training routine. Takes a training and a test dataset wrapped
    as torch.utils.data.Dataset type and two corresponding generic
    configs for both global path settings and training settings.
    Returns the fitted fastai.train.Learner object which can be
    used to assess the resulting metrics and error curves etc.
    """

    for path in global_config.values():
        create_dirs(path)

    # wrap datasets with DataLoader classes
    train_loader = torch.utils.data.DataLoader(
        train_dataset, **training_config["DATA_LOADER_CONFIG"])
    test_loader = torch.utils.data.DataLoader(
        test_dataset, **training_config["DATA_LOADER_CONFIG"])
    databunch = DataBunch(train_loader, test_loader)

    # instantiate model and learner
    if training_config["WEIGHTS"] is None:
        model = training_config["MODEL"](**training_config["MODEL_CONFIG"])
    else:
        model = load_model(training_config["MODEL"],
                           training_config["MODEL_CONFIG"],
                           training_config["WEIGHTS"],
                           training_config["DEVICE"])

    learner = Learner(databunch,
                      model,
                      metrics=training_config["METRICS"],
                      path=global_config["ROOT_PATH"],
                      model_dir=global_config["WEIGHT_DIR"],
                      loss_func=training_config["LOSS"])

    # model name & paths
    name = "_".join([training_config["DATE"], training_config["SESSION_NAME"]])
    modelpath = os.path.join(global_config["WEIGHT_DIR"], name)

    if training_config["MIXED_PRECISION"]:
        learner.to_fp16()

    learner.save(modelpath)

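    # let cuDNN benchmark convolution algorithms (fastest for fixed input sizes)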
    torch.backends.cudnn.benchmark = True

    cbs = [
        SaveModelCallback(learner),
        LearnerTensorboardWriter(
            learner,
            Path(os.path.join(global_config["LOG_DIR"]), "tensorboardx"),
            name),
        TerminateOnNaNCallback()
    ]

    # perform training iteration
    try:
        if training_config["ONE_CYCLE"]:
            learner.fit_one_cycle(training_config["EPOCHS"],
                                  max_lr=training_config["LR"],
                                  callbacks=cbs)
        else:
            learner.fit(training_config["EPOCHS"],
                        lr=training_config["LR"],
                        callbacks=cbs)
    # save model files
    except KeyboardInterrupt:
        learner.save(modelpath)
        raise

    learner.save(modelpath)
    val_loss = min(learner.recorder.val_losses)
    val_metrics = learner.recorder.metrics

    # log using the logging tool
    logger = log.Log(training_config, run_name=training_config['SESSION_NAME'])
    logger.log_metric('Validation Loss', val_loss)
    logger.log_metrics(val_metrics)
    logger.end_run()

    # write csv log file
    log_content = training_config.copy()
    log_content["VAL_LOSS"] = val_loss
    log_content["VAL_METRICS"] = val_metrics
    log_path = os.path.join(global_config["LOG_DIR"], training_config["LOGFILE"])
    write_log(log_path, log_content)

    return learner, log_content, name
print(learn.callback_fns)

# --- TRAINING ---
if config['FINETUNE']:
    stage0_logger = pd.read_csv(
        learn.path / 'logs_{}fold.csv'.format(config['FOLD_NUMBER']))
    best_epoch = stage0_logger['metric_tot'].idxmax()

    learn.load('{0}{1}/models/{1}_{3}_{2}'.format(config['PATH_WEIGHTS'],
                                                  config['MODEL_NAME'],
                                                  best_epoch, 'stage0'))
    learn.unfreeze()

    checkpoint_callback = SaveModelCallback(learn,
                                            name=config['MODEL_NAME'] +
                                            '_stage1',
                                            every='epoch',
                                            monitor='valid_loss')
    # reduce lr by factor after patience epochs
    reduce_lr_callback = ReduceLROnPlateauCallback(learn,
                                                   monitor='metric_tot',
                                                   factor=0.5,
                                                   patience=5,
                                                   min_lr=1e-6)
    logger = CSVLogger(learn, 'logs_{}fold'.format(config['FOLD_NUMBER']))

    learn.fit(
        40,
        lr=1e-2 / 10.,
        wd=0.,
        callbacks=[checkpoint_callback, reduce_lr_callback,
                   logger])  # assumed: the CSVLogger above completes the truncated call