parser.add_argument('-data_dir', type=str, default='data/')
    return parser


if __name__ == '__main__':

    parser = build_parser()
    args = parser.parse_args()

    model_name = args.model_name
    model_type = 'double' if model_name == 'double_albert' else 'siamese'
    checkpoint_dir = args.checkpoint_dir
    log_dir = args.log_dir
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    main_logger = init_logger(log_dir, f'finetune_main_{model_name}.log')

    # Import data
    test = pd.read_csv(os.path.join(args.data_dir, 'test.csv'))
    train = pd.read_csv(os.path.join(args.data_dir, 'train.csv'))

    # Min-max scale targets after a rank transformation
    for col in TARGETS:
        train[col] = train[col].rank(method="average")
    train[TARGETS] = MinMaxScaler().fit_transform(train[TARGETS])
    y = train[TARGETS].values
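    # y is now an (n_samples, len(TARGETS)) float array in [0, 1]; ranking
    # first makes the min-max scaling robust to outliers and skewed targets.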

    # Get model inputs
    ids_train, seg_ids_train = tokenize(
        train, pretrained_model_str=pretrained_models[model_name])
    cat_features_train, _ = get_ohe_categorical_features(
def main():
    args = argparser()
    config_folder = Path(args.train_cfg.strip("/"))
    # config_folder = Path('experiments/albunet_public/01_train_config_part0.yaml'.strip("/"))
    experiment_folder = config_folder.parents[0]

    train_config = load_yaml(config_folder)

    log_dir = Path(experiment_folder, train_config['LOGGER_DIR'])
    log_dir.mkdir(exist_ok=True, parents=True)

    main_logger = init_logger(log_dir, 'train_main.log')

    seed = train_config['SEED']
    init_seed(seed)
    main_logger.info(train_config)

    if "DEVICE_LIST" in train_config:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
            map(str, train_config["DEVICE_LIST"]))
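        # CUDA reads this variable when its runtime initializes, so it must
        # be set before the first CUDA call touches the GPUs.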

    pipeline_name = train_config['PIPELINE_NAME']

    train_transform = albu.load(train_config['TRAIN_TRANSFORMS'])
    valid_transform = albu.load(train_config['VALID_TRANSFORMS'])

    non_empty_mask_proba = train_config.get('NON_EMPTY_MASK_PROBA', 0)
    use_sampler = train_config['USE_SAMPLER']

    dataset_folder = train_config['DATA_DIRECTORY']
    folds_distr_path = train_config['FOLD']['FILE']

    num_workers = train_config['WORKERS']
    batch_size = train_config['BATCH_SIZE']
    n_folds = train_config['FOLD']['NUMBER']

    usefolds = map(str, train_config['FOLD']['USEFOLDS'])
    # local_metric_fn, global_metric_fn = init_eval_fns(train_config)

    binarizer_module = importlib.import_module(
        train_config['MASK_BINARIZER']['PY'])
    binarizer_class = getattr(binarizer_module,
                              train_config['MASK_BINARIZER']['CLASS'])
    binarizer_fn = binarizer_class(**train_config['MASK_BINARIZER']['ARGS'])
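    # The binarizer is resolved dynamically from the config. A hypothetical
    # YAML entry (module, class and argument names are illustrative, not
    # taken from the source):
    #
    #   MASK_BINARIZER:
    #     PY: src.utils.mask_binarizers
    #     CLASS: TripletMaskBinarization
    #     ARGS: {triplets: [[0.75, 1000, 0.3]]}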

    eval_module = importlib.import_module(
        train_config['EVALUATION_METRIC']['PY'])
    eval_fn = getattr(eval_module, train_config['EVALUATION_METRIC']['CLASS'])
    eval_fn = functools.partial(eval_fn,
                                **train_config['EVALUATION_METRIC']['ARGS'])
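    # functools.partial bakes the config ARGS into eval_fn, so downstream
    # code only has to pass the runtime arguments (e.g. predictions and
    # targets).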

    for fold_id in usefolds:
        main_logger.info('Starting training of fold {}...'.format(fold_id))

        train_dataset = BodyMorpDataset(data_folder=dataset_folder,
                                        mode='train',
                                        transform=train_transform,
                                        fold_index=fold_id,
                                        folds_distr_path=folds_distr_path)
        train_sampler = PartDataSampler(folds_distr_path, fold_id,
                                        non_empty_mask_proba)
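        # DataLoader rejects sampler= combined with shuffle=True, so the two
        # loading modes are kept as mutually exclusive branches.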
        if use_sampler:
            train_dataloader = DataLoader(dataset=train_dataset,
                                          batch_size=batch_size,
                                          num_workers=num_workers,
                                          sampler=train_sampler)
        else:
            train_dataloader = DataLoader(dataset=train_dataset,
                                          batch_size=batch_size,
                                          num_workers=num_workers,
                                          shuffle=True)

        valid_dataset = BodyMorpDataset(
            data_folder=dataset_folder,
            mode='val',
            transform=valid_transform,
            fold_index=fold_id,
            folds_distr_path=folds_distr_path,
        )
        valid_dataloader = DataLoader(dataset=valid_dataset,
                                      batch_size=batch_size,
                                      num_workers=num_workers,
                                      shuffle=False)

        train_fold(train_config, experiment_folder, pipeline_name, log_dir,
                   fold_id, train_dataloader, valid_dataloader, binarizer_fn,
                   eval_fn)
def train_fold(train_config, experiment_folder, pipeline_name, log_dir,
               fold_id, train_dataloader, valid_dataloader, binarizer_fn,
               eval_fn):

    fold_logger = init_logger(log_dir, 'train_fold_{}.log'.format(fold_id))

    best_checkpoint_folder = Path(experiment_folder,
                                  train_config['CHECKPOINTS']['BEST_FOLDER'])
    best_checkpoint_folder.mkdir(exist_ok=True, parents=True)

    checkpoints_history_folder = Path(
        experiment_folder, train_config['CHECKPOINTS']['FULL_FOLDER'],
        'fold{}'.format(fold_id))
    checkpoints_history_folder.mkdir(exist_ok=True, parents=True)
    checkpoints_topk = train_config['CHECKPOINTS']['TOPK']

    calculation_name = '{}_fold{}'.format(pipeline_name, fold_id)

    device = train_config['DEVICE']

    module = importlib.import_module(train_config['MODEL']['PY'])
    model_class = getattr(module, train_config['MODEL']['CLASS'])
    model = model_class(**train_config['MODEL']['ARGS'])

    pretrained_model_config = train_config['MODEL'].get('PRETRAINED', False)
    if pretrained_model_config:
        loaded_pipeline_name = pretrained_model_config['PIPELINE_NAME']
        pretrained_model_path = Path(
            pretrained_model_config['PIPELINE_PATH'],
            pretrained_model_config['CHECKPOINTS_FOLDER'],
            '{}_fold{}.pth'.format(loaded_pipeline_name, fold_id))
        if pretrained_model_path.is_file():
            model.load_state_dict(torch.load(pretrained_model_path))
            fold_logger.info(
                'load model from {}'.format(pretrained_model_path))

    if len(train_config.get('DEVICE_LIST', [])) > 1:
        model = torch.nn.DataParallel(model)
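        # Note: DataParallel prefixes parameter names with 'module.', which
        # matters if these checkpoints are later loaded without DataParallel.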

    module = importlib.import_module(train_config['CRITERION']['PY'])
    loss_class = getattr(module, train_config['CRITERION']['CLASS'])
    loss_fn = loss_class(**train_config['CRITERION']['ARGS'])

    optimizer_class = getattr(torch.optim, train_config['OPTIMIZER']['CLASS'])
    optimizer = optimizer_class(model.parameters(),
                                **train_config['OPTIMIZER']['ARGS'])
    scheduler_class = getattr(torch.optim.lr_scheduler,
                              train_config['SCHEDULER']['CLASS'])
    scheduler = scheduler_class(optimizer, **train_config['SCHEDULER']['ARGS'])
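    # CLASS names are looked up directly on torch.optim and
    # torch.optim.lr_scheduler. A hypothetical config fragment (values are
    # illustrative, not taken from the source):
    #
    #   OPTIMIZER: {CLASS: Adam, ARGS: {lr: 0.0001}}
    #   SCHEDULER: {CLASS: StepLR, ARGS: {step_size: 10, gamma: 0.5}}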

    n_epoches = train_config['EPOCHES']
    grad_clip = train_config['GRADIENT_CLIPPING']
    grad_accum = train_config['GRADIENT_ACCUMULATION_STEPS']
    early_stopping = train_config['EARLY_STOPPING']
    validation_frequency = train_config.get('VALIDATION_FREQUENCY', 1)

    freeze_model = train_config['MODEL']['FREEZE']

    Learning(optimizer, binarizer_fn, loss_fn, eval_fn, device, n_epoches,
             scheduler, freeze_model, grad_clip, grad_accum, early_stopping,
             validation_frequency, calculation_name, best_checkpoint_folder,
             checkpoints_history_folder, checkpoints_topk,
             fold_logger).run_train(model, train_dataloader, valid_dataloader)
def main():
    args = argparser()
    config_folder = Path(args.train_cfg.strip("/"))
    experiment_folder = config_folder.parents[0]

    train_config = load_yaml(config_folder)

    log_dir = Path(experiment_folder, train_config['LOGGER_DIR'])
    log_dir.mkdir(exist_ok=True, parents=True)

    main_logger = init_logger(log_dir, 'train_main.log')

    seed = train_config['SEED']
    init_seed(seed)
    main_logger.info(train_config)

    if "DEVICE_LIST" in train_config:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
            map(str, train_config["DEVICE_LIST"]))

    pipeline_name = train_config['PIPELINE_NAME']

    train_transform = albu.load(train_config['TRAIN_TRANSFORMS'])
    valid_transform = albu.load(train_config['VALID_TRANSFORMS'])

    non_empty_mask_proba = train_config.get('NON_EMPTY_MASK_PROBA', 0)
    use_sampler = train_config['USE_SAMPLER']

    dataset_folder = train_config['DATA_DIRECTORY']
    folds_distr_path = train_config['FOLD']['FILE']

    num_workers = train_config['WORKERS']
    batch_size = train_config['BATCH_SIZE']
    n_folds = train_config['FOLD']['NUMBER']

    usefolds = map(str, train_config['FOLD']['USEFOLDS'])
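    # map() returns a one-shot iterator; usefolds is consumed exactly once
    # by the fold loop below.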
    local_metric_fn, global_metric_fn = init_eval_fns(train_config)

    for fold_id in usefolds:
        main_logger.info('Starting training of fold {}...'.format(fold_id))

        train_dataset = PneumothoraxDataset(
            data_folder=dataset_folder,
            mode='train',
            transform=train_transform,
            fold_index=fold_id,
            folds_distr_path=folds_distr_path,
        )
        train_sampler = PneumoSampler(folds_distr_path, fold_id,
                                      non_empty_mask_proba)
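        # PneumoSampler presumably oversamples non-empty-mask images
        # according to non_empty_mask_proba; its implementation lies outside
        # this snippet.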
        if use_sampler:
            train_dataloader = DataLoader(dataset=train_dataset,
                                          batch_size=batch_size,
                                          num_workers=num_workers,
                                          sampler=train_sampler)
        else:
            train_dataloader = DataLoader(dataset=train_dataset,
                                          batch_size=batch_size,
                                          num_workers=num_workers,
                                          shuffle=True)

        valid_dataset = PneumothoraxDataset(
            data_folder=dataset_folder,
            mode='val',
            transform=valid_transform,
            fold_index=fold_id,
            folds_distr_path=folds_distr_path,
        )
        valid_dataloader = DataLoader(dataset=valid_dataset,
                                      batch_size=batch_size,
                                      num_workers=num_workers,
                                      shuffle=False)
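        # shuffle=False keeps the validation order fixed, so metrics from
        # different epochs are computed over identical batches.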

        train_fold(train_config, experiment_folder, pipeline_name, log_dir,
                   fold_id, train_dataloader, valid_dataloader,
                   local_metric_fn, global_metric_fn)
    parser.add_argument('-data_dir', type=str, default='data/')
    return parser


if __name__ == '__main__':

    parser = build_parser()
    args = parser.parse_args()

    model_name = args.model_name
    model_type = 'double' if model_name == 'double_albert' else 'siamese'
    checkpoint_dir = args.checkpoint_dir
    log_dir = args.log_dir
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    main_logger = init_logger(log_dir, f'train_main_{model_name}.log')

    # Import data
    test = pd.read_csv(os.path.join(args.data_dir, 'test.csv'))
    train = pd.read_csv(os.path.join(args.data_dir, 'train.csv'))

    # Min-max scale targets after a rank transformation
    for col in TARGETS:
        train[col] = train[col].rank(method="average")
    train[TARGETS] = MinMaxScaler().fit_transform(train[TARGETS])
    y = train[TARGETS].values

    # Get model inputs
    ids_train, seg_ids_train = tokenize(
        train, pretrained_model_str=pretrained_models[model_name])
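    # pretrained_models presumably maps the short model_name to a pretrained
    # checkpoint identifier; its definition lies outside this snippet.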
    cat_features_train, _ = get_ohe_categorical_features(
             validation_frequency, calculation_name, best_checkpoint_folder,
             checkpoints_history_folder, checkpoints_topk,
             fold_logger).run_train(model, train_dataloader, val_dataloader)


if __name__ == '__main__':
    args = argparser()
    config_file = Path(args['train_config'].strip('/'))
    experiment_folder = config_file.parents[0]

    train_config = helpers.load_yaml(config_file)

    log_dir = Path(experiment_folder, train_config['LOGGER_DIR'])
    log_dir.mkdir(parents=True, exist_ok=True)

    main_logger = helpers.init_logger(log_dir, 'train_main.log')

    seed = train_config['SEED']
    helpers.init_seed(seed)
    main_logger.info(train_config)

    if "DEVICE_LIST" in train_config:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            map(str, train_config['DEVICE_LIST']))

    pipeline_name = train_config['PIPELINE_NAME']
    dataset_folder = train_config['DATA_DIRECTORY']

    train_transform = albu.load(train_config['TRAIN_TRANSFORMS'])
    val_transform = albu.load(train_config['VAL_TRANSFORMS'])
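

# All of the snippets above rely on an init_logger(log_dir, file_name)
# helper whose implementation is not shown. A minimal sketch of such a
# helper, assuming it logs to both a file and stdout (illustrative only,
# not the original implementation):
import logging
import sys
from pathlib import Path


def init_logger_sketch(log_dir, file_name):
    """Create a logger that writes to log_dir/file_name and to stdout."""
    logger = logging.getLogger(file_name)
    logger.setLevel(logging.INFO)
    # Guard against duplicate handlers when the helper is called repeatedly.
    if not logger.handlers:
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        file_handler = logging.FileHandler(Path(log_dir, file_name))
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)
    return logger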