Example #1
def run_training(fold_number):
    seed_everything(config.seed)
    device = torch.device(config.device)

    # read csv
    data_frame = read_csv(config.train_csv)

    # create stratify kfold
    df_folds = kfold(data_frame)

    # create dataset
    train_dataset = WheatDataset(
        image_ids=df_folds[df_folds['fold'] != fold_number].index.values,
        data_frame=data_frame,
        transforms=get_train_transforms(),
        test=False,
    )

    validation_dataset = WheatDataset(
        image_ids=df_folds[df_folds['fold'] == fold_number].index.values,
        data_frame=data_frame,
        transforms=get_valid_transforms(),
        test=True,
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        sampler=RandomSampler(train_dataset),
        pin_memory=False,
        drop_last=True,
        num_workers=config.num_workers,
        collate_fn=collate_fn,
    )
    val_loader = torch.utils.data.DataLoader(
        validation_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        sampler=SequentialSampler(validation_dataset),
        pin_memory=False,
        collate_fn=collate_fn,
    )

    # model
    model = get_net()
    if len(config.gpu_ids) > 1:
        model = nn.DataParallel(model)
    model.to(device)

    # training
    trainer = Trainner(model=model, config=config, fold_number=fold_number)
    trainer.train(train_loader, val_loader)
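
The DataLoaders in Example #1 rely on a collate_fn that the snippet does not define. A minimal sketch, assuming WheatDataset returns (image, target, image_id) tuples whose targets hold a variable number of boxes per image:

def collate_fn(batch):
    # Detection targets vary in size per image, so the default collate cannot
    # stack them into one tensor; regroup the tuples into parallel tuples instead.
    return tuple(zip(*batch))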
Example #2
def train():

    path_to_data = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds = '../../data/predictions/'

    version = '1.1'
    random_seed = 8675309
    sample_size = 50000
    n_folds = 5

    params = {
        'nthread': 8,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1
    }

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)
    oof_train, oof_test = utils.kfold(classifier_builder=LightGBMWrapper,
                                      base_classifier=lightgbm.LGBMClassifier,
                                      classifier_params=params,
                                      train=train,
                                      labels=labels,
                                      test=test,
                                      n_folds=n_folds,
                                      random_seed=random_seed,
                                      use_smote=True)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'lightgbm': oof_train
    })
    #    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    #    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-lightgbm.csv', index=False)
    df_oof_test.to_csv(path_to_output + version + '-lightgbm.csv', index=False)
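
utils.kfold itself is not shown in these snippets. A minimal sketch of the out-of-fold scheme it is assumed to implement, with the classifier_builder and use_smote handling omitted and the helper name purely illustrative:

import numpy as np
from sklearn.model_selection import StratifiedKFold

def kfold_oof_sketch(base_classifier, classifier_params, train, labels, test,
                     n_folds, random_seed):
    # train/test are assumed to be pandas DataFrames, labels a 1-d array-like.
    labels = np.asarray(labels)
    oof_train = np.zeros(len(train))
    oof_test = np.zeros(len(test))
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                          random_state=random_seed)
    for trn_idx, val_idx in skf.split(train, labels):
        clf = base_classifier(**classifier_params)
        clf.fit(train.iloc[trn_idx], labels[trn_idx])
        # Out-of-fold predictions for the held-out slice of the training set...
        oof_train[val_idx] = clf.predict_proba(train.iloc[val_idx])[:, 1]
        # ...and test predictions averaged over the folds.
        oof_test += clf.predict_proba(test)[:, 1] / n_folds
    return oof_train, oof_test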
Example #3
def train():

    path_to_data   = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds  = '../../data/predictions/'

    version = '1.3'
    random_seed = 8675309
    sample_size = None
    n_folds = 5

    xgb_params = {
        'learning_rate': 0.1,
        'n_estimators': 10000,
        'max_depth': 4,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'nthread': 8,
        'seed': random_seed,
        'scale_pos_weight': 2.5,
        'reg_alpha': 1.2,
        'early_stopping_rounds': 50,
        'verbose': 20,
        'eval_metric': 'auc'
    }

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)
    oof_train, oof_test = utils.kfold(classifier_builder=XgboostWrapper,
                                      base_classifier=XGBClassifier,
                                      classifier_params=xgb_params,
                                      train=train,
                                      labels=labels,
                                      test=test,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'xgboost': oof_train
    })
    df_oof_train.fillna(0, inplace=True)
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-xgboost.csv', index=False)
    df_oof_test.to_csv(path_to_output + version + '-xgboost.csv', index=False)
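
The xgb_params above mix constructor parameters with options such as early_stopping_rounds, verbose and eval_metric, which the xgboost sklearn API of that era expected as fit() keyword arguments. A hypothetical sketch of how the assumed XgboostWrapper could separate them (the class name and fit signature are illustrative only, and newer xgboost releases move some of these options back to the constructor):

from xgboost import XGBClassifier

class XgboostWrapperSketch:
    # Keys that belong to fit() rather than to the constructor.
    FIT_KEYS = ('early_stopping_rounds', 'verbose', 'eval_metric')

    def __init__(self, **params):
        self.fit_params = {k: params.pop(k)
                           for k in list(params) if k in self.FIT_KEYS}
        self.clf = XGBClassifier(**params)

    def fit(self, x_train, y_train, x_valid, y_valid):
        # Early stopping monitors the held-out fold passed in as eval_set.
        self.clf.fit(x_train, y_train,
                     eval_set=[(x_valid, y_valid)],
                     **self.fit_params)

    def predict_proba(self, x):
        return self.clf.predict_proba(x)[:, 1]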
Example #4
    ############################################################
    # k-fold train/test
    ############################################################
    training_start_time = time.time()
    timestamp = str(training_start_time)
    running_score = []
    plateaued = False
    try:
        print('Performing {} iteration(s) of training.'.format(config.ITERATIONS))
        for iteration in range(config.ITERATIONS):
            if plateaued is True:
                break

            print('Starting iteration #{}'.format(iteration))

            for d, v_in, v_out, t_in, t_out in utils.kfold(input_data, output_data, config.K_FOLDS):
                print('Performing k-fold #{:02}'.format(d))

                if config.TRAIN_MODEL is True:

                    if config.RANDOMIZE_INPUT_DATA is True:
                        tf_log_dir = os.path.join(
                            config.TENSORBOARD_LOG_DATA, 'rand')
                    elif config.CONTIGUOUS_INPUT_DATA is True:
                        tf_log_dir = os.path.join(
                            config.TENSORBOARD_LOG_DATA, 'cont')
                    else:
                        tf_log_dir = os.path.join(
                            config.TENSORBOARD_LOG_DATA, 'step')

                    tf_log_dir = os.path.join(
Example #5
def train():

    path_to_data = '../../data/processed/'
    path_to_output = '../../data/submissions/'
    path_to_preds = '../../data/predictions/'

    version = '1.3'
    random_seed = 8675309
    sample_size = None
    n_folds = 5

    rf_params = {'n_jobs': -1, 'n_estimators': 100}

    lr_params = {'C': 0.001}

    et_params = {}
    nb_params = {}

    train, labels, test, train_ids, test_ids = utils.load_features(
        path_to_data, version, sample_size)

    train_df = train.fillna(0)
    train_df.replace(np.inf, 0, inplace=True)
    train_df.replace(-np.inf, 0, inplace=True)

    test_df = test.fillna(0)
    test_df.replace(np.inf, 0, inplace=True)
    test_df.replace(-np.inf, 0, inplace=True)

    # ------------------------------------------------------------------------
    #    Start training models.
    # ------------------------------------------------------------------------

    # Start with RandomForest
    oof_train, oof_test = utils.kfold(classifier_builder=SklearnWrapper,
                                      base_classifier=RandomForestClassifier,
                                      classifier_params=rf_params,
                                      train=train_df,
                                      labels=labels,
                                      test=test_df,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'random-forest': oof_train
    })
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-random-forest.csv',
                        index=False)
    df_oof_test.to_csv(path_to_output + version + '-random-forest.csv',
                       index=False)
    del oof_test, oof_train, df_oof_test, df_oof_train

    # Extra trees
    oof_train, oof_test = utils.kfold(classifier_builder=SklearnWrapper,
                                      base_classifier=ExtraTreesClassifier,
                                      classifier_params=et_params,
                                      train=train_df,
                                      labels=labels,
                                      test=test_df,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'extra-trees': oof_train
    })
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-extra-trees.csv',
                        index=False)
    df_oof_test.to_csv(path_to_output + version + '-extra-trees.csv',
                       index=False)
    del oof_test, oof_train, df_oof_test, df_oof_train

    # Naive Bayes
    oof_train, oof_test = utils.kfold(classifier_builder=SklearnWrapper,
                                      base_classifier=GaussianNB,
                                      classifier_params=nb_params,
                                      train=train_df,
                                      labels=labels,
                                      test=test_df,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'naive-bayes': oof_train
    })
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-naive-bayes.csv',
                        index=False)
    df_oof_test.to_csv(path_to_output + version + '-naive-bayes.csv',
                       index=False)

    # Logistic Regression
    oof_train, oof_test = utils.kfold(classifier_builder=SklearnWrapper,
                                      base_classifier=LogisticRegression,
                                      classifier_params=lr_params,
                                      train=train_df,
                                      labels=labels,
                                      test=test_df,
                                      n_folds=n_folds,
                                      random_seed=random_seed)

    df_oof_train = pd.DataFrame({
        'SK_ID_CURR': train_ids,
        'TARGET': labels,
        'logistic-regression': oof_train
    })
    df_oof_train['SK_ID_CURR'] = df_oof_train['SK_ID_CURR'].astype('int32')

    df_oof_test = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': oof_test})
    df_oof_test['SK_ID_CURR'] = df_oof_test['SK_ID_CURR'].astype('int32')

    df_oof_train.to_csv(path_to_preds + version + '-logistic-regression.csv',
                        index=False)
    df_oof_test.to_csv(path_to_output + version + '-logistic-regression.csv',
                       index=False)
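
The per-model OOF files written by Examples #2, #3 and #5 share the SK_ID_CURR/TARGET layout, so they can be joined into a second-level training set for stacking. A minimal, hypothetical sketch following the file-naming pattern used above:

import pandas as pd

def build_stacking_frame(path_to_preds, version, model_names):
    merged = None
    for name in model_names:
        df = pd.read_csv('{}{}-{}.csv'.format(path_to_preds, version, name))
        # The first file keeps SK_ID_CURR and TARGET; later files contribute
        # only their model column, joined on SK_ID_CURR.
        merged = df if merged is None else merged.merge(
            df.drop(columns=['TARGET']), on='SK_ID_CURR')
    return merged

# For example:
# stack_train = build_stacking_frame('../../data/predictions/', '1.3',
#                                    ['random-forest', 'extra-trees',
#                                     'naive-bayes', 'logistic-regression'])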