Пример #1
0
def create_linear_automl(
    task: Task,
    n_folds: int = 5,
    timeout: Optional[float] = None,
    n_reader_jobs: int = 1,
    cpu_limit: int = 4,
    random_state: int = 42,
):
    """Build a single-level AutoML made of one linear model.

    The function only constructs the objects; it does not fit anything.

    Args:
        task: Task handed to the reader.
        n_folds: Number of folds, passed to the reader as ``cv``.
        timeout: Stub, not used.
        n_reader_jobs: Passed to the reader as ``n_jobs``.
        cpu_limit: Thread count applied through ``torch.set_num_threads``.
        random_state: Random state passed to the reader.

    Returns:
        The constructed ``AutoML`` instance (one level, one pipeline).

    """
    # NOTE: this changes the torch thread count for the whole process.
    torch.set_num_threads(cpu_limit)

    reader = PandasToPandasReader(task,
                                  cv=n_folds,
                                  random_state=random_state,
                                  n_jobs=n_reader_jobs)
    pipe = LinearFeatures()
    model = LinearLBFGS()
    pipeline = MLPipeline([model],
                          pre_selection=None,
                          features_pipeline=pipe,
                          post_selection=None)
    automl = AutoML(reader, [[pipeline]], skip_conn=False)

    return automl
Пример #2
0
)
print("End creation blending...")

# Build the reader for a binary task. The keyword arguments are forwarded
# unchanged to PandasToPandasReader.
print("Start creation automl...")
reader = PandasToPandasReader(
    Task("binary", ),
    samples=None,
    max_nan_rate=1,
    max_constant_rate=1,
)

# Two levels: three pipelines on the first level and a single pipeline on the
# second one. The pipeline objects are created outside the visible fragment.
automl = AutoML(
    reader,
    [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ],
    skip_conn=False,
    blender=MeanBlender(),
)
print("End creation automl...")

print("Start fit automl...")
# Mapping of role -> column: the string "target" and a DatetimeRole instance
# are the keys; the values are presumably column names of `train` (verify
# against the data preparation code, which is not visible here).
roles = {
    "target": "TARGET",
    DatetimeRole(base_date=True, seasonality=(), base_feats=False):
    "report_dt",
}

# `train` is defined outside the visible fragment.
oof_pred = automl.fit_predict(train, roles=roles)
print("End fit automl...")
Пример #3
0
def test_permutation_importance_based_iterative_selector():
    """Run a two-level AutoML with an iterative permutation-importance selector.

    Loads the sampled application data from CSV, derives two date columns,
    fits the AutoML, logs feature scores and ROC AUC for out-of-fold and test
    predictions, then pickles the fitted object, loads it back and predicts
    again. The ``automl.pickle`` artifact is removed even if a step fails.
    """
    logging.basicConfig(format="[%(asctime)s] (%(levelname)s): %(message)s",
                        level=logging.DEBUG)

    logging.debug("Load data...")
    data = pd.read_csv("./examples/data/sampled_app_train.csv")
    logging.debug("Data loaded")

    logging.debug("Features modification from user side...")
    # Turn day offsets into date strings relative to a fixed base date.
    data["BIRTH_DATE"] = (
        np.datetime64("2018-01-01") +
        data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str)
    data["EMP_DATE"] = (np.datetime64("2018-01-01") +
                        np.clip(data["DAYS_EMPLOYED"], None, 0).astype(
                            np.dtype("timedelta64[D]"))).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data["constant"] = 1
    data["allnan"] = np.nan

    data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)
    logging.debug("Features modification finished")

    logging.debug("Split data...")
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data["TARGET"],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        "Data splitted. Parts sizes: train_data = {}, test_data = {}".format(
            train_data.shape, test_data.shape))

    logging.debug("Create task...")
    task = Task("binary")
    logging.debug("Task created")

    logging.debug("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug("Reader created")

    # selector parts
    logging.debug("Create feature selector")
    model0 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 64,
        "seed": 42,
        "num_threads": 5,
    })
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0,
                                          model0,
                                          pie,
                                          feature_group_size=1,
                                          max_features_cnt_in_result=15)
    logging.debug("Feature selector created")

    # pipeline 1 level parts
    logging.debug("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner1 and Model1...")
    model1 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 128,
        "seed": 1,
        "num_threads": 5,
    })
    logging.debug("\t Tuner1 and model1 created")

    logging.debug("\t ParamsTuner2 and Model2...")
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(default_params={
        "learning_rate": 0.025,
        "num_leaves": 64,
        "seed": 2,
        "num_threads": 5,
    })
    logging.debug("\t Tuner2 and model2 created")

    logging.debug("\t Pipeline1...")
    # model1 is used as is; model2 is paired with its tuner.
    pipeline_lvl1 = MLPipeline(
        [model1, (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug("Pipeline1 created")

    # pipeline 2 level parts
    logging.debug("Start creation pipeline_2...")
    pipe1 = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner and Model...")
    model = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "max_bin": 1024,
            "seed": 3,
            "num_threads": 5,
        },
        freeze_defaults=True,
    )
    logging.debug("\t Tuner and model created")

    logging.debug("\t Pipeline2...")
    pipeline_lvl2 = MLPipeline([model],
                               pre_selection=None,
                               features_pipeline=pipe1,
                               post_selection=None)
    logging.debug("Pipeline2 created")

    logging.debug("Create AutoML pipeline...")
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )

    logging.debug("AutoML pipeline created...")

    logging.debug("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"})
    logging.debug(
        "AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(
            time.time() - start_time))

    logging.debug("Feature importances of selector:\n{}".format(
        selector.get_features_score()))

    logging.debug("oof_pred:\n{}\nShape = {}".format(oof_pred, oof_pred.shape))

    logging.debug("Feature importances of top level algorithm:\n{}".format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        "Feature importances of lowest level algorithm - model 0:\n{}".format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        "Feature importances of lowest level algorithm - model 1:\n{}".format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug("Prediction for test data:\n{}\nShape = {}".format(
        test_pred, test_pred.shape))

    logging.debug("Check scores...")
    logging.debug("OOF score: {}".format(
        roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0])))
    logging.debug("TEST score: {}".format(
        roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))

    pickle_path = "automl.pickle"
    try:
        logging.debug("Pickle automl")
        with open(pickle_path, "wb") as f:
            pickle.dump(automl, f)

        logging.debug("Load pickled automl")
        with open(pickle_path, "rb") as f:
            automl = pickle.load(f)

        logging.debug("Predict loaded automl")
        test_pred = automl.predict(test_data)
        logging.debug("TEST score, loaded: {}".format(
            roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))
    finally:
        # Do not leave the artifact behind when any step above raises.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
Пример #4
0
    print("\t Tuner2 and model2 created")

    print("\t Pipeline1...")
    # Single-model pipeline; feature pre-selection is disabled (the selector
    # that used to be passed is left as a trailing comment).
    pipeline_lvl1 = MLPipeline(
        [model2],
        pre_selection=None,  # selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    print("Pipeline1 created")

    print("Create AutoML pipeline...")
    # One level containing the single pipeline built above.
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
        ],
        skip_conn=False,
    )

    print("AutoML pipeline created...")

    print("Start AutoML pipeline fit_predict...")
    # Wall-clock timing of fit_predict; `target` and `train_data` come from
    # code outside the visible fragment.
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": target})
    print("AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(
        time.time() - start_time))

    test_pred = automl.predict(test_data)
    print("Prediction for test data:\n{}\nShape = {}".format(
        test_pred, test_pred.shape))
Пример #5
0
def test_permutation_importance_based_iterative_selector():
    """Run a two-level AutoML with an iterative permutation-importance selector.

    Loads the sampled application data from CSV, derives two date columns,
    fits the AutoML, logs feature scores and ROC AUC for out-of-fold and test
    predictions, then pickles the fitted object, loads it back and predicts
    again. The ``automl.pickle`` artifact is removed even if a step fails.
    """
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn day offsets into date strings relative to a fixed base date.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    logging.debug('Create task...')
    task = Task('binary')
    logging.debug('Task created')

    logging.debug('Create reader...')
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug('Reader created')

    # selector parts
    logging.debug('Create feature selector')
    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': 5
    })
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0,
                                          model0,
                                          pie,
                                          feature_group_size=1,
                                          max_features_cnt_in_result=15)
    logging.debug('Feature selector created')

    # pipeline 1 level parts
    logging.debug('Start creation pipeline_1...')
    pipe = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner1 and Model1...')
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': 5
    })
    logging.debug('\t Tuner1 and model1 created')

    logging.debug('\t ParamsTuner2 and Model2...')
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': 5
    })
    logging.debug('\t Tuner2 and model2 created')

    logging.debug('\t Pipeline1...')
    # model1 is used as is; model2 is paired with its tuner.
    pipeline_lvl1 = MLPipeline([model1, (model2, params_tuner2)],
                               pre_selection=selector,
                               features_pipeline=pipe,
                               post_selection=None)
    logging.debug('Pipeline1 created')

    # pipeline 2 level parts
    logging.debug('Start creation pipeline_2...')
    pipe1 = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner and Model...')
    model = BoostLGBM(
        default_params={
            'learning_rate': 0.05,
            'num_leaves': 64,
            'max_bin': 1024,
            'seed': 3,
            'num_threads': 5
        },
        freeze_defaults=True,
    )
    logging.debug('\t Tuner and model created')

    logging.debug('\t Pipeline2...')
    pipeline_lvl2 = MLPipeline([model],
                               pre_selection=None,
                               features_pipeline=pipe1,
                               post_selection=None)
    logging.debug('Pipeline2 created')

    logging.debug('Create AutoML pipeline...')
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )

    logging.debug('AutoML pipeline created...')

    logging.debug('Start AutoML pipeline fit_predict...')
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={'target': 'TARGET'})
    logging.debug(
        'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
            time.time() - start_time))

    logging.debug('Feature importances of selector:\n{}'.format(
        selector.get_features_score()))

    logging.debug('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    logging.debug('Feature importances of top level algorithm:\n{}'.format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 0:\n{}'.format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 1:\n{}'.format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train_data['TARGET'].values, oof_pred.data[:, 0])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # Do not leave the artifact behind when any step above raises.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
Пример #6
0
def test_boostlgbm_and_linearlbfgs_in_one_automl_pipeline():
    """Fit a two-level AutoML mixing BoostLGBM and linear pipelines.

    The first level holds three pipelines (boosting with an importance-cutoff
    selector, LinearLBFGS, LinearL1CD with an iterative selector); the second
    level holds one LinearLBFGS pipeline. Predictions are blended with
    MeanBlender. ROC AUC scores are printed, and the fitted object is pickled,
    loaded back and used for prediction again. The ``automl.pickle`` artifact
    is removed even if a step fails.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn day offsets into date strings relative to a fixed base date.
    base_date = np.datetime64('2018-01-01')
    data['BIRTH_DATE'] = (base_date + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (
        base_date + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    logging.debug('Data splitted. Parts sizes: train_data = {}, test_data = {}'
                  .format(train.shape, test.shape))

    logging.debug('Start creation selector_0...')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM()
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0)
    logging.debug('End creation selector_0...')

    logging.debug('Start creation gbm_0...')
    feats_gbm_0 = LGBAdvancedPipeline()
    gbm_0 = BoostLGBM()
    gbm_1 = BoostLGBM()
    tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
    # gbm_0 is paired with its tuner; gbm_1 is used as is.
    gbm_lvl0 = MLPipeline(
        [(gbm_0, tuner_0), gbm_1],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None,
    )
    logging.debug('End creation gbm_0...')

    logging.debug('Start creation reg_0...')
    feats_reg_0 = LinearFeatures(output_categories=True)
    reg_0 = LinearLBFGS()
    reg_lvl0 = MLPipeline(
        [reg_0],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=HighCorrRemoval(corr_co=1),
    )
    logging.debug('End creation reg_0...')

    logging.debug('Start creation composed selector...')
    feat_sel_1 = LGBSimpleFeatures()
    mod_sel_1 = BoostLGBM()
    imp_sel_1 = NpPermutationImportanceEstimator()
    selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1)
    logging.debug('End creation composed selector...')

    logging.debug('Start creation reg_l1_0...')
    feats_reg_1 = LinearFeatures(output_categories=False)
    reg_1 = LinearL1CD()
    reg_l1_lvl0 = MLPipeline(
        [reg_1],
        pre_selection=selector_1,
        features_pipeline=feats_reg_1,
        post_selection=HighCorrRemoval(),
    )
    logging.debug('End creation reg_l1_0...')

    logging.debug('Start creation blending...')
    feats_reg_2 = LinearFeatures(output_categories=True)
    reg_2 = LinearLBFGS()
    reg_lvl1 = MLPipeline(
        [reg_2],
        pre_selection=None,
        features_pipeline=feats_reg_2,
        post_selection=HighCorrRemoval(corr_co=1),
    )
    logging.debug('End creation blending...')

    logging.debug('Start creation automl...')
    reader = PandasToPandasReader(Task('binary', ), samples=None, max_nan_rate=1, max_constant_rate=1)

    automl = AutoML(reader, [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ], skip_conn=False, blender=MeanBlender())
    logging.debug('End creation automl...')

    logging.debug('Start fit automl...')
    roles = {'target': 'TARGET',
             DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
             }

    oof_pred = automl.fit_predict(train, roles=roles)
    logging.debug('End fit automl...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'
                  .format(test_pred, test_pred.shape))

    # Keep only rows that have at least one non-NaN out-of-fold prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    print('OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))
    print('TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))
    finally:
        # Do not leave the artifact behind when any step above raises.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
Пример #7
0
    max_constant_rate=1,
    advanced_roles=True,
    drop_score_co=-1,
    n_jobs=1,
)
print("Reader created...")
# ======================================================================================
print("Create blender...")
blender = WeightedBlender()
print("Blender created...")
# ======================================================================================
print("Create AutoML...")
# Single level with two pipelines; `gbm_lvl0`, `reg_lvl0`, `reader` and `timer`
# are created outside the visible fragment.
automl = AutoML(
    reader=reader,
    levels=[[gbm_lvl0, reg_lvl0]],
    timer=timer,
    blender=blender,
    skip_conn=False,
)
print("AutoML created...")
# ======================================================================================
print("Fit predict...")
# `train` / `test` are defined outside the visible fragment.
oof_pred = automl.fit_predict(train, roles={"target": "TARGET"})
print("Finished fitting...")

test_pred = automl.predict(test)
print("Prediction for test data:\n{}\nShape = {}".format(
    test_pred, test_pred.shape))
# ======================================================================================
print("Check scores...")
# use only not nan
Пример #8
0
def test_multiclass_task_with_catboost():
    """Fit a single-level multiclass AutoML with BoostCB and LinearLBFGS.

    A third class is created by randomly overwriting about half of the
    ``TARGET`` values with 2. The random generator is seeded so that the
    synthetic target, and therefore the logged scores, are reproducible
    between runs. Log loss and one-vs-rest ROC AUC per class are logged for
    out-of-fold and test predictions.
    """
    # Seed before the random target is drawn; sibling tests use the same seed.
    np.random.seed(42)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Turn day offsets into date strings relative to a fixed base date.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    # Rows whose random draw exceeds 0.5 get class 2; the rest keep TARGET.
    data['TARGET'] = np.where(
        np.random.rand(data.shape[0]) > .5, 2, data['TARGET'].values)

    train, test = train_test_split(data, test_size=2000, random_state=42)
    # ======================================================================================
    logging.debug('Create timer...')
    timer = PipelineTimer(600, mode=2)
    logging.debug('Timer created...')
    # ======================================================================================
    logging.debug('Create selector...')
    timer_gbm = timer.get_task_timer('gbm')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostCB(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(
        feat_sel_0,
        mod_sel_0,
        imp_sel_0,
        cutoff=0,
    )
    logging.debug('Selector created...')
    # ======================================================================================
    logging.debug('Create gbms...')
    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer('gbm')
    timer_gbm_1 = timer.get_task_timer('gbm')

    gbm_0 = BoostCB(timer=timer_gbm_0, default_params={"devices": "0"})
    gbm_1 = BoostCB(timer=timer_gbm_1, default_params={"devices": "0"})

    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    # gbm_0 is paired with its tuner; gbm_1 is used as is.
    gbm_lvl0 = MLPipeline([(gbm_0, tuner_0), gbm_1],
                          pre_selection=selector_0,
                          features_pipeline=feats_gbm_0,
                          post_selection=None)
    logging.debug('Gbms created...')
    # ======================================================================================
    logging.debug('Create linear...')
    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe='auto')

    timer_reg = timer.get_task_timer('reg')
    reg_0 = LinearLBFGS(timer=timer_reg)

    reg_lvl0 = MLPipeline([reg_0],
                          pre_selection=None,
                          features_pipeline=feats_reg_0,
                          post_selection=None)
    logging.debug('Linear created...')
    # ======================================================================================
    logging.debug('Create reader...')
    task = Task(
        'multiclass',
        metric='crossentropy',
    )
    reader = PandasToPandasReader(task,
                                  samples=None,
                                  max_nan_rate=1,
                                  max_constant_rate=1,
                                  advanced_roles=True,
                                  drop_score_co=-1,
                                  n_jobs=1)
    logging.debug('Reader created...')
    # ======================================================================================
    logging.debug('Create blender...')
    blender = WeightedBlender()
    logging.debug('Blender created...')
    # ======================================================================================
    logging.debug('Create AutoML...')
    automl = AutoML(reader=reader,
                    levels=[[gbm_lvl0, reg_lvl0]],
                    timer=timer,
                    blender=blender,
                    skip_conn=False)
    logging.debug('AutoML created...')
    # ======================================================================================
    logging.debug('Fit predict...')
    oof_pred = automl.fit_predict(train, roles={'target': "TARGET"})
    logging.debug('Finished fitting...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))
    # ======================================================================================
    logging.debug('Check scores...')
    # Keep only rows that have at least one non-NaN out-of-fold prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('OOF score: {}'.format(
        log_loss(train['TARGET'].values[not_nan], oof_pred.data[not_nan])))
    logging.debug('TEST score: {}'.format(
        log_loss(test['TARGET'].values, test_pred.data)))
    # ======================================================================================
    # One-vs-rest ROC AUC for each of the three classes (0, 1, 2).
    for dat, df, name in zip([oof_pred, test_pred], [train, test],
                             ['train', 'test']):
        logging.debug('Check aucs {0}...'.format(name))
        for c in range(3):
            _sc = roc_auc_score((df['TARGET'].values == c).astype(np.float32),
                                dat.data[:, c])
            logging.debug('Cl {0} auc score: {1}'.format(c, _sc))
Пример #9
0
def test_different_losses_and_metrics():
    """Fit the same one-model AutoML under several task/loss/metric settings.

    For every (task parameters, target column) pair a fresh Task, reader,
    BoostLGBM pipeline and AutoML are built and fitted. Scores are computed
    with ``task.metric_func`` for out-of-fold and test predictions, and again
    after the fitted object is pickled and loaded back. The ``automl.pickle``
    artifact is removed even if a step fails.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn day offsets into date strings relative to a fixed base date.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)

    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    # (Task keyword arguments, target column) for every configuration checked.
    task_cases = [
        ({'name': 'binary'}, 'TARGET'),
        ({'name': 'binary', 'metric': roc_auc_score}, 'TARGET'),
        ({'name': 'reg', 'loss': 'mse', 'metric': 'r2'}, 'AMT_CREDIT'),
        ({'name': 'reg', 'loss': 'rmsle', 'metric': 'rmsle'}, 'AMT_CREDIT'),
        ({
            'name': 'reg',
            'loss': 'quantile',
            'loss_params': {'q': .9},
            'metric': 'quantile',
            'metric_params': {'q': .9},
        }, 'AMT_CREDIT'),
    ]
    pickle_path = 'automl.pickle'

    for task_params, target in task_cases:
        logging.debug('Create task..')
        task = Task(**task_params)
        logging.debug('Task created')

        logging.debug('Create reader...')
        reader = PandasToPandasReader(task, cv=5, random_state=1)
        logging.debug('Reader created')

        # pipeline 1 level parts
        logging.debug('Start creation pipeline_1...')
        pipe = LGBSimpleFeatures()

        logging.debug('\t ParamsTuner2 and Model2...')
        model2 = BoostLGBM(default_params={
            'learning_rate': 0.025,
            'num_leaves': 64,
            'seed': 2,
            'num_threads': 5
        })
        logging.debug('\t Tuner2 and model2 created')

        logging.debug('\t Pipeline1...')
        pipeline_lvl1 = MLPipeline(
            [model2],
            pre_selection=None,
            features_pipeline=pipe,
            post_selection=None)
        logging.debug('Pipeline1 created')

        logging.debug('Create AutoML pipeline...')
        automl = AutoML(reader, [
            [pipeline_lvl1],
        ], skip_conn=False)

        logging.debug('AutoML pipeline created...')

        logging.debug('Start AutoML pipeline fit_predict...')
        start_time = time.time()
        oof_pred = automl.fit_predict(train_data, roles={'target': target})
        logging.debug(
            'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
                time.time() - start_time))

        test_pred = automl.predict(test_data)
        logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
            test_pred, test_pred.shape))

        logging.debug('Check scores...')
        logging.debug('OOF score: {}'.format(
            task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
        logging.debug('TEST score: {}'.format(
            task.metric_func(test_data[target].values, test_pred.data[:, 0])))

        try:
            logging.debug('Pickle automl')
            with open(pickle_path, 'wb') as f:
                pickle.dump(automl, f)

            logging.debug('Load pickled automl')
            with open(pickle_path, 'rb') as f:
                automl = pickle.load(f)

            logging.debug('Predict loaded automl')
            test_pred = automl.predict(test_data)
            # Score with the task's own metric on the task's own target, so
            # the value is comparable with the 'TEST score' logged above
            # (regression cases must not be scored with ROC AUC on 'TARGET').
            logging.debug('TEST score, loaded: {}'.format(
                task.metric_func(test_data[target].values,
                                 test_pred.data[:, 0])))
        finally:
            # Do not leave the artifact behind when any step above raises.
            if os.path.exists(pickle_path):
                os.remove(pickle_path)