def test_manual_pipeline(sampled_app_train_test, sampled_app_roles,
                         binary_task):
    """Build an ``MLPipeline`` by hand, fit it, and round-trip it via pickle.

    The test wraps the train frame in a ``PandasDataset``, fits an
    ``ImportanceCutoffSelector`` on its own ``FoldsIterator``, fits a
    pipeline of two Optuna-tuned ``BoostLGBM`` models, and calls ``predict``
    both before and after a ``pickle.dump`` / ``pickle.load`` round trip.
    No values are asserted; the test passes when nothing raises.

    Args:
        sampled_app_train_test: Fixture unpacked as a ``(train, test)`` pair;
            only the train part is used here.
        sampled_app_roles: Fixture passed through ``roles_parser``.
        binary_task: Fixture passed to ``PandasDataset`` as ``task``.
    """
    train, _ = sampled_app_train_test

    pd_dataset = PandasDataset(train,
                               roles_parser(sampled_app_roles),
                               task=binary_task)

    # Separate iterator used only for fitting the feature selector.
    selector_iterator = FoldsIterator(pd_dataset, 1)

    pipe = LGBSimpleFeatures()

    model0 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 64,
        "seed": 0,
        "num_threads": 5,
    })

    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)

    selector.fit(selector_iterator)

    # Fresh features pipeline for the main models (the selector keeps its own).
    pipe = LGBSimpleFeatures()

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 128
    })

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={
        "learning_rate": 0.025,
        "num_leaves": 64
    })

    total = MLPipeline(
        [(model1, params_tuner1), (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )

    train_valid = FoldsIterator(pd_dataset)

    total.fit_predict(train_valid)

    total.predict(pd_dataset)

    # Round-trip through pickle. The file is removed in ``finally`` so a
    # failing load/predict does not leave it behind in the working directory.
    pickle_path = "automl.pickle"
    try:
        with open(pickle_path, "wb") as f:
            pickle.dump(total, f)

        with open(pickle_path, "rb") as f:
            total = pickle.load(f)

        total.predict(pd_dataset)
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
예제 #2
0
# Excerpt of a flat demo script: `data` is created before this view begins.
# Remove the DAYS_BIRTH and DAYS_EMPLOYED columns in place.
data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)

# NOTE(review): self-assignment; looks like a no-op — confirm it can be dropped.
data["TARGET"] = data["TARGET"]
print("Features modification finished")

print("Split data...")
# 80/20 split with a fixed random_state; indices are reset in place on each part.
train, test = train_test_split(data, test_size=0.2, random_state=42)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
print("Data splitted. Parts sizes: train_data = {}, test_data = {}".format(
    train.shape, test.shape))

print("Start creation selector_0...")
# Selector built from a features pipeline, a model and an importance estimator.
feat_sel_0 = LGBSimpleFeatures()
mod_sel_0 = BoostLGBM()
imp_sel_0 = ModelBasedImportanceEstimator()
selector_0 = ImportanceCutoffSelector(feat_sel_0,
                                      mod_sel_0,
                                      imp_sel_0,
                                      cutoff=0)
print("End creation selector_0...")

print("Start creation gbm_0...")
# Two default BoostLGBM models; only gbm_0 is paired with the Optuna tuner below.
feats_gbm_0 = LGBAdvancedPipeline()
gbm_0 = BoostLGBM()
gbm_1 = BoostLGBM()
tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
gbm_lvl0 = MLPipeline(
    [(gbm_0, tuner_0), gbm_1],
    pre_selection=selector_0,
    features_pipeline=feats_gbm_0,
예제 #3
0
def test_cutoff_selector_in_pipeline():
    """Run a two-level ``AutoML`` with an ``ImportanceCutoffSelector`` up front.

    Loads the sampled application CSV, derives string date columns, adds a
    constant and an all-NaN column, splits off 2000 stratified test rows, and
    fits an ``AutoML`` made of two ``MLPipeline`` levels. Feature scores and
    ROC AUC values are logged at debug level, and the fitted object is
    round-tripped through pickle and used to predict again. Nothing is
    asserted; the test passes when no step raises.
    """
    logging.debug("Load data...")
    data = pd.read_csv("./examples/data/sampled_app_train.csv")
    logging.debug("Data loaded")

    logging.debug("Features modification from user side...")
    # Day offsets are added to a fixed base date and stored as strings.
    data["BIRTH_DATE"] = (
        np.datetime64("2018-01-01") +
        data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str)
    # DAYS_EMPLOYED is clipped to at most 0 before the conversion.
    data["EMP_DATE"] = (np.datetime64("2018-01-01") +
                        np.clip(data["DAYS_EMPLOYED"], None, 0).astype(
                            np.dtype("timedelta64[D]"))).astype(str)

    # Degenerate columns added on purpose as extra input features.
    data["constant"] = 1
    data["allnan"] = np.nan

    data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)
    logging.debug("Features modification finished")

    logging.debug("Split data...")
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data["TARGET"],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        "Data splitted. Parts sizes: train_data = {}, test_data = {}".format(
            train_data.shape, test_data.shape))

    logging.debug("Create task...")
    task = Task("binary")
    logging.debug("Task created")

    logging.debug("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug("Reader created")

    # selector parts
    logging.debug("Create feature selector")
    model0 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 64,
        "seed": 42,
        "num_threads": 5,
    })
    pipe0 = LGBSimpleFeatures()
    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=10)
    logging.debug("Feature selector created")

    # pipeline 1 level parts
    logging.debug("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner1 and Model1...")
    params_tuner1 = OptunaTuner(n_trials=100, timeout=300)
    model1 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 128,
        "seed": 1,
        "num_threads": 5,
    })
    logging.debug("\t Tuner1 and model1 created")

    # model2 is used without a tuner (see the MLPipeline call below).
    logging.debug("\t ParamsTuner2 and Model2...")
    model2 = BoostLGBM(default_params={
        "learning_rate": 0.025,
        "num_leaves": 64,
        "seed": 2,
        "num_threads": 5,
    })
    logging.debug("\t Tuner2 and model2 created")

    logging.debug("\t Pipeline1...")
    pipeline_lvl1 = MLPipeline(
        [(model1, params_tuner1), model2],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug("Pipeline1 created")

    # pipeline 2 level parts
    logging.debug("Start creation pipeline_2...")
    pipe1 = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner and Model...")
    model = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "max_bin": 1024,
            "seed": 3,
            "num_threads": 5,
        },
        freeze_defaults=True,
    )
    logging.debug("\t Tuner and model created")

    logging.debug("\t Pipeline2...")
    pipeline_lvl2 = MLPipeline([model],
                               pre_selection=None,
                               features_pipeline=pipe1,
                               post_selection=None)
    logging.debug("Pipeline2 created")

    logging.debug("Create AutoML pipeline...")
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )

    logging.debug("AutoML pipeline created...")

    logging.debug("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"})
    logging.debug(
        "AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(
            time.time() - start_time))

    logging.debug("Feature importances of selector:\n{}".format(
        selector.get_features_score()))

    logging.debug("oof_pred:\n{}\nShape = {}".format(oof_pred, oof_pred.shape))

    logging.debug("Feature importances of top level algorithm:\n{}".format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        "Feature importances of lowest level algorithm - model 0:\n{}".format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        "Feature importances of lowest level algorithm - model 1:\n{}".format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug("Prediction for test data:\n{}\nShape = {}".format(
        test_pred, test_pred.shape))

    logging.debug("Check scores...")
    logging.debug("OOF score: {}".format(
        roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0])))
    logging.debug("TEST score: {}".format(
        roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))

    # Pickle round trip. Cleanup lives in ``finally`` so the file is removed
    # even when loading or predicting with the restored object fails.
    pickle_path = "automl.pickle"
    logging.debug("Pickle automl")
    try:
        with open(pickle_path, "wb") as f:
            pickle.dump(automl, f)

        logging.debug("Load pickled automl")
        with open(pickle_path, "rb") as f:
            automl = pickle.load(f)

        logging.debug("Predict loaded automl")
        test_pred = automl.predict(test_data)
        logging.debug("TEST score, loaded: {}".format(
            roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
예제 #4
0
def test_manual_pipeline():
    """Build, fit and pickle an ``MLPipeline`` on a hand-made ``PandasDataset``.

    Reads a handful of columns from the sampled application CSV, derives two
    datetime columns, assigns random fold labels, declares column roles
    explicitly, fits an ``ImportanceCutoffSelector`` and then a pipeline of two
    Optuna-tuned ``BoostLGBM`` models. Predictions, timings and feature scores
    are logged at debug level; the fitted pipeline is round-tripped through
    pickle. Nothing is asserted; the test passes when no step raises.
    """
    # Read data from file
    logging.debug('Read data from file')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv',
                       usecols=[
                           'TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
                           'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE', 'DAYS_BIRTH',
                           'DAYS_EMPLOYED'
                       ])

    # Fix dates and convert to date type
    logging.debug('Fix dates and convert to date type')
    data['BIRTH_DATE'] = np.datetime64(
        '2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    # DAYS_EMPLOYED is clipped to at most 0 before being added to the base date.
    data['EMP_DATE'] = np.datetime64('2018-01-01') + np.clip(
        data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Create folds
    logging.debug('Create folds')
    # NOTE(review): fold labels 0..4 come from the global NumPy RNG, which this
    # function does not seed, so the fold assignment varies between runs.
    data['__fold__'] = np.random.randint(0, 5, len(data))

    # Print data head
    logging.debug('Print data head')
    print(data.head())

    # Set roles for columns
    logging.debug('Set roles for columns')
    check_roles = {
        TargetRole(): 'TARGET',
        CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
        NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
        DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
        FoldsRole(): '__fold__'
    }

    # Create Task
    task = Task('binary')
    # Create PandasDataset
    logging.debug('Creating PandasDataset')
    start_time = time.time()
    pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)
    logging.debug(
        'PandasDataset created. Time = {:.3f} sec'.format(time.time() -
                                                          start_time))

    # Print pandas dataset feature roles
    logging.debug('Print pandas dataset feature roles')
    roles = pd_dataset.roles
    for role in roles:
        logging.debug('{}: {}'.format(role, roles[role]))

    # Feature selection part
    logging.debug('Feature selection part')
    selector_iterator = FoldsIterator(pd_dataset, 1)
    logging.debug('Selection iterator created')

    # The original also built an unused default BoostLGBM() here; only the
    # features pipeline is needed, the selector model is `model0` below.
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe and model created')

    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 0,
        'num_threads': 5
    })

    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)
    start_time = time.time()
    selector.fit(selector_iterator)
    logging.debug(
        'Feature selector fitted. Time = {:.3f} sec'.format(time.time() -
                                                            start_time))

    logging.debug('Feature selector scores:')
    logging.debug('\n{}'.format(selector.get_features_score()))

    # Build AutoML pipeline
    logging.debug('Start building AutoML pipeline')
    # Fresh features pipeline for the main models.
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe created')

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128
    })
    logging.debug('Tuner1 and model1 created')

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64
    })
    logging.debug('Tuner2 and model2 created')

    total = MLPipeline([(model1, params_tuner1), (model2, params_tuner2)],
                       pre_selection=selector,
                       features_pipeline=pipe,
                       post_selection=None)

    logging.debug('Finished building AutoML pipeline')

    # Create full train iterator
    logging.debug('Full train valid iterator creation')
    train_valid = FoldsIterator(pd_dataset)
    logging.debug('Full train valid iterator created')

    # Fit predict using pipeline
    logging.debug('Start AutoML pipeline fit_predict')
    start_time = time.time()
    pred = total.fit_predict(train_valid)
    logging.debug(
        'Fit_predict finished. Time = {:.3f} sec'.format(time.time() -
                                                         start_time))

    # Check preds
    logging.debug('Preds:')
    logging.debug('\n{}'.format(pred))
    logging.debug('Preds.shape = {}'.format(pred.shape))

    # Predict full train dataset
    logging.debug('Predict full train dataset')
    start_time = time.time()
    train_pred = total.predict(pd_dataset)
    logging.debug('Predict finished. Time = {:.3f} sec'.format(time.time() -
                                                               start_time))
    logging.debug('Preds:')
    logging.debug('\n{}'.format(train_pred))
    logging.debug('Preds.shape = {}'.format(train_pred.shape))

    # Pickle round trip. Cleanup lives in ``finally`` so the file is removed
    # even when loading or predicting with the restored pipeline fails.
    pickle_path = 'automl.pickle'
    logging.debug('Pickle automl')
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(total, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            total = pickle.load(f)

        logging.debug('Predict loaded automl')
        train_pred = total.predict(pd_dataset)
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)

    # Check preds feature names
    logging.debug('Preds features: {}'.format(train_pred.features))

    # Check model feature scores (read from the original model objects, not
    # from the unpickled pipeline)
    logging.debug('Feature scores for model_1:\n{}'.format(
        model1.get_features_score()))
    logging.debug('Feature scores for model_2:\n{}'.format(
        model2.get_features_score()))
예제 #5
0
파일: demo5.py 프로젝트: zergey/LightAutoML
def test_boostlgbm_and_linearlbfgs_in_one_automl_pipeline():
    """Combine LightGBM and linear pipelines in one two-level ``AutoML``.

    The first level holds three ``MLPipeline`` objects (tuned/untuned
    ``BoostLGBM``, ``LinearLBFGS``, and ``LinearL1CD`` behind an iterative
    selector); the second level holds a single ``LinearLBFGS`` pipeline, and a
    ``MeanBlender`` is passed to ``AutoML``. ROC AUC for out-of-fold and test
    predictions is printed, then the fitted object is round-tripped through
    pickle and used to predict again. Nothing is asserted; the test passes
    when no step raises.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Day offsets are added to a fixed base date and stored as strings.
    data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                        ).astype(str)

    # Same date in every row; given a DatetimeRole(base_date=True) below.
    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns added on purpose as extra input features.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    logging.debug('Data splitted. Parts sizes: train_data = {}, test_data = {}'
                  .format(train.shape, test.shape))

    logging.debug('Start creation selector_0...')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM()
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0)
    logging.debug('End creation selector_0...')

    logging.debug('Start creation gbm_0...')
    feats_gbm_0 = LGBAdvancedPipeline()
    gbm_0 = BoostLGBM()
    gbm_1 = BoostLGBM()
    tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
    # Only gbm_0 is paired with a tuner; gbm_1 is passed bare.
    gbm_lvl0 = MLPipeline(
        [(gbm_0, tuner_0), gbm_1],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None)
    logging.debug('End creation gbm_0...')

    logging.debug('Start creation reg_0...')
    feats_reg_0 = LinearFeatures(output_categories=True)
    reg_0 = LinearLBFGS()
    reg_lvl0 = MLPipeline(
        [reg_0],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=HighCorrRemoval(corr_co=1))
    logging.debug('End creation reg_0...')

    logging.debug('Start creation composed selector...')
    feat_sel_1 = LGBSimpleFeatures()
    mod_sel_1 = BoostLGBM()
    imp_sel_1 = NpPermutationImportanceEstimator()
    selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1)
    logging.debug('End creation composed selector...')

    logging.debug('Start creation reg_l1_0...')
    feats_reg_1 = LinearFeatures(output_categories=False)
    reg_1 = LinearL1CD()
    reg_l1_lvl0 = MLPipeline(
        [reg_1],
        pre_selection=selector_1,
        features_pipeline=feats_reg_1,
        post_selection=HighCorrRemoval())
    logging.debug('End creation reg_l1_0...')

    logging.debug('Start creation blending...')
    feats_reg_2 = LinearFeatures(output_categories=True)
    reg_2 = LinearLBFGS()
    reg_lvl1 = MLPipeline(
        [reg_2],
        pre_selection=None,
        features_pipeline=feats_reg_2,
        post_selection=HighCorrRemoval(corr_co=1))
    logging.debug('End creation blending...')

    logging.debug('Start creation automl...')
    reader = PandasToPandasReader(Task('binary'), samples=None, max_nan_rate=1, max_constant_rate=1)

    automl = AutoML(reader, [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ], skip_conn=False, blender=MeanBlender())
    logging.debug('End creation automl...')

    logging.debug('Start fit automl...')
    roles = {'target': 'TARGET',
             DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
             }

    oof_pred = automl.fit_predict(train, roles=roles)
    logging.debug('End fit automl...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'
                  .format(test_pred, test_pred.shape))

    # Keep only rows whose out-of-fold prediction has at least one non-NaN value.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    print('OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))
    print('TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    # Pickle round trip. Cleanup lives in ``finally`` so the file is removed
    # even when loading or predicting with the restored object fails.
    pickle_path = 'automl.pickle'
    logging.debug('Pickle automl')
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)
        logging.debug('TEST score, loaded: {}'.format(roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
예제 #6
0
# Excerpt of a flat demo script: builds a composed feature selector and starts
# assembling the first-level pipeline.
print("Create feature selector")
# Two identically configured models, one per sub-selector.
model01 = BoostLGBM(default_params={
    "learning_rate": 0.05,
    "num_leaves": 64,
    "seed": 42,
    "num_threads": 5,
})
model02 = BoostLGBM(default_params={
    "learning_rate": 0.05,
    "num_leaves": 64,
    "seed": 42,
    "num_threads": 5,
})
# The same features pipeline instance is passed to both sub-selectors.
pipe0 = LGBSimpleFeatures()
pie = NpPermutationImportanceEstimator()
pie1 = ModelBasedImportanceEstimator()
# sel1 uses the model-based estimator, sel2 the permutation estimator.
sel1 = ImportanceCutoffSelector(pipe0, model01, pie1, cutoff=0)
sel2 = NpIterativeFeatureSelector(pipe0,
                                  model02,
                                  pie,
                                  feature_group_size=1,
                                  max_features_cnt_in_result=15)
# presumably the sub-selectors are applied in list order — TODO confirm
selector = ComposedSelector([sel1, sel2])
print("Feature selector created")

# pipeline 1 level parts
print("Start creation pipeline_1...")
pipe = LGBSimpleFeatures()

print("\t ParamsTuner1 and Model1...")
params_tuner1 = OptunaTuner(n_trials=100, timeout=100)
예제 #7
0
def test_multiclass_task_with_catboost():
    """Fit a single-level multiclass ``AutoML`` with CatBoost and a linear model.

    The binary ``TARGET`` column is turned into a three-class target by
    relabelling a random half of the rows as class 2. A ``PipelineTimer``
    supplies task timers to the ``BoostCB`` and ``LinearLBFGS`` models, and
    the two pipelines are combined with a ``WeightedBlender``. Log loss and
    per-class one-vs-rest ROC AUC are logged at debug level. Nothing is
    asserted; the test passes when no step raises.
    """
    # Seed the global NumPy RNG so the random relabelling below (and therefore
    # the logged scores) is reproducible; same seed as the sibling demos.
    np.random.seed(42)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Day offsets are added to a fixed base date and stored as strings.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns added on purpose as extra input features.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    # Rows where the uniform draw exceeds 0.5 become class 2; the rest keep
    # their original TARGET value.
    data['TARGET'] = np.where(
        np.random.rand(data.shape[0]) > .5, 2, data['TARGET'].values)

    train, test = train_test_split(data, test_size=2000, random_state=42)
    # ======================================================================================
    logging.debug('Create timer...')
    timer = PipelineTimer(600, mode=2)
    logging.debug('Timer created...')
    # ======================================================================================
    logging.debug('Create selector...')
    timer_gbm = timer.get_task_timer('gbm')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostCB(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(
        feat_sel_0,
        mod_sel_0,
        imp_sel_0,
        cutoff=0,
    )
    logging.debug('Selector created...')
    # ======================================================================================
    logging.debug('Create gbms...')
    # The selector's importance estimator is also handed to the features pipeline.
    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer('gbm')
    timer_gbm_1 = timer.get_task_timer('gbm')

    gbm_0 = BoostCB(timer=timer_gbm_0, default_params={"devices": "0"})
    gbm_1 = BoostCB(timer=timer_gbm_1, default_params={"devices": "0"})

    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    # Only gbm_0 is paired with a tuner; gbm_1 is passed bare.
    gbm_lvl0 = MLPipeline([(gbm_0, tuner_0), gbm_1],
                          pre_selection=selector_0,
                          features_pipeline=feats_gbm_0,
                          post_selection=None)
    logging.debug('Gbms created...')
    # ======================================================================================
    logging.debug('Create linear...')
    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe='auto')

    timer_reg = timer.get_task_timer('reg')
    reg_0 = LinearLBFGS(timer=timer_reg)

    reg_lvl0 = MLPipeline([reg_0],
                          pre_selection=None,
                          features_pipeline=feats_reg_0,
                          post_selection=None)
    logging.debug('Linear created...')
    # ======================================================================================
    logging.debug('Create reader...')
    reader = PandasToPandasReader(Task(
        'multiclass',
        metric='crossentropy',
    ),
                                  samples=None,
                                  max_nan_rate=1,
                                  max_constant_rate=1,
                                  advanced_roles=True,
                                  drop_score_co=-1,
                                  n_jobs=1)
    logging.debug('Reader created...')
    # ======================================================================================
    logging.debug('Create blender...')
    blender = WeightedBlender()
    logging.debug('Blender created...')
    # ======================================================================================
    logging.debug('Create AutoML...')
    automl = AutoML(reader=reader,
                    levels=[[gbm_lvl0, reg_lvl0]],
                    timer=timer,
                    blender=blender,
                    skip_conn=False)
    logging.debug('AutoML created...')
    # ======================================================================================
    logging.debug('Fit predict...')
    oof_pred = automl.fit_predict(train, roles={'target': "TARGET"})
    logging.debug('Finished fitting...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))
    # ======================================================================================
    logging.debug('Check scores...')
    # Keep only rows whose out-of-fold prediction has at least one non-NaN value.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('OOF score: {}'.format(
        log_loss(train['TARGET'].values[not_nan], oof_pred.data[not_nan])))
    logging.debug('TEST score: {}'.format(
        log_loss(test['TARGET'].values, test_pred.data)))
    # ======================================================================================
    # One-vs-rest ROC AUC per class, using prediction column c for class c.
    for dat, df, name in zip([oof_pred, test_pred], [train, test],
                             ['train', 'test']):
        logging.debug('Check aucs {0}...'.format(name))
        for c in range(3):
            _sc = roc_auc_score((df['TARGET'].values == c).astype(np.float32),
                                dat.data[:, c])
            logging.debug('Cl {0} auc score: {1}'.format(c, _sc))
예제 #8
0
파일: demo3.py 프로젝트: zergey/LightAutoML
def test_composed_selector_cutoff_and_iterative():
    """Run a two-level ``AutoML`` behind a ``ComposedSelector``.

    The selector chains an ``ImportanceCutoffSelector`` (model-based
    importances) with an ``NpIterativeFeatureSelector`` (permutation
    importances). It is used as ``pre_selection`` for the first-level
    pipeline; a second-level pipeline holds a single ``BoostLGBM``. Feature
    scores and ROC AUC values are logged at debug level, then the fitted
    object is round-tripped through pickle and used to predict again.
    Nothing is asserted; the test passes when no step raises.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Day offsets are added to a fixed base date and stored as strings.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # Degenerate columns added on purpose as extra input features.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    logging.debug('Create task..')
    task = Task('binary')
    logging.debug('Task created')

    logging.debug('Create reader...')
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug('Reader created')

    # selector parts
    logging.debug('Create feature selector')
    # Two identically configured models, one per sub-selector.
    model01 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': 5
    })
    model02 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': 5
    })
    # The same features pipeline instance is passed to both sub-selectors.
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    pie1 = ModelBasedImportanceEstimator()
    sel1 = ImportanceCutoffSelector(pipe0, model01, pie1, cutoff=0)
    sel2 = NpIterativeFeatureSelector(pipe0,
                                      model02,
                                      pie,
                                      feature_group_size=1,
                                      max_features_cnt_in_result=15)
    selector = ComposedSelector([sel1, sel2])
    logging.debug('Feature selector created')

    # pipeline 1 level parts
    logging.debug('Start creation pipeline_1...')
    pipe = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner1 and Model1...')
    params_tuner1 = OptunaTuner(n_trials=100, timeout=100)
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': 5
    })
    logging.debug('\t Tuner1 and model1 created')

    # model2 is used without a tuner (see the MLPipeline call below).
    logging.debug('\t ParamsTuner2 and Model2...')
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': 5
    })
    logging.debug('\t Tuner2 and model2 created')

    logging.debug('\t Pipeline1...')
    pipeline_lvl1 = MLPipeline([(model1, params_tuner1), model2],
                               pre_selection=selector,
                               features_pipeline=pipe,
                               post_selection=None)
    logging.debug('Pipeline1 created')

    # pipeline 2 level parts
    logging.debug('Start creation pipeline_2...')
    pipe1 = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner and Model...')
    model = BoostLGBM(
        default_params={
            'learning_rate': 0.05,
            'num_leaves': 64,
            'max_bin': 1024,
            'seed': 3,
            'num_threads': 5
        })
    logging.debug('\t Tuner and model created')

    logging.debug('\t Pipeline2...')
    pipeline_lvl2 = MLPipeline([model],
                               pre_selection=None,
                               features_pipeline=pipe1,
                               post_selection=None)
    logging.debug('Pipeline2 created')

    logging.debug('Create AutoML pipeline...')
    automl = AutoML(reader, [
        [pipeline_lvl1],
        [pipeline_lvl2],
    ],
                    skip_conn=False)

    logging.debug('AutoML pipeline created...')

    logging.debug('Start AutoML pipeline fit_predict...')
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={'target': 'TARGET'})
    logging.debug(
        'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
            time.time() - start_time))

    logging.debug('Feature importances of selector:\n{}'.format(
        selector.get_features_score()))

    logging.debug('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    logging.debug('Feature importances of top level algorithm:\n{}'.format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 0:\n{}'.format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 1:\n{}'.format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train_data['TARGET'].values, oof_pred.data[:, 0])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

    # Pickle round trip. Cleanup lives in ``finally`` so the file is removed
    # even when loading or predicting with the restored object fails.
    pickle_path = 'automl.pickle'
    logging.debug('Pickle automl')
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
예제 #9
0
# Excerpt of a flat demo script: `pd_dataset` is created before this view begins.
print("Feature selection part")
# Separate iterator used only for fitting the feature selector.
selector_iterator = FoldsIterator(pd_dataset, 1)
print("Selection iterator created")

# NOTE(review): `model` is not used anywhere in the visible excerpt — confirm
# whether later code needs it.
model = BoostLGBM()
pipe = LGBSimpleFeatures()
print("Pipe and model created")

# Model handed to the importance-cutoff selector below.
model0 = BoostLGBM(default_params={
    "learning_rate": 0.05,
    "num_leaves": 64,
    "seed": 0,
    "num_threads": 5,
})

mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)
# Time the selector fit and report it.
start_time = time.time()
selector.fit(selector_iterator)
print("Feature selector fitted. Time = {:.3f} sec".format(time.time() -
                                                          start_time))

print("Feature selector scores:")
print("\n{}".format(selector.get_features_score()))

# Build AutoML pipeline
print("Start building AutoML pipeline")
# `pipe` is rebound to a fresh features pipeline for the main models.
pipe = LGBSimpleFeatures()
print("Pipe created")

params_tuner1 = OptunaTuner(n_trials=10, timeout=300)