Пример #1
0
    def __init__(
        self,
        propensity_learner: Optional[AutoML] = None,
        mean_outcome_learner: Optional[AutoML] = None,
        effect_learner: Optional[AutoML] = None,
        base_task: Optional[Task] = Task("binary"),
        timeout: Optional[int] = None,
        cpu_limit: int = 4,
        gpu_ids: Optional[str] = "all",
    ):
        """Validate the supplied learners and create defaults for the missing ones.

        Args:
            propensity_learner: AutoML model whose task must be 'binary'. When `None`,
                a `TabularAutoML` with a 'binary' task is created.
            mean_outcome_learner: AutoML model for the mean outcome. When `None`,
                a `TabularAutoML` built on `base_task` is created.
            effect_learner: AutoML model whose task must be 'reg'. When `None`,
                a `TabularAutoML` with a 'reg' task is created.
            base_task: Task for the default mean-outcome learner. It is replaced by the
                task of `mean_outcome_learner` when that learner is supplied.
            timeout: Timeout passed to the parent initializer. When no learner at all
                is supplied, every default learner gets one third of it.
            cpu_limit: CPU limit passed to the parent initializer.
            gpu_ids: GPU IDs passed to the parent initializer.

        Raises:
            RuntimeError: If `propensity_learner` is not a 'binary' task, if
                `effect_learner` is not a 'reg' task, or if both
                `mean_outcome_learner` and `base_task` are `None`.

        """
        if propensity_learner is not None:
            if self._get_task(propensity_learner).name != "binary":
                raise RuntimeError("Task of 'propensity_learner' must be 'binary'")

        if base_task is None and mean_outcome_learner is None:
            raise RuntimeError("Must specify 'mean_outcome_learner' or base_task")

        if effect_learner is not None:
            if self._get_task(effect_learner).name != "reg":
                raise RuntimeError("Task of effect_learner must be 'reg'")

        super().__init__(base_task, timeout, cpu_limit, gpu_ids)

        # The total timeout is split in three only when every learner is a default one.
        nothing_supplied = all(
            learner is None for learner in (propensity_learner, mean_outcome_learner, effect_learner)
        )
        if nothing_supplied and timeout is not None:
            default_timeout = timeout / 3
        else:
            default_timeout = None

        self.propensity_learner: AutoML = (
            TabularAutoML(task=Task("binary"), timeout=default_timeout)
            if propensity_learner is None
            else propensity_learner
        )

        if mean_outcome_learner is None:
            # `base_task` cannot be None here: that combination was rejected above.
            self.mean_outcome_learner: AutoML = TabularAutoML(task=base_task, timeout=default_timeout)
        else:
            self.mean_outcome_learner = mean_outcome_learner
            # A user-supplied learner dictates the base task.
            self.base_task = self._get_task(mean_outcome_learner)

        self.effect_learner: AutoML = (
            TabularAutoML(task=Task("reg"), timeout=default_timeout)
            if effect_learner is None
            else effect_learner
        )
Пример #2
0
def test_time_series_iterator_and_multiprocessed_inference():
    """Fit ``TabularAutoML`` with a time-series CV iterator and predict from a CSV file.

    The training set is passed as a dict of ``np.ndarray`` and the validation splits
    come from a ``TimeSeriesIterator`` built on ``EMP_DATE``. Test rows are written to
    a temporary CSV which ``automl.predict`` reads with ``batch_size=100`` and
    ``n_jobs=4``; the CSV is removed afterwards even when prediction fails.
    """
    # Local import so the temp-file cleanup below does not rely on a module-level import.
    import os

    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before being turned into a date.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    # Create the time series iterator that is passed as cv_iter.
    # `pd.to_datetime` is used instead of a unit-less `astype(np.datetime64)` cast,
    # which recent pandas versions reject.
    cv_iter = TimeSeriesIterator(pd.to_datetime(train['EMP_DATE']),
                                 n_splits=5,
                                 sorted_kfold=False)

    # The train dataset may be passed as a dict of np.ndarray.
    train = {
        'data': train[['AMT_CREDIT', 'AMT_ANNUITY']].values,
        'target': train['TARGET'].values
    }

    task = Task('binary', )

    automl = TabularAutoML(
        task=task,
        timeout=200,
    )
    oof_pred = automl.fit_predict(train,
                                  train_features=['AMT_CREDIT', 'AMT_ANNUITY'],
                                  cv_iter=cv_iter)

    # Prediction can be made directly from a file; remove the file when done.
    test_csv_path = 'temp_test_data.csv'
    test.to_csv(test_csv_path, index=False)
    try:
        test_pred = automl.predict(test_csv_path, batch_size=100, n_jobs=4)
    finally:
        os.remove(test_csv_path)

    logging.debug('Check scores...')
    oof_prediction = oof_pred.data[:, 0]
    # Score only the rows that actually received an out-of-fold prediction.
    not_empty = np.logical_not(np.isnan(oof_prediction))
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train['target'][not_empty], oof_prediction[not_empty])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
Пример #3
0
def test_tabular_utilized_preset():
    """Fit ``TabularUtilizedAutoML`` on a binary task, score it and round-trip it through pickle.

    The OOF score is computed only on rows that have at least one non-NaN
    out-of-fold prediction. After pickling and unpickling, the model predicts the
    test set again and the pickle file is removed.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before being turned into a date.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                        ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    roles = {'target': 'TARGET',
             DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
             }

    task = Task('binary', )

    automl = TabularUtilizedAutoML(task=task, timeout=600, )
    oof_pred = automl.fit_predict(train, roles=roles)
    test_pred = automl.predict(test)

    logging.debug('Check scores...')
    # use only not nan
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    # Column 0 is selected so the score receives a 1-D vector, exactly like the
    # TEST score below.
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train['TARGET'].values[not_nan], oof_pred.data[not_nan][:, 0])))
    logging.debug('TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))
    logging.debug('Pickle automl')
    with open('automl.pickle', 'wb') as f:
        pickle.dump(automl, f)

    logging.debug('Load pickled automl')
    with open('automl.pickle', 'rb') as f:
        automl = pickle.load(f)

    logging.debug('Predict loaded automl')
    test_pred = automl.predict(test)
    logging.debug('TEST score, loaded: {}'.format(roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))

    os.remove('automl.pickle')
Пример #4
0
def test_manual_pipeline():
    """Assemble an ``MLPipeline`` by hand, fit it on folds and round-trip it through pickle.

    Steps: read a subset of columns, derive two date columns, assign random folds,
    wrap the frame into a ``PandasDataset``, fit an importance-cutoff selector,
    fit two tuned ``BoostLGBM`` models, predict on the full dataset, then pickle,
    reload and predict again.
    """
    csv_path = '../example_data/test_data_files/sampled_app_train.csv'
    wanted_columns = [
        'TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
        'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE', 'DAYS_BIRTH',
        'DAYS_EMPLOYED'
    ]

    # Load only the columns this test needs.
    logging.debug('Read data from file')
    data = pd.read_csv(csv_path, usecols=wanted_columns)

    # Turn the day offsets into dates relative to 2018-01-01.
    logging.debug('Fix dates and convert to date type')
    day_delta = np.dtype('timedelta64[D]')
    data['BIRTH_DATE'] = np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(day_delta)
    employed_days = np.clip(data['DAYS_EMPLOYED'], None, 0)
    data['EMP_DATE'] = np.datetime64('2018-01-01') + employed_days.astype(day_delta)
    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Random fold id in [0, 5) for every row.
    logging.debug('Create folds')
    row_count = len(data)
    data['__fold__'] = np.random.randint(0, 5, row_count)

    logging.debug('Print data head')
    print(data.head())

    # Map roles to the columns that carry them.
    logging.debug('Set roles for columns')
    check_roles = {
        TargetRole(): 'TARGET',
        CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
        NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
        DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
        FoldsRole(): '__fold__'
    }

    task = Task('binary')

    # Build the dataset and report how long it took.
    logging.debug('Creating PandasDataset')
    start_time = time.time()
    pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)
    elapsed = time.time() - start_time
    logging.debug('PandasDataset created. Time = {:.3f} sec'.format(elapsed))

    # Dump the feature roles the dataset ended up with.
    logging.debug('Print pandas dataset feature roles')
    roles = pd_dataset.roles
    for role in roles:
        logging.debug('{}: {}'.format(role, roles[role]))

    # Feature selection.
    logging.debug('Feature selection part')
    selector_iterator = FoldsIterator(pd_dataset, 1)
    logging.debug('Selection iterator created')

    model = BoostLGBM()  # constructed but not used further in this test
    selector_features = LGBSimpleFeatures()
    logging.debug('Pipe and model created')

    selector_params = {
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 0,
        'num_threads': 5
    }
    selector_model = BoostLGBM(default_params=selector_params)

    importance_estimator = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(selector_features, selector_model, importance_estimator, cutoff=10)
    start_time = time.time()
    selector.fit(selector_iterator)
    elapsed = time.time() - start_time
    logging.debug('Feature selector fitted. Time = {:.3f} sec'.format(elapsed))

    logging.debug('Feature selector scores:')
    logging.debug('\n{}'.format(selector.get_features_score()))

    # Main pipeline: two boosted models, each with its own Optuna tuner.
    logging.debug('Start building AutoML pipeline')
    pipeline_features = LGBSimpleFeatures()
    logging.debug('Pipe created')

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={'learning_rate': 0.05, 'num_leaves': 128})
    logging.debug('Tuner1 and model1 created')

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={'learning_rate': 0.025, 'num_leaves': 64})
    logging.debug('Tuner2 and model2 created')

    tuned_models = [(model1, params_tuner1), (model2, params_tuner2)]
    total = MLPipeline(tuned_models,
                       pre_selection=selector,
                       features_pipeline=pipeline_features,
                       post_selection=None)

    logging.debug('Finished building AutoML pipeline')

    # Iterator over all folds of the dataset.
    logging.debug('Full train valid iterator creation')
    train_valid = FoldsIterator(pd_dataset)
    logging.debug('Full train valid iterator created')

    # Fit the pipeline and collect its predictions.
    logging.debug('Start AutoML pipeline fit_predict')
    start_time = time.time()
    pred = total.fit_predict(train_valid)
    elapsed = time.time() - start_time
    logging.debug('Fit_predict finished. Time = {:.3f} sec'.format(elapsed))

    logging.debug('Preds:')
    logging.debug('\n{}'.format(pred))
    logging.debug('Preds.shape = {}'.format(pred.shape))

    # Predict on the full dataset with the fitted pipeline.
    logging.debug('Predict full train dataset')
    start_time = time.time()
    train_pred = total.predict(pd_dataset)
    elapsed = time.time() - start_time
    logging.debug('Predict finished. Time = {:.3f} sec'.format(elapsed))
    logging.debug('Preds:')
    logging.debug('\n{}'.format(train_pred))
    logging.debug('Preds.shape = {}'.format(train_pred.shape))

    # Pickle round trip.
    pickle_path = 'automl.pickle'
    logging.debug('Pickle automl')
    with open(pickle_path, 'wb') as dump_file:
        pickle.dump(total, dump_file)

    logging.debug('Load pickled automl')
    with open(pickle_path, 'rb') as load_file:
        total = pickle.load(load_file)

    logging.debug('Predict loaded automl')
    train_pred = total.predict(pd_dataset)
    os.remove(pickle_path)

    logging.debug('Preds features: {}'.format(train_pred.features))

    # Feature scores of both fitted models.
    model1_scores = model1.get_features_score()
    logging.debug('Feature scores for model_1:\n{}'.format(model1_scores))
    model2_scores = model2.get_features_score()
    logging.debug('Feature scores for model_2:\n{}'.format(model2_scores))
Пример #5
0
# Script fragment: prepares the data split, roles and task for the AutoML run below.
# `data` is created before this fragment (not visible here).

# Every row gets the same report date.
data["report_dt"] = np.datetime64("2018-01-01")

# Add one constant column and one all-NaN column.
data["constant"] = 1
data["allnan"] = np.nan

# The raw day-offset columns are removed in place.
data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)

# Hold out 2000 rows for testing, with a fixed seed.
train, test = train_test_split(data, test_size=2000, random_state=42)

# Roles: "TARGET" is the target column; "report_dt" is a datetime column
# configured with base_date=True, no seasonality and base_feats=False.
roles = {
    "target": "TARGET",
    DatetimeRole(base_date=True, seasonality=(), base_feats=False):
    "report_dt",
}

# Binary classification task.
task = Task("binary", )

automl = TabularAutoML(
    task=task,
    timeout=600,
    general_params={
        "use_algos": [
            [
                "linear_l2",
                "lgb",
            ],
            ["linear_l2", "lgb"],
        ],
        "nested_cv": True,
        "skip_conn": True,
    },
Пример #6
0
def test_boostlgbm_and_linearlbfgs_in_one_automl_pipeline():
    """Build a two-level ``AutoML`` from boosting and linear pipelines and verify it end to end.

    Level one holds a tuned boosting pipeline, an LBFGS linear pipeline and an
    L1 linear pipeline. Level two holds a single LBFGS linear pipeline, and the
    result goes through a ``MeanBlender``. The fitted model is scored, pickled,
    reloaded and scored again.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    day_delta = np.dtype('timedelta64[D]')
    birth_dates = np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(day_delta)
    data['BIRTH_DATE'] = birth_dates.astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before being turned into a date.
    employment_dates = np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(day_delta)
    data['EMP_DATE'] = employment_dates.astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # The target column is re-assigned to itself, as in the original scenario.
    data['TARGET'] = data['TARGET']
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    for part in (train, test):
        part.reset_index(drop=True, inplace=True)
    split_message = 'Data splitted. Parts sizes: train_data = {}, test_data = {}'
    logging.debug(split_message.format(train.shape, test.shape))

    # Importance-cutoff selector used by the boosting pipeline.
    logging.debug('Start creation selector_0...')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM()
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0)
    logging.debug('End creation selector_0...')

    # Boosting pipeline: one tuned model and one untuned model.
    logging.debug('Start creation gbm_0...')
    feats_gbm_0 = LGBAdvancedPipeline()
    gbm_0 = BoostLGBM()
    gbm_1 = BoostLGBM()
    tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
    gbm_algos = [(gbm_0, tuner_0), gbm_1]
    gbm_lvl0 = MLPipeline(
        gbm_algos,
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None,
    )
    logging.debug('End creation gbm_0...')

    # Linear LBFGS pipeline.
    logging.debug('Start creation reg_0...')
    feats_reg_0 = LinearFeatures(output_categories=True)
    reg_0 = LinearLBFGS()
    reg_lvl0 = MLPipeline(
        [reg_0],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=HighCorrRemoval(corr_co=1),
    )
    logging.debug('End creation reg_0...')

    # Iterative selector driven by permutation importance.
    logging.debug('Start creation composed selector...')
    feat_sel_1 = LGBSimpleFeatures()
    mod_sel_1 = BoostLGBM()
    imp_sel_1 = NpPermutationImportanceEstimator()
    selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1)
    logging.debug('End creation composed selector...')

    # Linear L1 pipeline on top of the iterative selector.
    logging.debug('Start creation reg_l1_0...')
    feats_reg_1 = LinearFeatures(output_categories=False)
    reg_1 = LinearL1CD()
    reg_l1_lvl0 = MLPipeline(
        [reg_1],
        pre_selection=selector_1,
        features_pipeline=feats_reg_1,
        post_selection=HighCorrRemoval(),
    )
    logging.debug('End creation reg_l1_0...')

    # Second-level linear pipeline.
    logging.debug('Start creation blending...')
    feats_reg_2 = LinearFeatures(output_categories=True)
    reg_2 = LinearLBFGS()
    reg_lvl1 = MLPipeline(
        [reg_2],
        pre_selection=None,
        features_pipeline=feats_reg_2,
        post_selection=HighCorrRemoval(corr_co=1),
    )
    logging.debug('End creation blending...')

    logging.debug('Start creation automl...')
    reader = PandasToPandasReader(Task('binary', ), samples=None, max_nan_rate=1, max_constant_rate=1)

    levels = [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ]
    automl = AutoML(reader, levels, skip_conn=False, blender=MeanBlender())
    logging.debug('End creation automl...')

    logging.debug('Start fit automl...')
    roles = {'target': 'TARGET',
             DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
             }

    oof_pred = automl.fit_predict(train, roles=roles)
    logging.debug('End fit automl...')

    test_pred = automl.predict(test)
    prediction_message = 'Prediction for test data:\n{}\nShape = {}'
    logging.debug(prediction_message.format(test_pred, test_pred.shape))

    # Rows with at least one non-NaN out-of-fold prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    target_column = roles['target']
    oof_score = roc_auc_score(train[target_column].values[not_nan], oof_pred.data[not_nan][:, 0])
    print('OOF score: {}'.format(oof_score))
    test_score = roc_auc_score(test[target_column].values, test_pred.data[:, 0])
    print('TEST score: {}'.format(test_score))

    # Pickle round trip.
    pickle_path = 'automl.pickle'
    logging.debug('Pickle automl')
    with open(pickle_path, 'wb') as dump_file:
        pickle.dump(automl, dump_file)

    logging.debug('Load pickled automl')
    with open(pickle_path, 'rb') as load_file:
        automl = pickle.load(load_file)

    logging.debug('Predict loaded automl')
    test_pred = automl.predict(test)
    loaded_score = roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])
    logging.debug('TEST score, loaded: {}'.format(loaded_score))

    os.remove(pickle_path)
Пример #7
0
# Script fragment: builds the linear pipeline, the reader and the blender.
# `timer` is created before this fragment (not visible here).
print("Create linear...")
feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto")

# The linear model receives the task timer registered under the key "reg".
timer_reg = timer.get_task_timer("reg")
reg_0 = LinearLBFGS(timer=timer_reg)

# Single-model pipeline with no pre- or post-selection.
reg_lvl0 = MLPipeline([reg_0],
                      pre_selection=None,
                      features_pipeline=feats_reg_0,
                      post_selection=None)
print("Linear created...")
# ======================================================================================
print("Create reader...")
# Reader for a multiclass task scored with the "crossentropy" metric.
reader = PandasToPandasReader(
    Task(
        "multiclass",
        metric="crossentropy",  # metric_params = {'multi_class': 'ovr'}
    ),
    samples=None,
    max_nan_rate=1,
    max_constant_rate=1,
    advanced_roles=True,
    drop_score_co=-1,
    n_jobs=1,
)
print("Reader created...")
# ======================================================================================
print("Create blender...")
blender = WeightedBlender()
print("Blender created...")
# ======================================================================================
print("Create AutoML...")
Пример #8
0
# Script fragment: builds the second-level linear pipeline and the AutoML object.
# `gbm_lvl0`, `reg_lvl0` and `reg_l1_lvl0` are created before this fragment
# (not visible here).
print("End creation reg_l1_0...")

print("Start creation blending...")
feats_reg_2 = LinearFeatures(output_categories=True)
reg_2 = LinearLBFGS()
# Single LBFGS model, no pre-selection, HighCorrRemoval(corr_co=1) as post-selection.
reg_lvl1 = MLPipeline(
    [reg_2],
    pre_selection=None,
    features_pipeline=feats_reg_2,
    post_selection=HighCorrRemoval(corr_co=1),
)
print("End creation blending...")

print("Start creation automl...")
# Reader for a binary task.
reader = PandasToPandasReader(
    Task("binary", ),
    samples=None,
    max_nan_rate=1,
    max_constant_rate=1,
)

# Two levels: three pipelines on the first, one on the second; the result is
# passed through a MeanBlender.
automl = AutoML(
    reader,
    [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ],
    skip_conn=False,
    blender=MeanBlender(),
)
print("End creation automl...")
Пример #9
0
def test_permutation_importance_based_iterative_selector():
    """Run a two-level ``AutoML`` whose first level uses an iterative permutation-importance selector.

    The first level holds two ``BoostLGBM`` models (the second one tuned with
    Optuna) behind an ``NpIterativeFeatureSelector``. The second level holds one
    ``BoostLGBM`` with frozen defaults. Feature importances and ROC AUC scores are
    logged, then the model is pickled, reloaded and scored again.
    """
    logging.basicConfig(format="[%(asctime)s] (%(levelname)s): %(message)s",
                        level=logging.DEBUG)

    logging.debug("Load data...")
    data = pd.read_csv("./examples/data/sampled_app_train.csv")
    logging.debug("Data loaded")

    logging.debug("Features modification from user side...")
    day_delta = np.dtype("timedelta64[D]")
    birth_dates = np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(day_delta)
    data["BIRTH_DATE"] = birth_dates.astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before being turned into a date.
    employment_dates = np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(day_delta)
    data["EMP_DATE"] = employment_dates.astype(str)

    data["constant"] = 1
    data["allnan"] = np.nan

    data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)
    logging.debug("Features modification finished")

    logging.debug("Split data...")
    train_data, test_data = train_test_split(
        data,
        test_size=2000,
        stratify=data["TARGET"],
        random_state=13,
    )
    for part in (train_data, test_data):
        part.reset_index(drop=True, inplace=True)
    split_message = "Data splitted. Parts sizes: train_data = {}, test_data = {}"
    logging.debug(split_message.format(train_data.shape, test_data.shape))

    logging.debug("Create task...")
    task = Task("binary")
    logging.debug("Task created")

    logging.debug("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug("Reader created")

    # Selector: model, features and permutation-importance estimator.
    logging.debug("Create feature selector")
    selector_params = {
        "learning_rate": 0.05,
        "num_leaves": 64,
        "seed": 42,
        "num_threads": 5,
    }
    model0 = BoostLGBM(default_params=selector_params)
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(
        pipe0,
        model0,
        pie,
        feature_group_size=1,
        max_features_cnt_in_result=15,
    )
    logging.debug("Feature selector created")

    # First-level pipeline.
    logging.debug("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner1 and Model1...")
    model1_params = {
        "learning_rate": 0.05,
        "num_leaves": 128,
        "seed": 1,
        "num_threads": 5,
    }
    model1 = BoostLGBM(default_params=model1_params)
    logging.debug("\t Tuner1 and model1 created")

    logging.debug("\t ParamsTuner2 and Model2...")
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2_params = {
        "learning_rate": 0.025,
        "num_leaves": 64,
        "seed": 2,
        "num_threads": 5,
    }
    model2 = BoostLGBM(default_params=model2_params)
    logging.debug("\t Tuner2 and model2 created")

    logging.debug("\t Pipeline1...")
    first_level_algos = [model1, (model2, params_tuner2)]
    pipeline_lvl1 = MLPipeline(
        first_level_algos,
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug("Pipeline1 created")

    # Second-level pipeline.
    logging.debug("Start creation pipeline_2...")
    pipe1 = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner and Model...")
    top_model_params = {
        "learning_rate": 0.05,
        "num_leaves": 64,
        "max_bin": 1024,
        "seed": 3,
        "num_threads": 5,
    }
    model = BoostLGBM(default_params=top_model_params, freeze_defaults=True)
    logging.debug("\t Tuner and model created")

    logging.debug("\t Pipeline2...")
    pipeline_lvl2 = MLPipeline(
        [model],
        pre_selection=None,
        features_pipeline=pipe1,
        post_selection=None,
    )
    logging.debug("Pipeline2 created")

    logging.debug("Create AutoML pipeline...")
    levels = [
        [pipeline_lvl1],
        [pipeline_lvl2],
    ]
    automl = AutoML(reader, levels, skip_conn=False)

    logging.debug("AutoML pipeline created...")

    logging.debug("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"})
    elapsed = time.time() - start_time
    logging.debug("AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(elapsed))

    selector_scores = selector.get_features_score()
    logging.debug("Feature importances of selector:\n{}".format(selector_scores))

    logging.debug("oof_pred:\n{}\nShape = {}".format(oof_pred, oof_pred.shape))

    # Importances: the single top-level model first, then both first-level models.
    top_scores = automl.levels[-1][0].ml_algos[0].get_features_score()
    logging.debug("Feature importances of top level algorithm:\n{}".format(top_scores))

    low_scores_0 = automl.levels[0][0].ml_algos[0].get_features_score()
    logging.debug(
        "Feature importances of lowest level algorithm - model 0:\n{}".format(low_scores_0))

    low_scores_1 = automl.levels[0][0].ml_algos[1].get_features_score()
    logging.debug(
        "Feature importances of lowest level algorithm - model 1:\n{}".format(low_scores_1))

    test_pred = automl.predict(test_data)
    prediction_message = "Prediction for test data:\n{}\nShape = {}"
    logging.debug(prediction_message.format(test_pred, test_pred.shape))

    logging.debug("Check scores...")
    oof_score = roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0])
    logging.debug("OOF score: {}".format(oof_score))
    test_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])
    logging.debug("TEST score: {}".format(test_score))

    # Pickle round trip.
    pickle_path = "automl.pickle"
    logging.debug("Pickle automl")
    with open(pickle_path, "wb") as dump_file:
        pickle.dump(automl, dump_file)

    logging.debug("Load pickled automl")
    with open(pickle_path, "rb") as load_file:
        automl = pickle.load(load_file)

    logging.debug("Predict loaded automl")
    test_pred = automl.predict(test_data)
    loaded_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])
    logging.debug("TEST score, loaded: {}".format(loaded_score))

    os.remove(pickle_path)
Пример #10
0
        {
            "name": "reg",
            "loss": "quantile",
            "loss_params": {
                "q": 0.9
            },
            "metric": "quantile",
            "metric_params": {
                "q": 0.9
            },
        },
    ],
    ["TARGET", "TARGET", "AMT_CREDIT", "AMT_CREDIT", "AMT_CREDIT"],
):
    print("Create task..")
    task = Task(**task_params)
    print("Task created")

    print("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    print("Reader created")

    # pipeline 1 level parts
    print("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    print("\t ParamsTuner2 and Model2...")
    model2 = BoostLGBM(default_params={
        "learning_rate": 0.025,
        "num_leaves": 64,
        "seed": 2,
Пример #11
0
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task

# Fix the NumPy seed before anything random happens.
np.random.seed(42)

# Load the sample and hold out 500 rows for testing.
data = pd.read_csv("./data/avito1k_train.csv")
train, test = train_test_split(data, test_size=500, random_state=42)

# Column roles: regression target, grouping column and free-text columns.
text_columns = ["description", "title", "param_1", "param_2", "param_3"]
roles = {
    "target": "deal_probability",
    "group": ["user_id"],
    "text": text_columns,
}

task = Task("reg")

# Fit the NLP preset and predict on the hold-out rows.
automl = TabularNLPAutoML(task=task, timeout=600)
oof_pred = automl.fit_predict(train, roles=roles)
test_pred = automl.predict(test)

# Rows with at least one non-NaN out-of-fold prediction.
not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

print("Check scores...")
oof_mse = mean_squared_error(
    train[roles["target"]].values[not_nan],
    oof_pred.data[not_nan][:, 0],
)
print("OOF score: {}".format(oof_mse))
test_mse = mean_squared_error(test[roles["target"]].values, test_pred.data[:, 0])
print("TEST score: {}".format(test_mse))

# Remove the "./models" directory; a missing directory is ignored.
shutil.rmtree("./models", ignore_errors=True)
Пример #12
0
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

# Read the sample and hold out a stratified 20% of the rows.
data = pd.read_csv("./data/sampled_app_train.csv")
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data["TARGET"],
    random_state=42,
)

# Fit the tabular preset on a binary task; "SK_ID_CURR" is given the "drop" role.
automl = TabularAutoML(task=Task("binary"))
column_roles = {"target": "TARGET", "drop": ["SK_ID_CURR"]}
oof_predictions = automl.fit_predict(train_data, roles=column_roles)
te_pred = automl.predict(test_data)

# Report ROC AUC for the out-of-fold and the hold-out predictions.
oof_auc = roc_auc_score(train_data["TARGET"].values, oof_predictions.data[:, 0])
print(f"Score for out-of-fold predictions: {oof_auc}")
holdout_auc = roc_auc_score(test_data["TARGET"].values, te_pred.data[:, 0])
print(f"Score for hold-out: {holdout_auc}")
Пример #13
0
    def __init__(
        self,
        outcome_learners: Optional[Sequence[AutoML]] = None,
        effect_learners: Optional[Sequence[AutoML]] = None,
        propensity_learner: Optional[AutoML] = None,
        base_task: Optional[Task] = None,
        timeout: Optional[int] = None,
        cpu_limit: int = 4,
        gpu_ids: Optional[str] = "all",
    ):
        """Set up the outcome, effect and propensity learners.

        Args:
            outcome_learners: Models predicting `outcome` (base task) for each group
                (treatment/control); the base task can be classification or regression.
                It can be: two models, one model or nothing.
                If there is one model, it is used for control and a deep copy of it
                for treatment.
                If `None` or empty, default learners on the base task are used.
            effect_learners: Models predicting the treatment effect (task must be 'reg').
                It can be: two models, one model or nothing.
                If there is one model, it is used for control and a deep copy of it
                for treatment.
                If `None` or empty, default learners on a 'reg' task are used.
            propensity_learner: Model predicting treatment group membership.
                If `None`, a default learner on a 'binary' task is used.
            base_task: Task - 'binary' or 'reg'. Ignored when `outcome_learners` is
                non-empty: the task of the first outcome learner is used instead.
            timeout: Timeout passed to the parent initializer.
            cpu_limit: CPU limit that is passed to each automl.
            gpu_ids: GPU IDs that are passed to each automl.

        Raises:
            RuntimeError: If neither outcome learners nor `base_task` are given, or
                if more than two outcome or effect learners are given.

        """
        has_outcome_learners = outcome_learners is not None and len(outcome_learners) > 0

        if not has_outcome_learners and base_task is None:
            raise RuntimeError('Must specify any of learners or "base_task"')

        if has_outcome_learners:
            # The first supplied outcome learner dictates the base task.
            base_task = self._get_task(outcome_learners[0])

        super().__init__(base_task, timeout, cpu_limit, gpu_ids)

        def resolve_pair(learners, make_task, argument_name):
            """Return the (control, treatment) learners for 0, 1 or 2 supplied models."""
            if learners is None or len(learners) == 0:
                return (
                    self._get_default_learner(make_task()),
                    self._get_default_learner(make_task()),
                )
            if len(learners) == 1:
                # Treatment gets its own copy so the two groups are fitted independently.
                return learners[0], copy.deepcopy(learners[0])
            if len(learners) == 2:
                return learners[0], learners[1]
            raise RuntimeError('The number of "{}" must be 0/1/2'.format(argument_name))

        self.learners: Dict[str, Union[Dict[str, AutoML], AutoML]] = {
            "outcome": {},
            "effect": {},
        }
        if propensity_learner is None:
            self.learners["propensity"] = self._get_default_learner(Task("binary"))
        else:
            self.learners["propensity"] = propensity_learner

        control, treatment = resolve_pair(outcome_learners, lambda: self.base_task, "outcome_learners")
        self.learners["outcome"]["control"] = control
        self.learners["outcome"]["treatment"] = treatment

        control, treatment = resolve_pair(effect_learners, lambda: Task("reg"), "effect_learners")
        self.learners["effect"]["control"] = control
        self.learners["effect"]["treatment"] = treatment
Пример #14
0
# Script fragment: fits TabularUtilizedAutoML on a multiclass task.
# TEST_SIZE, RANDOM_STATE and N_THREADS are defined before this fragment
# (not visible here).

# Time budget in seconds (4 hours).
TIMEOUT = 3600 * 4

train = pd.read_csv("train.csv", header=0)
test = pd.read_csv("test.csv", header=0)

# The 'id' column is removed from both frames.
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)

# NOTE(review): this slice leaves out the LAST TWO columns, while `y` takes only
# 'target' - confirm that dropping the second column is intended.
X = train.iloc[:,:-2]
y = train['target']

# NOTE(review): X_train/X_test/y_train/y_test are not used in the visible part of
# the script; `y` is taken before the label encoding below, so it still holds the
# original labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
# Replace the 'target' column of `train` with integer-encoded labels.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

automl = TabularUtilizedAutoML(task = Task('multiclass'),
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               verbose=0,
                               reader_params = {'n_jobs': N_THREADS}
)

target_column = 'target'
# NOTE(review): 'id' was already dropped from `train` above, yet it is still listed
# under the 'drop' role - verify how the reader treats a missing column.
roles = {
    'target': target_column,
    'drop': ['id']
}

# Fit on the full `train` frame and keep the returned predictions.
lightml_pred = automl.fit_predict(train, roles = roles)
#print('lightml_pred:\n{}\nShape = {}'.format(lightml_pred[:10], lightml_pred.shape))
Пример #15
0
def binary_task():
    """Create and return a new ``Task`` configured as ``"binary"``."""
    task = Task("binary")
    return task
Пример #16
0
def test_permutation_importance_based_iterative_selector():
    """Smoke-test a two-level AutoML that uses an iterative feature selector.

    The test reads the sampled application data, derives two date columns,
    adds a constant and an all-NaN column, and holds out 2000 rows stratified
    by ``TARGET``.  Level one is an ``MLPipeline`` with two ``BoostLGBM``
    models (the second paired with an ``OptunaTuner``) behind an
    ``NpIterativeFeatureSelector`` that uses
    ``NpPermutationImportanceEstimator``.  Level two is a single ``BoostLGBM``
    created with ``freeze_defaults=True``.  After fitting, feature scores and
    ROC AUC values are logged, then the fitted AutoML is pickled, reloaded and
    used to predict again.

    Nothing is asserted: the test passes when the whole flow runs without
    raising.  The temporary ``automl.pickle`` file is removed even when the
    pickle round trip fails.
    """
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # DAYS_EMPLOYED values above 0 are clipped to 0 before the conversion.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    logging.debug('Create task...')
    task = Task('binary')
    logging.debug('Task created')

    logging.debug('Create reader...')
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug('Reader created')

    # selector parts
    logging.debug('Create feature selector')
    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': 5
    })
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0,
                                          model0,
                                          pie,
                                          feature_group_size=1,
                                          max_features_cnt_in_result=15)
    logging.debug('Feature selector created')

    # pipeline 1 level parts
    logging.debug('Start creation pipeline_1...')
    pipe = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner1 and Model1...')
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': 5
    })
    logging.debug('\t Tuner1 and model1 created')

    logging.debug('\t ParamsTuner2 and Model2...')
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': 5
    })
    logging.debug('\t Tuner2 and model2 created')

    logging.debug('\t Pipeline1...')
    # model1 is passed bare, model2 is paired with its tuner.
    pipeline_lvl1 = MLPipeline([model1, (model2, params_tuner2)],
                               pre_selection=selector,
                               features_pipeline=pipe,
                               post_selection=None)
    logging.debug('Pipeline1 created')

    # pipeline 2 level parts
    logging.debug('Start creation pipeline_2...')
    pipe1 = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner and Model...')
    model = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': 5
    },
                      freeze_defaults=True)
    logging.debug('\t Tuner and model created')

    logging.debug('\t Pipeline2...')
    pipeline_lvl2 = MLPipeline([model],
                               pre_selection=None,
                               features_pipeline=pipe1,
                               post_selection=None)
    logging.debug('Pipeline2 created')

    logging.debug('Create AutoML pipeline...')
    automl = AutoML(reader, [
        [pipeline_lvl1],
        [pipeline_lvl2],
    ],
                    skip_conn=False)

    logging.debug('AutoML pipeline created...')

    logging.debug('Start AutoML pipeline fit_predict...')
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={'target': 'TARGET'})
    logging.debug(
        'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
            time.time() - start_time))

    logging.debug('Feature importances of selector:\n{}'.format(
        selector.get_features_score()))

    logging.debug('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    logging.debug('Feature importances of top level algorithm:\n{}'.format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 0:\n{}'.format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 1:\n{}'.format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train_data['TARGET'].values, oof_pred.data[:, 0])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

    logging.debug('Pickle automl')
    try:
        with open('automl.pickle', 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        # The file was written a few lines above by this very test, so
        # unpickling it is safe here.
        with open('automl.pickle', 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # Never leave the temporary pickle behind, even if a step above
        # failed; the existence check keeps a failed `open` from being masked.
        if os.path.exists('automl.pickle'):
            os.remove('automl.pickle')
Пример #17
0
def run(dataset, config):
    """Fit LightAutoML on ``dataset`` and hand the predictions to ``result``.

    Args:
        dataset: benchmark dataset; the code reads ``target.name``,
            ``train.data``, ``test.X``, ``test.y`` and ``problem_type``.
        config: run configuration; the code reads ``type``,
            ``max_mem_size_mb``, ``max_runtime_seconds``, ``cores``,
            ``seed`` and ``output_predictions_file``.

    Returns:
        Whatever ``result(...)`` returns for the collected predictions,
        probabilities, probability labels and the two timer durations.
    """
    log.info(f"\n**** lightautoml (R) [{__version__}] ****\n")

    for ignored_category in (FutureWarning, DeprecationWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    classification_run = config.type == 'classification'

    target_name = dataset.target.name
    train_frame = dataset.train.data

    # The memory limit arrives in megabytes and is passed on divided by 1024.
    memory_limit_gb = float(config.max_mem_size_mb) / 1024
    # 'regression' is passed to Task as 'reg'; other names go through as-is.
    task_name = ('reg' if dataset.problem_type == 'regression'
                 else dataset.problem_type)
    automl = TabularUtilizedAutoML(
        task=Task(task_name),
        timeout=config.max_runtime_seconds,
        cpu_limit=config.cores,
        memory_limit=memory_limit_gb,
        random_state=config.seed,
    )

    log.info("Training...")
    with Timer() as fit_timer:
        automl.fit_predict(train_data=train_frame,
                           roles={'target': target_name})

    # The test target is fetched alongside the features but not used below.
    test_features, _ = dataset.test.X, dataset.test.y
    log.info("Predicting on the test set...")
    with Timer() as predict_timer:
        raw_preds = automl.predict(test_features).data

    probabilities_labels = None
    if not classification_run:
        probabilities = None
        predictions = raw_preds
    else:
        probabilities = raw_preds

        if dataset.problem_type == 'binary':
            # Expand the single column p into the two columns [1 - p, p].
            positive = probabilities[:, 0]
            probabilities = np.vstack([1 - positive, positive]).T

        predictions = np.argmax(probabilities, axis=1)

        first_algo = automl.outer_pipes[0].ml_algos[0]
        label_to_column = first_algo.models[0][0].reader.class_mapping
        if label_to_column is None and train_frame[target_name].dtype == bool:
            label_to_column = {False: 0, True: 1}

        if label_to_column:
            # Invert the mapping so column indices lead back to labels.
            column_to_label = {}
            for class_label, column in label_to_column.items():
                column_to_label[column] = class_label

            predictions = [column_to_label.get(column)
                           for column in predictions]
            probabilities_labels = [column_to_label[column]
                                    for column in sorted(column_to_label)]

    log.debug(probabilities)
    log.debug(config.output_predictions_file)

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        probabilities_labels=probabilities_labels,
        probabilities=probabilities,
        predictions=predictions,
        training_duration=fit_timer.duration,
        predict_duration=predict_timer.duration,
    )
Пример #18
0
def run(dataset, config):
    """Fit LightAutoML on encoded benchmark data and report via ``result``.

    Args:
        dataset: benchmark dataset; the code reads ``train.y_enc``,
            ``test.y_enc``, ``columns`` (unpacked as pairs and also turned
            into a dict), ``target.name``, ``train.data``, ``test.data`` and
            ``problem_type``.
        config: run configuration; the code reads ``type``,
            ``max_mem_size_mb``, ``max_runtime_seconds``, ``cores``,
            ``seed`` and ``output_predictions_file``.

    Returns:
        Whatever ``result(...)`` returns for the predictions, probabilities,
        encoded truth and the two timer durations.
    """
    log.info(f"\n**** lightautoml (R) [{__version__}] ****\n")
    save_metadata(config, version=__version__)

    for ignored_category in (FutureWarning, DeprecationWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    classification_run = config.type == 'classification'

    encoded_train_target, encoded_test_target = dataset.train.y_enc, dataset.test.y_enc

    column_names, _unused_types = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    target_name = dataset.target.name

    def _as_typed_frame(raw):
        # Wrap raw data in a DataFrame carrying the dataset's names and dtypes.
        return pd.DataFrame(raw, columns=column_names).astype(column_types, copy=False)

    train_frame = _as_typed_frame(dataset.train.data)
    # Overwrite the target column with the encoded training target.
    train_frame[dataset.target.name] = encoded_train_target

    # The memory limit arrives in megabytes and is passed on divided by 1024.
    memory_limit_gb = float(config.max_mem_size_mb) / 1024
    # 'regression' is passed to Task as 'reg'; other names go through as-is.
    task_name = ('reg' if dataset.problem_type == 'regression'
                 else dataset.problem_type)
    automl = TabularUtilizedAutoML(
        task=Task(task_name),
        timeout=config.max_runtime_seconds,
        cpu_limit=config.cores,
        memory_limit=memory_limit_gb,
        random_state=config.seed,
    )

    log.info("Training...")
    with utils.Timer() as fit_timer:
        automl.fit_predict(train_data=train_frame,
                           roles={'target': target_name})

    test_frame = _as_typed_frame(dataset.test.data)
    test_features = test_frame.drop(columns=target_name)

    log.info("Predicting on the test set...")
    with utils.Timer() as predict_timer:
        raw_preds = automl.predict(test_features).data

    if not classification_run:
        probabilities = None
        predictions = raw_preds
    else:
        probabilities = raw_preds

        if dataset.problem_type == 'binary':
            # Expand the single column p into the two columns [1 - p, p].
            positive = probabilities[:, 0]
            probabilities = np.vstack([1 - positive, positive]).T

        predictions = np.argmax(probabilities, axis=1)

    log.debug(probabilities)
    log.debug(config.output_predictions_file)

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=encoded_test_target,
        target_is_encoded=classification_run,
        training_duration=fit_timer.duration,
        predict_duration=predict_timer.duration,
    )
Пример #19
0
def test_multiclass_task_with_catboost():
    """Smoke-test a multiclass AutoML built from BoostCB and linear pipelines.

    The test reads the sampled application data, derives date columns, adds
    degenerate columns, and overwrites a random subset of ``TARGET`` with the
    value 2.  It then builds one AutoML level holding a ``BoostCB`` pipeline
    (with an importance-cutoff selector) and a ``LinearLBFGS`` pipeline,
    blended by ``WeightedBlender``.  After fitting, log-loss and per-class
    ROC AUC are logged for the out-of-fold and the test predictions.

    Nothing is asserted; the test passes when the flow runs without raising.
    """
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # DAYS_EMPLOYED values above 0 are clipped to 0 before the conversion.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # The same date for every row.
    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    # Rows whose random draw exceeds 0.5 get TARGET = 2, the rest keep their
    # original value (presumably 0/1, giving three classes — confirm).
    # NOTE(review): the draw uses the global NumPy RNG without a seed in this
    # function, so the target differs between runs.
    data['TARGET'] = np.where(
        np.random.rand(data.shape[0]) > .5, 2, data['TARGET'].values)

    train, test = train_test_split(data, test_size=2000, random_state=42)
    # ======================================================================================
    logging.debug('Create timer...')
    # NOTE(review): 600 is presumably the overall time budget in seconds and
    # `mode` a PipelineTimer policy — confirm against the PipelineTimer docs.
    timer = PipelineTimer(600, mode=2)
    logging.debug('Timer created...')
    # ======================================================================================
    logging.debug('Create selector...')
    # First of three 'gbm' task timers drawn from the shared timer.
    timer_gbm = timer.get_task_timer('gbm')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostCB(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(
        feat_sel_0,
        mod_sel_0,
        imp_sel_0,
        cutoff=0,
    )
    logging.debug('Selector created...')
    # ======================================================================================
    logging.debug('Create gbms...')
    # The selector's importance estimator is reused by the feature pipeline.
    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer('gbm')
    timer_gbm_1 = timer.get_task_timer('gbm')

    # NOTE(review): "devices": "0" presumably selects a CatBoost device id —
    # confirm how BoostCB forwards it.
    gbm_0 = BoostCB(timer=timer_gbm_0, default_params={"devices": "0"})
    gbm_1 = BoostCB(timer=timer_gbm_1, default_params={"devices": "0"})

    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    # gbm_0 is paired with its tuner, gbm_1 is passed bare.
    gbm_lvl0 = MLPipeline([(gbm_0, tuner_0), gbm_1],
                          pre_selection=selector_0,
                          features_pipeline=feats_gbm_0,
                          post_selection=None)
    logging.debug('Gbms created...')
    # ======================================================================================
    logging.debug('Create linear...')
    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe='auto')

    timer_reg = timer.get_task_timer('reg')
    reg_0 = LinearLBFGS(timer=timer_reg)

    reg_lvl0 = MLPipeline([reg_0],
                          pre_selection=None,
                          features_pipeline=feats_reg_0,
                          post_selection=None)
    logging.debug('Linear created...')
    # ======================================================================================
    logging.debug('Create reader...')
    reader = PandasToPandasReader(Task(
        'multiclass',
        metric='crossentropy',
    ),
                                  samples=None,
                                  max_nan_rate=1,
                                  max_constant_rate=1,
                                  advanced_roles=True,
                                  drop_score_co=-1,
                                  n_jobs=1)
    logging.debug('Reader created...')
    # ======================================================================================
    logging.debug('Create blender...')
    blender = WeightedBlender()
    logging.debug('Blender created...')
    # ======================================================================================
    logging.debug('Create AutoML...')
    # A single level holding both pipelines, sharing the timer created above.
    automl = AutoML(reader=reader,
                    levels=[[gbm_lvl0, reg_lvl0]],
                    timer=timer,
                    blender=blender,
                    skip_conn=False)
    logging.debug('AutoML created...')
    # ======================================================================================
    logging.debug('Fit predict...')
    oof_pred = automl.fit_predict(train, roles={'target': "TARGET"})
    logging.debug('Finished fitting...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))
    # ======================================================================================
    logging.debug('Check scores...')
    # Keep only rows that have at least one non-NaN out-of-fold prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('OOF score: {}'.format(
        log_loss(train['TARGET'].values[not_nan], oof_pred.data[not_nan])))
    logging.debug('TEST score: {}'.format(
        log_loss(test['TARGET'].values, test_pred.data)))
    # ======================================================================================
    # One-vs-rest ROC AUC for classes 0, 1, 2 on both prediction sets.
    # NOTE(review): unlike the log-loss above, the OOF rows are not filtered
    # by `not_nan` here — confirm NaN rows cannot reach roc_auc_score.
    for dat, df, name in zip([oof_pred, test_pred], [train, test],
                             ['train', 'test']):
        logging.debug('Check aucs {0}...'.format(name))
        for c in range(3):
            _sc = roc_auc_score((df['TARGET'].values == c).astype(np.float32),
                                dat.data[:, c])
            logging.debug('Cl {0} auc score: {1}'.format(c, _sc))
Пример #20
0

# Load the sampled application data and hold out 20% of the rows as test
# data, stratified by the "TARGET" column with a fixed random_state.
data = pd.read_csv("./data/sampled_app_train.csv")
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data["TARGET"], random_state=42)


def sample(optimization_search_space, trial, suggested_params):
    """Draw tuner parameters from a conditional search space.

    A shallow copy of ``suggested_params`` is extended with two values
    requested from ``trial.suggest_uniform``: ``feature_fraction`` in
    [0.5, 1.0], then ``min_sum_hessian_in_leaf`` whose range depends on the
    drawn feature fraction ([0.5, 1] above 0.7, [0, 0.5] otherwise).

    Args:
        optimization_search_space: not used by this function.
        trial: object providing ``suggest_uniform(name, low=..., high=...)``.
        suggested_params: base parameters; the input itself is not modified.

    Returns:
        The copied parameters with the two drawn values added.
    """
    params = copy.copy(suggested_params)

    feature_fraction = trial.suggest_uniform("feature_fraction", low=0.5, high=1.0)
    params["feature_fraction"] = feature_fraction

    # The hessian bounds are chosen from the feature fraction just drawn.
    if feature_fraction > 0.7:
        hessian_low, hessian_high = 0.5, 1
    else:
        hessian_low, hessian_high = 0, 0.5

    params["min_sum_hessian_in_leaf"] = trial.suggest_uniform(
        "min_sum_hessian_in_leaf", low=hessian_low, high=hessian_high
    )
    return params


# Run AutoML with the custom search space: the `sample` function is passed
# under the "optimization_search_space" key of `lgb_params`.
automl = TabularAutoML(
    task=Task("binary"),
    lgb_params={"optimization_search_space": sample},
)
# Fit on the training part; "TARGET" is the target column and "SK_ID_CURR"
# is listed under the "drop" role.
oof_predictions = automl.fit_predict(train_data, roles={"target": "TARGET", "drop": ["SK_ID_CURR"]})
te_pred = automl.predict(test_data)

# Print ROC AUC computed from column 0 of the out-of-fold and the hold-out
# predictions.
print(f"Score for out-of-fold predictions: {roc_auc_score(train_data['TARGET'].values, oof_predictions.data[:, 0])}")
print(f"Score for hold-out: {roc_auc_score(test_data['TARGET'].values, te_pred.data[:, 0])}")
Пример #21
0
# NOTE(review): `train_df`, TIMEOUT and N_THREADS are defined in a part of
# the script that is outside this fragment.
# train_df = train_df.drop(columns=['passengerid'])

# Load the test table and lower-case its column names.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/test.csv')
test_df.columns = [column.lower() for column in test_df.columns]

# Load the sample submission.
# NOTE(review): the result of `.head()` is discarded — presumably left over
# from a notebook cell; confirm before removing.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv')
submission.head()

# Names of all train_df columns except the first and the last one.
feature_columns = train_df.iloc[:, 1:-1].columns.values
target_column = 'target'

# Replace the target column with LabelEncoder's output.
le = LabelEncoder()
train_df[target_column] = le.fit_transform(train_df[target_column])


task = Task('multiclass',)

# Column roles handed to `fit_predict`: the target column and the 'id'
# column listed under 'drop'.
roles = {
    'target': target_column,
    'drop': ['id'],
}

# N_THREADS is used both as `cpu_limit` and as the reader's `n_jobs`.
automl = TabularUtilizedAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS},
                               verbose=0
)

# Fit on the training frame; the returned predictions are kept in `oof_pred`.
oof_pred = automl.fit_predict(train_df, roles = roles)
Пример #22
0
# Build the linear pipeline: LinearFeatures feeding a single LinearLBFGS.
print("Create linear...")
feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto")

# NOTE(review): `timer` is created in a part of the script that is outside
# this fragment.
timer_reg = timer.get_task_timer("reg")
reg_0 = LinearLBFGS(timer=timer_reg)

reg_lvl0 = MLPipeline([reg_0],
                      pre_selection=None,
                      features_pipeline=feats_reg_0,
                      post_selection=None)
print("Linear created...")
# ======================================================================================
# Reader for a multiclass task that uses the 'crossentropy' metric.
print("Create reader...")
reader = PandasToPandasReader(
    Task(
        "multiclass",
        metric="crossentropy",
    ),
    samples=None,
    max_nan_rate=1,
    max_constant_rate=1,
    advanced_roles=True,
    drop_score_co=-1,
    n_jobs=1,
)
print("Reader created...")
# ======================================================================================
print("Create blender...")
blender = WeightedBlender()
print("Blender created...")
# ======================================================================================
# The AutoML construction itself follows outside this fragment.
print("Create AutoML...")
Пример #23
0
def test_different_losses_and_metrics():
    """Smoke-test AutoML with several task/loss/metric combinations.

    The test reads the sampled application data, derives two date columns,
    adds a constant and an all-NaN column, and holds out 2000 rows stratified
    by ``TARGET``.  For each case (two binary tasks on ``TARGET`` and three
    regression tasks on ``AMT_CREDIT``) it fits a one-level AutoML with a
    single ``BoostLGBM``, logs the task's metric on the out-of-fold and test
    predictions, then pickles and reloads the fitted AutoML and logs the
    metric again.

    Nothing is asserted; the test passes when every case runs without
    raising.  The temporary ``automl.pickle`` file is removed after each
    case, even when the pickle round trip fails.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # DAYS_EMPLOYED values above 0 are clipped to 0 before the conversion.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)

    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    # Each case pairs the keyword arguments for Task with its target column.
    task_cases = [
        ({'name': 'binary'}, 'TARGET'),
        ({'name': 'binary', 'metric': roc_auc_score}, 'TARGET'),
        ({'name': 'reg', 'loss': 'mse', 'metric': 'r2'}, 'AMT_CREDIT'),
        ({'name': 'reg', 'loss': 'rmsle', 'metric': 'rmsle'}, 'AMT_CREDIT'),
        ({
            'name': 'reg',
            'loss': 'quantile',
            'loss_params': {
                'q': .9
            },
            'metric': 'quantile',
            'metric_params': {
                'q': .9
            }
        }, 'AMT_CREDIT'),
    ]

    for task_params, target in task_cases:
        logging.debug('Create task..')
        task = Task(**task_params)
        logging.debug('Task created')

        logging.debug('Create reader...')
        reader = PandasToPandasReader(task, cv=5, random_state=1)
        logging.debug('Reader created')

        # pipeline 1 level parts
        logging.debug('Start creation pipeline_1...')
        pipe = LGBSimpleFeatures()

        logging.debug('\t ParamsTuner2 and Model2...')
        model2 = BoostLGBM(default_params={
            'learning_rate': 0.025,
            'num_leaves': 64,
            'seed': 2,
            'num_threads': 5
        })
        logging.debug('\t Tuner2 and model2 created')

        logging.debug('\t Pipeline1...')
        pipeline_lvl1 = MLPipeline(
            [model2],
            pre_selection=None,  # selector,
            features_pipeline=pipe,
            post_selection=None)
        logging.debug('Pipeline1 created')

        logging.debug('Create AutoML pipeline...')
        automl = AutoML(reader, [
            [pipeline_lvl1],
        ], skip_conn=False)

        logging.debug('AutoML pipeline created...')

        logging.debug('Start AutoML pipeline fit_predict...')
        start_time = time.time()
        oof_pred = automl.fit_predict(train_data, roles={'target': target})
        logging.debug(
            'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
                time.time() - start_time))

        test_pred = automl.predict(test_data)
        logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
            test_pred, test_pred.shape))

        logging.debug('Check scores...')
        logging.debug('OOF score: {}'.format(
            task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
        logging.debug('TEST score: {}'.format(
            task.metric_func(test_data[target].values, test_pred.data[:, 0])))

        logging.debug('Pickle automl')
        try:
            with open('automl.pickle', 'wb') as f:
                pickle.dump(automl, f)

            logging.debug('Load pickled automl')
            # The file was written a few lines above by this very test, so
            # unpickling it is safe here.
            with open('automl.pickle', 'rb') as f:
                automl = pickle.load(f)

            logging.debug('Predict loaded automl')
            test_pred = automl.predict(test_data)
            # Score the reloaded model with the case's own metric and target
            # so the value is comparable to 'TEST score' above; a fixed
            # ROC AUC against 'TARGET' is meaningless for the regression cases.
            logging.debug('TEST score, loaded: {}'.format(
                task.metric_func(test_data[target].values,
                                 test_pred.data[:, 0])))
        finally:
            # Never leave the temporary pickle behind, even if a step above
            # failed; the existence check keeps a failed `open` from being
            # masked.
            if os.path.exists('automl.pickle'):
                os.remove('automl.pickle')