def test_manual_pipeline(sampled_app_train_test, sampled_app_roles,
                         binary_task):
    """Smoke-test a hand-assembled ML pipeline, including a pickle round trip.

    The test builds a ``PandasDataset`` from the train split, fits an
    importance-based feature selector, fits an ``MLPipeline`` made of two
    ``BoostLGBM`` models (each paired with an ``OptunaTuner``), predicts on
    the training dataset, then pickles the fitted pipeline to disk, loads it
    back and predicts again.

    There are no assertions: the test passes when no step raises.

    Args:
        sampled_app_train_test: Fixture yielding a ``(train, test)`` pair;
            only the train part is used here.
        sampled_app_roles: Fixture with the column-role mapping that is fed
            to ``roles_parser``.
        binary_task: Fixture with the task object passed to the dataset.
    """
    # The test split is not needed by this test.
    train, _ = sampled_app_train_test

    pd_dataset = PandasDataset(train,
                               roles_parser(sampled_app_roles),
                               task=binary_task)

    # --- Feature pre-selection ---------------------------------------------
    # NOTE(review): the second positional argument (1) is passed through
    # unchanged; its exact meaning is defined by FoldsIterator - confirm.
    selector_iterator = FoldsIterator(pd_dataset, 1)

    selector_model = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 64,
        "seed": 0,
        "num_threads": 5,
    })
    selector = ImportanceCutoffSelector(
        LGBSimpleFeatures(),
        selector_model,
        ModelBasedImportanceEstimator(),
        cutoff=10,
    )
    selector.fit(selector_iterator)

    # --- Main pipeline -----------------------------------------------------
    # A fresh features pipeline instance is used for the main pipeline, as
    # distinct from the one handed to the selector above.
    pipe = LGBSimpleFeatures()

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 128
    })

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={
        "learning_rate": 0.025,
        "num_leaves": 64
    })

    total = MLPipeline(
        [(model1, params_tuner1), (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )

    train_valid = FoldsIterator(pd_dataset)

    total.fit_predict(train_valid)
    total.predict(pd_dataset)

    # --- Pickle round trip -------------------------------------------------
    # The artifact is removed in ``finally`` so that a failure while dumping,
    # loading or predicting does not leave ``automl.pickle`` behind in the
    # working directory.
    pickle_path = "automl.pickle"
    try:
        with open(pickle_path, "wb") as f:
            pickle.dump(total, f)

        with open(pickle_path, "rb") as f:
            total = pickle.load(f)

        # The restored pipeline must still be able to predict.
        total.predict(pd_dataset)
    finally:
        # Guarded: if open() itself failed there is nothing to delete, and an
        # unguarded remove would mask the original error.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
示例#2
0
def test_manual_pipeline():
    """Run the manual AutoML pipeline end to end on the sampled app data.

    Steps, in order: read a column subset of the sample CSV, derive two date
    columns, assign random folds, wrap the frame in a ``PandasDataset``, fit
    an importance-based feature selector, fit a two-model ``MLPipeline`` with
    Optuna tuners, predict on the training dataset, and verify that the
    fitted pipeline can be pickled, reloaded and used to predict again.

    Progress and timings are reported through ``logging.debug``. The function
    makes no assertions; it fails only if one of the steps raises.
    """
    # Read data from file.
    # The path is relative, so it is resolved against the current working
    # directory of the process running the test.
    logging.debug('Read data from file')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv',
                       usecols=[
                           'TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
                           'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE', 'DAYS_BIRTH',
                           'DAYS_EMPLOYED'
                       ])

    # Fix dates and convert to date type: the day-offset columns are added to
    # the reference date 2018-01-01. DAYS_EMPLOYED is clipped to an upper
    # bound of 0 before conversion.
    logging.debug('Fix dates and convert to date type')
    reference_date = np.datetime64('2018-01-01')
    day_delta = np.dtype('timedelta64[D]')
    data['BIRTH_DATE'] = reference_date + data['DAYS_BIRTH'].astype(day_delta)
    data['EMP_DATE'] = reference_date + np.clip(
        data['DAYS_EMPLOYED'], None, 0).astype(day_delta)
    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Create folds: one random integer in [0, 5) per row.
    # NOTE(review): the global NumPy RNG is not seeded here, so the fold
    # assignment differs between runs.
    logging.debug('Create folds')
    data['__fold__'] = np.random.randint(0, 5, len(data))

    # Print data head
    logging.debug('Print data head')
    print(data.head())

    # Set roles for columns
    logging.debug('Set roles for columns')
    check_roles = {
        TargetRole(): 'TARGET',
        CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
        NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
        DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
        FoldsRole(): '__fold__'
    }

    # Create Task
    task = Task('binary')

    # Create PandasDataset
    logging.debug('Creating PandasDataset')
    start_time = time.time()
    pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)
    elapsed = time.time() - start_time
    logging.debug('PandasDataset created. Time = {:.3f} sec'.format(elapsed))

    # Print pandas dataset feature roles
    logging.debug('Print pandas dataset feature roles')
    roles = pd_dataset.roles
    for role in roles:
        logging.debug('{}: {}'.format(role, roles[role]))

    # Feature selection part
    logging.debug('Feature selection part')
    # NOTE(review): the second positional argument (1) is passed through
    # unchanged; its exact meaning is defined by FoldsIterator - confirm.
    selector_iterator = FoldsIterator(pd_dataset, 1)
    logging.debug('Selection iterator created')

    # The original also built an extra, never-used ``BoostLGBM()`` instance
    # here; it has been dropped.
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe and model created')

    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 0,
        'num_threads': 5
    })

    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)
    start_time = time.time()
    selector.fit(selector_iterator)
    elapsed = time.time() - start_time
    logging.debug('Feature selector fitted. Time = {:.3f} sec'.format(elapsed))

    logging.debug('Feature selector scores:')
    logging.debug('\n{}'.format(selector.get_features_score()))

    # Build AutoML pipeline. A fresh features pipeline instance is created
    # rather than reusing the one handed to the selector.
    logging.debug('Start building AutoML pipeline')
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe created')

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128
    })
    logging.debug('Tuner1 and model1 created')

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64
    })
    logging.debug('Tuner2 and model2 created')

    total = MLPipeline([(model1, params_tuner1), (model2, params_tuner2)],
                       pre_selection=selector,
                       features_pipeline=pipe,
                       post_selection=None)

    logging.debug('Finished building AutoML pipeline')

    # Create full train iterator
    logging.debug('Full train valid iterator creation')
    train_valid = FoldsIterator(pd_dataset)
    logging.debug('Full train valid iterator created')

    # Fit predict using pipeline
    logging.debug('Start AutoML pipeline fit_predict')
    start_time = time.time()
    pred = total.fit_predict(train_valid)
    elapsed = time.time() - start_time
    logging.debug('Fit_predict finished. Time = {:.3f} sec'.format(elapsed))

    # Check preds
    logging.debug('Preds:')
    logging.debug('\n{}'.format(pred))
    logging.debug('Preds.shape = {}'.format(pred.shape))

    # Predict full train dataset
    logging.debug('Predict full train dataset')
    start_time = time.time()
    train_pred = total.predict(pd_dataset)
    elapsed = time.time() - start_time
    logging.debug('Predict finished. Time = {:.3f} sec'.format(elapsed))
    logging.debug('Preds:')
    logging.debug('\n{}'.format(train_pred))
    logging.debug('Preds.shape = {}'.format(train_pred.shape))

    # Pickle round trip. The artifact is removed in ``finally`` so that a
    # failure while dumping, loading or predicting does not leave
    # 'automl.pickle' behind in the working directory.
    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(total, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            total = pickle.load(f)

        logging.debug('Predict loaded automl')
        train_pred = total.predict(pd_dataset)
    finally:
        # Guarded: if open() itself failed there is nothing to delete, and an
        # unguarded remove would mask the original error.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)

    # Check preds feature names
    logging.debug('Preds features: {}'.format(train_pred.features))

    # Check model feature scores. ``model1`` / ``model2`` are the objects
    # created above, not the ones inside the unpickled pipeline.
    logging.debug('Feature scores for model_1:\n{}'.format(
        model1.get_features_score()))
    logging.debug('Feature scores for model_2:\n{}'.format(
        model2.get_features_score()))
示例#3
0
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

# Script fragment: `total` (the pipeline object) and `pd_dataset` are defined
# earlier in the script, outside this excerpt.
print("Finished building AutoML pipeline")

# Create full train iterator
print("Full train valid iterator creation")
train_valid = FoldsIterator(pd_dataset)
print("Full train valid iterator created")

# Fit predict using pipeline; wall-clock time of the call is reported.
print("Start AutoML pipeline fit_predict")
start_time = time.time()
pred = total.fit_predict(train_valid)
print("Fit_predict finished. Time = {:.3f} sec".format(time.time() -
                                                       start_time))

# Check preds: print the returned prediction object and its shape.
print("Preds:")
print("\n{}".format(pred))
print("Preds.shape = {}".format(pred.shape))

# Predict full train dataset (the same dataset the pipeline was fitted on);
# `start_time` is reset so the reported time covers only this predict call.
print("Predict full train dataset")
start_time = time.time()
train_pred = total.predict(pd_dataset)
print("Predict finished. Time = {:.3f} sec".format(time.time() - start_time))
print("Preds:")
print("\n{}".format(train_pred))