Example #1
def test_lightgbm_classifier():
    # Classification
    params = {
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
        'boosting_type': 'gbdt',
    }
    classifier_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
     early_stopping_rounds=5, verbose_eval=50, split_random_seed=42)
    fs = RFE(classifier_cls)

    dataset = BankNote()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None,
                                           max_lag=None,
                                           entity_name='*',
                                           exclude=[target_column])

    # Feature selection
    x_columns = [_ for _ in features_df.columns if _ != target_column]
    x_df = features_df[x_columns]
    y_df = features_df[[target_column]]
    n_vars = max(x_df.shape[1] - 5, 1)
    m = fs.fit(x_df, y_df, n_vars)

    # Assertions
    assert len(fs.selected_variables) == n_vars
    assert fs.selected_variables == ['Variance', 'Kurtosis', 'Skewness.ABS(* - MEAN(*))', 'Skewness', 'Variance.ABS(* - MEAN(*))', \
     'Entropy', 'Variance.ABS(* - Q25(*))', 'Kurtosis.ABS(* - MEDIAN(*))', 'Kurtosis.ABS(* - Q75(*))', 'Skewness.ABS(* - MEDIAN(*))', \
     'Kurtosis.ABS(* - Q25(*))', 'Kurtosis.ABS(* - MEAN(*))', 'Variance.ABS(* - MEDIAN(*))', 'Entropy.ABS(* - MEDIAN(*))', \
     'Entropy.ABS(* - Q25(*))']
Example #2
def test_lightgbm_regression():
    lgbm_params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'learning_rate': 0.1,
        'verbose': -1,
    }
    regressor_cls = get_lightgbm_learner_learning_api(lgbm_params, num_boost_round=2000, \
     early_stopping_rounds=5, split_random_seed=0)

    fs = Boruta(regressor_cls)

    dataset = Abalone()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None,
                                           max_lag=None,
                                           entity_name='*',
                                           exclude=[target_column])

    # Feature selection
    x_columns = [_ for _ in features_df.columns if _ != target_column]
    x_df = features_df[x_columns]
    y_df = features_df[[target_column]]
    m = fs.fit(x_df, y_df)

    assert len(fs.selected_variables) == 13
    assert fs.selected_variables == ['Shucked weight.ABS(* - Q75(*))', 'Shucked weight.ABS(* - MEDIAN(*))', \
     'Shucked weight.ABS(* - MEAN(*))', 'Shucked weight', 'Shell weight.ABS(* - MEAN(*))', 'Shell weight', \
     'Shucked weight.ABS(* - Q25(*))', 'Sex_I', 'Diameter', 'Whole weight', 'Shell weight.ABS(* - Q75(*))', \
     'Whole weight.ABS(* - MEAN(*))', 'Height']
Example #3
def test_non_additive_lean_boosted_classifier():
    # Classification
    params = {
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
        'boosting_type': 'gbdt',
    }
    lightgbm_classifier_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
     split_random_seed=42)
    dataset = BankNote()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None,
                                           max_lag=None,
                                           entity_name='*',
                                           exclude=[target_column])
    features_df[target_column] = features_df[target_column].astype(int)

    # Model building
    results = features_df.kxy.fit(target_column, lightgbm_classifier_cls, \
     problem_type='classification', additive_learning=False, return_scores=True, \
     n_down_perf_before_stop=1)
    assert results['Testing Accuracy'] == '0.964'
    assert results['Selected Variables'] == [
        'Variance', 'Skewness.ABS(* - Q25(*))', 'Kurtosis', 'Skewness',
        'Entropy'
    ]
Example #4
def test_non_additive_lean_boosted_regressor():
    # Regression
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }
    lightgbm_regressor_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
     split_random_seed=42)
    dataset = Abalone()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None,
                                           max_lag=None,
                                           entity_name='*',
                                           exclude=[target_column])

    # Model building
    results = features_df.kxy.fit(target_column, lightgbm_regressor_cls, \
     problem_type='regression', additive_learning=False, return_scores=True, \
     n_down_perf_before_stop=1)
    assert results['Testing R-Squared'] == '0.554'
    assert results['Selected Variables'] == ['Shell weight', 'Shucked weight', 'Whole weight', 'Shell weight.ABS(* - Q25(*))',\
     'Viscera weight.ABS(* - MEDIAN(*))', 'Viscera weight.ABS(* - MEAN(*))', 'Height', 'Length', 'Diameter', 'Sex_I',\
     'Shucked weight.ABS(* - MEDIAN(*))', 'Diameter.ABS(* - MEDIAN(*))', 'Viscera weight.ABS(* - Q75(*))',\
     'Viscera weight.ABS(* - Q25(*))', 'Diameter.ABS(* - Q25(*))', 'Sex_M', 'Sex_F']
Example #5
def lightgbm_classification_benchmark():
    # LeanML vs Boruta vs RFE
    params = {
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
    }

    lightgbm_classifier_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
        early_stopping_rounds=50, verbose_eval=50)
    classification_benchmark(lightgbm_classifier_cls, 'lightgbm')
Example #6
def lightgbm_regression_benchmark():
    # LeanML vs Boruta vs RFE
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'verbose': -1,
    }
    lightgbm_regressor_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
        early_stopping_rounds=50, verbose_eval=50)
    regression_benchmark(lightgbm_regressor_cls, 'lightgbm')
Example #7
def test_lean_boosted_lightgbm_learning_regressor():
    # Regression
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }
    lightgbm_regressor_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
     early_stopping_rounds=50, verbose_eval=50, split_random_seed=42)
    dataset = Abalone()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None,
                                           max_lag=None,
                                           entity_name='*',
                                           exclude=[target_column])

    # Model building
    results = features_df.kxy.fit(target_column, lightgbm_regressor_cls, \
     problem_type='regression', additive_learning=True, return_scores=True, \
     n_down_perf_before_stop=1)
    model = results['predictor'].models[0]
    feature_columns = results['Selected Variables']
    x = features_df[feature_columns].values
    predictions = model.predict(x)
    path = '../kxy/misc/cache/%s-%s.sav' % (dataset.name,
                                            'lightgbm-learning-api-regressor')
    model.save(path)

    loaded_model = lightgbm_regressor_cls(path=path)
    loaded_predictions = loaded_model.predict(x)

    assert np.allclose(predictions, loaded_predictions)
Example #8
def test_leanml_predictor_lightgbm():
    # Regression
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }
    lightgbm_regressor_cls = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
     early_stopping_rounds=50, verbose_eval=50, split_random_seed=42)
    dataset = Abalone()
    target_column = dataset.y_column
    df = dataset.df

    # Features generation
    features_df = df.kxy.generate_features(entity=None,
                                           max_lag=None,
                                           entity_name='*',
                                           exclude=[target_column])

    # Model building
    results = features_df.kxy.fit(target_column, lightgbm_regressor_cls, \
     problem_type='regression')
    feature_columns = results['Selected Variables']
    predictor = results['predictor']
    predictions = predictor.predict(features_df[feature_columns])
    path = '../kxy/misc/cache/%s-%s.sav' % (dataset.name,
                                            'lightgbm-learning-api-regressor')
    predictor.save(path)

    loaded_predictor = LeanMLPredictor.load(path, lightgbm_regressor_cls)
    loaded_predictions = loaded_predictor.predict(features_df[feature_columns])

    assert np.allclose(predictions, loaded_predictions)
# In addition to a fit method, the class should also define a
# save(self, path) method to save a model to disk, and a
# load(cls, path) class method to load a saved model from disk.

# See kxy.learning.base_learners for helper functions that allow you
# to create learner functions returning instances of popular predictive
# models (e.g. lightgbm, xgboost, sklearn, tensorflow, pytorch models,
# etc.), as sketched below.
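
# For instance, a sketch using the get_sklearn_learner helper (assuming,
# as in the kxy docs, that it takes the fully-qualified class name
# followed by constructor keyword arguments):
from kxy.learning import get_sklearn_learner
rf_learner_func = get_sklearn_learner( \
	'sklearn.ensemble.RandomForestClassifier', n_estimators=100)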

from kxy.learning import get_lightgbm_learner_learning_api
params = {
	'objective': 'binary',
	'metric': ['auc', 'binary_logloss'],
}
lightgbm_learner_func = get_lightgbm_learner_learning_api(params, \
	num_boost_round=10000, early_stopping_rounds=50, verbose_eval=50, \
	split_random_seed=0)
    
# 5. Fit a LightGBM classifier wrapped around LeanML feature selection
results = train_features_df.kxy.fit(target_column, \
	lightgbm_learner_func, problem_type='classification', \
	feature_selection_method='leanml')
predictor = results['predictor']

# 6. Make predictions from a dataframe of test features
test_predictions_df = predictor.predict(test_features_df)

# 7. Compute out-of-sample accuracy and AUC
from sklearn.metrics import accuracy_score, roc_auc_score
accuracy = accuracy_score(
    test_labels_df[target_column].values, \
    test_predictions_df[target_column].values)
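
# Compute AUC as well (a sketch: roc_auc_score is imported above; for a
# binary target it can be computed from the predicted labels, though
# predicted probabilities are preferable when the model exposes them):
auc = roc_auc_score( \
    test_labels_df[target_column].values, \
    test_predictions_df[target_column].values)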