def test_manual_pipeline(sampled_app_train_test, sampled_app_roles, binary_task):
    """Fit a hand-assembled two-model pipeline and check it still predicts after pickling.

    Args:
        sampled_app_train_test: Pair unpacked as ``(train, test)``; only the
            first element is used here. Presumably a pytest fixture - confirm
            against conftest.
        sampled_app_roles: Roles mapping handed to ``roles_parser``.
        binary_task: Task object passed to ``PandasDataset``.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    train, _ = sampled_app_train_test

    pd_dataset = PandasDataset(train, roles_parser(sampled_app_roles), task=binary_task)

    # Feature selection: fit an importance-cutoff selector on its own iterator.
    selector_iterator = FoldsIterator(pd_dataset, 1)
    pipe = LGBSimpleFeatures()
    model0 = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "seed": 0,
            "num_threads": 5,
        }
    )
    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)
    selector.fit(selector_iterator)

    # Main pipeline: two boosted models, each with its own Optuna tuner.
    pipe = LGBSimpleFeatures()

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={"learning_rate": 0.05, "num_leaves": 128})

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={"learning_rate": 0.025, "num_leaves": 64})

    total = MLPipeline(
        [(model1, params_tuner1), (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )

    train_valid = FoldsIterator(pd_dataset)
    total.fit_predict(train_valid)
    total.predict(pd_dataset)

    # Pickle round-trip. Cleanup lives in ``finally`` so a failure while
    # loading or predicting does not leave the file in the working directory.
    pickle_path = "automl.pickle"
    try:
        with open(pickle_path, "wb") as f:
            pickle.dump(total, f)

        with open(pickle_path, "rb") as f:
            total = pickle.load(f)

        total.predict(pd_dataset)
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) data["TARGET"] = data["TARGET"] print("Features modification finished") print("Split data...") train, test = train_test_split(data, test_size=0.2, random_state=42) train.reset_index(drop=True, inplace=True) test.reset_index(drop=True, inplace=True) print("Data splitted. Parts sizes: train_data = {}, test_data = {}".format( train.shape, test.shape)) print("Start creation selector_0...") feat_sel_0 = LGBSimpleFeatures() mod_sel_0 = BoostLGBM() imp_sel_0 = ModelBasedImportanceEstimator() selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0) print("End creation selector_0...") print("Start creation gbm_0...") feats_gbm_0 = LGBAdvancedPipeline() gbm_0 = BoostLGBM() gbm_1 = BoostLGBM() tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True) gbm_lvl0 = MLPipeline( [(gbm_0, tuner_0), gbm_1], pre_selection=selector_0,
def test_permutation_importance_based_iterative_selector():
    """Run a two-level AutoML with a permutation-importance iterative selector.

    Loads the sampled application CSV, derives date columns, fits a two-level
    AutoML whose first level uses ``NpIterativeFeatureSelector`` as
    pre-selection, logs scores and feature importances, and finally checks
    that the fitted object can be pickled, reloaded and used for prediction.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    logging.basicConfig(format="[%(asctime)s] (%(levelname)s): %(message)s", level=logging.DEBUG)

    logging.debug("Load data...")
    data = pd.read_csv("./examples/data/sampled_app_train.csv")
    logging.debug("Data loaded")

    logging.debug("Features modification from user side...")
    # Day offsets are turned into date strings relative to 2018-01-01.
    data["BIRTH_DATE"] = (
        np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))
    ).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before conversion.
    data["EMP_DATE"] = (
        np.datetime64("2018-01-01")
        + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]"))
    ).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data["constant"] = 1
    data["allnan"] = np.nan

    data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)
    logging.debug("Features modification finished")

    logging.debug("Split data...")
    train_data, test_data = train_test_split(
        data, test_size=2000, stratify=data["TARGET"], random_state=13
    )
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        "Data splitted. Parts sizes: train_data = {}, test_data = {}".format(
            train_data.shape, test_data.shape
        )
    )

    logging.debug("Create task...")
    task = Task("binary")
    logging.debug("Task created")

    logging.debug("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug("Reader created")

    # selector parts
    logging.debug("Create feature selector")
    model0 = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "seed": 42,
            "num_threads": 5,
        }
    )
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(
        pipe0, model0, pie, feature_group_size=1, max_features_cnt_in_result=15
    )
    logging.debug("Feature selector created")

    # pipeline 1 level parts
    logging.debug("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner1 and Model1...")
    model1 = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 128,
            "seed": 1,
            "num_threads": 5,
        }
    )
    logging.debug("\t Tuner1 and model1 created")

    logging.debug("\t ParamsTuner2 and Model2...")
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(
        default_params={
            "learning_rate": 0.025,
            "num_leaves": 64,
            "seed": 2,
            "num_threads": 5,
        }
    )
    logging.debug("\t Tuner2 and model2 created")

    logging.debug("\t Pipeline1...")
    # model1 is used untuned; only model2 is paired with a tuner.
    pipeline_lvl1 = MLPipeline(
        [model1, (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug("Pipeline1 created")

    # pipeline 2 level parts
    logging.debug("Start creation pipeline_2...")
    pipe1 = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner and Model...")
    model = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "max_bin": 1024,
            "seed": 3,
            "num_threads": 5,
        },
        freeze_defaults=True,
    )
    logging.debug("\t Tuner and model created")

    logging.debug("\t Pipeline2...")
    pipeline_lvl2 = MLPipeline(
        [model], pre_selection=None, features_pipeline=pipe1, post_selection=None
    )
    logging.debug("Pipeline2 created")

    logging.debug("Create AutoML pipeline...")
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )
    logging.debug("AutoML pipeline created...")

    logging.debug("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"})
    logging.debug(
        "AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(
            time.time() - start_time
        )
    )

    logging.debug("Feature importances of selector:\n{}".format(selector.get_features_score()))

    logging.debug("oof_pred:\n{}\nShape = {}".format(oof_pred, oof_pred.shape))

    logging.debug(
        "Feature importances of top level algorithm:\n{}".format(
            automl.levels[-1][0].ml_algos[0].get_features_score()
        )
    )
    logging.debug(
        "Feature importances of lowest level algorithm - model 0:\n{}".format(
            automl.levels[0][0].ml_algos[0].get_features_score()
        )
    )
    logging.debug(
        "Feature importances of lowest level algorithm - model 1:\n{}".format(
            automl.levels[0][0].ml_algos[1].get_features_score()
        )
    )

    test_pred = automl.predict(test_data)
    logging.debug(
        "Prediction for test data:\n{}\nShape = {}".format(test_pred, test_pred.shape)
    )

    logging.debug("Check scores...")
    logging.debug(
        "OOF score: {}".format(roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0]))
    )
    logging.debug(
        "TEST score: {}".format(roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0]))
    )

    # Pickle round-trip. Cleanup lives in ``finally`` so a failure while
    # loading or predicting does not leave the file in the working directory.
    pickle_path = "automl.pickle"
    try:
        logging.debug("Pickle automl")
        with open(pickle_path, "wb") as f:
            pickle.dump(automl, f)

        logging.debug("Load pickled automl")
        with open(pickle_path, "rb") as f:
            automl = pickle.load(f)

        logging.debug("Predict loaded automl")
        test_pred = automl.predict(test_data)
        logging.debug(
            "TEST score, loaded: {}".format(
                roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])
            )
        )
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
print("Create task..") task = Task(**task_params) print("Task created") print("Create reader...") reader = PandasToPandasReader(task, cv=5, random_state=1) print("Reader created") # pipeline 1 level parts print("Start creation pipeline_1...") pipe = LGBSimpleFeatures() print("\t ParamsTuner2 and Model2...") model2 = BoostLGBM(default_params={ "learning_rate": 0.025, "num_leaves": 64, "seed": 2, "num_threads": 5, }) print("\t Tuner2 and model2 created") print("\t Pipeline1...") pipeline_lvl1 = MLPipeline( [model2], pre_selection=None, # selector, features_pipeline=pipe, post_selection=None, ) print("Pipeline1 created") print("Create AutoML pipeline...") automl = AutoML(
def test_permutation_importance_based_iterative_selector():
    """Run a two-level AutoML with a permutation-importance iterative selector.

    Loads the sampled application CSV, derives date columns, fits a two-level
    AutoML whose first level uses ``NpIterativeFeatureSelector`` as
    pre-selection, logs scores and feature importances, and finally checks
    that the fitted object can be pickled, reloaded and used for prediction.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Day offsets are turned into date strings relative to 2018-01-01.
    base_date = np.datetime64('2018-01-01')
    day_delta = np.dtype('timedelta64[D]')
    data['BIRTH_DATE'] = (base_date + data['DAYS_BIRTH'].astype(day_delta)).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before conversion.
    data['EMP_DATE'] = (
        base_date + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(day_delta)
    ).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    logging.debug('Create task...')
    task = Task('binary')
    logging.debug('Task created')

    logging.debug('Create reader...')
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug('Reader created')

    # selector parts
    logging.debug('Create feature selector')
    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': 5
    })
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0,
                                          model0,
                                          pie,
                                          feature_group_size=1,
                                          max_features_cnt_in_result=15)
    logging.debug('Feature selector created')

    # pipeline 1 level parts
    logging.debug('Start creation pipeline_1...')
    pipe = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner1 and Model1...')
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': 5
    })
    logging.debug('\t Tuner1 and model1 created')

    logging.debug('\t ParamsTuner2 and Model2...')
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': 5
    })
    logging.debug('\t Tuner2 and model2 created')

    logging.debug('\t Pipeline1...')
    # model1 is used untuned; only model2 is paired with a tuner.
    pipeline_lvl1 = MLPipeline([model1, (model2, params_tuner2)],
                               pre_selection=selector,
                               features_pipeline=pipe,
                               post_selection=None)
    logging.debug('Pipeline1 created')

    # pipeline 2 level parts
    logging.debug('Start creation pipeline_2...')
    pipe1 = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner and Model...')
    model = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': 5
    },
                      freeze_defaults=True)
    logging.debug('\t Tuner and model created')

    logging.debug('\t Pipeline2...')
    pipeline_lvl2 = MLPipeline([model],
                               pre_selection=None,
                               features_pipeline=pipe1,
                               post_selection=None)
    logging.debug('Pipeline2 created')

    logging.debug('Create AutoML pipeline...')
    automl = AutoML(reader, [
        [pipeline_lvl1],
        [pipeline_lvl2],
    ], skip_conn=False)
    logging.debug('AutoML pipeline created...')

    logging.debug('Start AutoML pipeline fit_predict...')
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={'target': 'TARGET'})
    logging.debug(
        'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
            time.time() - start_time))

    logging.debug('Feature importances of selector:\n{}'.format(
        selector.get_features_score()))

    logging.debug('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    logging.debug('Feature importances of top level algorithm:\n{}'.format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))
    logging.debug(
        'Feature importances of lowest level algorithm - model 0:\n{}'.format(
            automl.levels[0][0].ml_algos[0].get_features_score()))
    logging.debug(
        'Feature importances of lowest level algorithm - model 1:\n{}'.format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train_data['TARGET'].values, oof_pred.data[:, 0])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

    # Pickle round-trip. Cleanup lives in ``finally`` so a failure while
    # loading or predicting does not leave the file in the working directory.
    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
def test_manual_pipeline():
    """Build a dataset, selector and two-model pipeline by hand, then fit, predict and pickle.

    Reads a fixed subset of columns from the sampled application CSV, assigns
    random fold ids, declares column roles explicitly, fits an
    importance-cutoff selector, then fits an ``MLPipeline`` of two tuned
    boosted models and checks that it still predicts after a pickle round-trip.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    # Fold ids below come from np.random; seed it so runs are reproducible
    # (same seed the sibling tests in this file use).
    np.random.seed(42)

    # Read data from file
    logging.debug('Read data from file')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv',
                       usecols=[
                           'TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
                           'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE', 'DAYS_BIRTH',
                           'DAYS_EMPLOYED'
                       ])

    # Fix dates and convert to date type
    logging.debug('Fix dates and convert to date type')
    data['BIRTH_DATE'] = np.datetime64(
        '2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    # Positive DAYS_EMPLOYED values are clipped to 0 before conversion.
    data['EMP_DATE'] = np.datetime64('2018-01-01') + np.clip(
        data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Create folds: each row gets a random fold id in [0, 5)
    logging.debug('Create folds')
    data['__fold__'] = np.random.randint(0, 5, len(data))

    # Print data head
    logging.debug('Print data head')
    logging.debug('\n{}'.format(data.head()))

    # Set roles for columns
    logging.debug('Set roles for columns')
    check_roles = {
        TargetRole(): 'TARGET',
        CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
        NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
        DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
        FoldsRole(): '__fold__'
    }

    # create Task
    task = Task('binary')

    # Creating PandasDataSet
    logging.debug('Creating PandasDataset')
    start_time = time.time()
    pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)
    logging.debug(
        'PandasDataset created. Time = {:.3f} sec'.format(time.time() -
                                                          start_time))

    # Print pandas dataset feature roles
    logging.debug('Print pandas dataset feature roles')
    roles = pd_dataset.roles
    for role in roles:
        logging.debug('{}: {}'.format(role, roles[role]))

    # Feature selection part
    logging.debug('Feature selection part')
    selector_iterator = FoldsIterator(pd_dataset, 1)
    logging.debug('Selection iterator created')

    pipe = LGBSimpleFeatures()
    logging.debug('Pipe and model created')

    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 0,
        'num_threads': 5
    })
    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)

    start_time = time.time()
    selector.fit(selector_iterator)
    logging.debug(
        'Feature selector fitted. Time = {:.3f} sec'.format(time.time() -
                                                            start_time))

    logging.debug('Feature selector scores:')
    logging.debug('\n{}'.format(selector.get_features_score()))

    # Build AutoML pipeline
    logging.debug('Start building AutoML pipeline')
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe created')

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128
    })
    logging.debug('Tuner1 and model1 created')

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64
    })
    logging.debug('Tuner2 and model2 created')

    total = MLPipeline([(model1, params_tuner1), (model2, params_tuner2)],
                       pre_selection=selector,
                       features_pipeline=pipe,
                       post_selection=None)
    logging.debug('Finished building AutoML pipeline')

    # Create full train iterator
    logging.debug('Full train valid iterator creation')
    train_valid = FoldsIterator(pd_dataset)
    logging.debug('Full train valid iterator created')

    # Fit predict using pipeline
    logging.debug('Start AutoML pipeline fit_predict')
    start_time = time.time()
    pred = total.fit_predict(train_valid)
    logging.debug(
        'Fit_predict finished. Time = {:.3f} sec'.format(time.time() -
                                                         start_time))

    # Check preds
    logging.debug('Preds:')
    logging.debug('\n{}'.format(pred))
    logging.debug('Preds.shape = {}'.format(pred.shape))

    # Predict full train dataset
    logging.debug('Predict full train dataset')
    start_time = time.time()
    train_pred = total.predict(pd_dataset)
    logging.debug('Predict finished. Time = {:.3f} sec'.format(time.time() -
                                                               start_time))
    logging.debug('Preds:')
    logging.debug('\n{}'.format(train_pred))
    logging.debug('Preds.shape = {}'.format(train_pred.shape))

    # Pickle round-trip. Cleanup lives in ``finally`` so a failure while
    # loading or predicting does not leave the file in the working directory.
    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(total, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            total = pickle.load(f)

        logging.debug('Predict loaded automl')
        train_pred = total.predict(pd_dataset)
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)

    # Check preds feature names
    logging.debug('Preds features: {}'.format(train_pred.features))

    # Check model feature scores (read from the in-memory models, not the
    # reloaded pipeline)
    logging.debug('Feature scores for model_1:\n{}'.format(
        model1.get_features_score()))
    logging.debug('Feature scores for model_2:\n{}'.format(
        model2.get_features_score()))
def test_boostlgbm_and_linearlbfgs_in_one_automl_pipeline():
    """Combine boosted and linear pipelines in one two-level AutoML and round-trip it via pickle.

    Level one holds three pipelines (two boosted models behind an
    importance-cutoff selector, an LBFGS linear model, and an L1 linear model
    behind an iterative permutation-importance selector). Level two is a
    single LBFGS linear model. Predictions are combined with ``MeanBlender``.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Day offsets are turned into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (np.datetime64('2018-01-01') +
                          data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before conversion.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                        ).astype(str)

    # Same date on every row; given a base-date role when fitting below.
    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    logging.debug('Data splitted. Parts sizes: train_data = {}, test_data = {}'
                  .format(train.shape, test.shape))

    logging.debug('Start creation selector_0...')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM()
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0)
    logging.debug('End creation selector_0...')

    logging.debug('Start creation gbm_0...')
    feats_gbm_0 = LGBAdvancedPipeline()
    gbm_0 = BoostLGBM()
    gbm_1 = BoostLGBM()
    tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
    # gbm_0 is tuned; gbm_1 is passed without a tuner.
    gbm_lvl0 = MLPipeline([
        (gbm_0, tuner_0),
        gbm_1
    ],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None)
    logging.debug('End creation gbm_0...')

    logging.debug('Start creation reg_0...')
    feats_reg_0 = LinearFeatures(output_categories=True)
    reg_0 = LinearLBFGS()
    reg_lvl0 = MLPipeline([
        reg_0
    ],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=HighCorrRemoval(corr_co=1))
    logging.debug('End creation reg_0...')

    logging.debug('Start creation composed selector...')
    feat_sel_1 = LGBSimpleFeatures()
    mod_sel_1 = BoostLGBM()
    imp_sel_1 = NpPermutationImportanceEstimator()
    selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1)
    logging.debug('End creation composed selector...')

    logging.debug('Start creation reg_l1_0...')
    feats_reg_1 = LinearFeatures(output_categories=False)
    reg_1 = LinearL1CD()
    reg_l1_lvl0 = MLPipeline([
        reg_1
    ],
        pre_selection=selector_1,
        features_pipeline=feats_reg_1,
        post_selection=HighCorrRemoval())
    logging.debug('End creation reg_l1_0...')

    logging.debug('Start creation blending...')
    feats_reg_2 = LinearFeatures(output_categories=True)
    reg_2 = LinearLBFGS()
    reg_lvl1 = MLPipeline([
        reg_2
    ],
        pre_selection=None,
        features_pipeline=feats_reg_2,
        post_selection=HighCorrRemoval(corr_co=1))
    logging.debug('End creation blending...')

    logging.debug('Start creation automl...')
    reader = PandasToPandasReader(Task('binary'), samples=None, max_nan_rate=1, max_constant_rate=1)

    automl = AutoML(reader, [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ], skip_conn=False, blender=MeanBlender())
    logging.debug('End creation automl...')

    logging.debug('Start fit automl...')
    roles = {
        'target': 'TARGET',
        DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
    }
    oof_pred = automl.fit_predict(train, roles=roles)
    logging.debug('End fit automl...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'
                  .format(test_pred, test_pred.shape))

    # Score OOF only on rows that have at least one non-NaN prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    # Pickle round-trip. Cleanup lives in ``finally`` so a failure while
    # loading or predicting does not leave the file in the working directory.
    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
print("Data splitted. Parts sizes: train_data = {}, test_data = {}".format( train_data.shape, test_data.shape)) print("Create task..") task = Task("binary") print("Task created") print("Create reader...") reader = PandasToPandasReader(task, cv=5, random_state=1) print("Reader created") # selector parts print("Create feature selector") model01 = BoostLGBM(default_params={ "learning_rate": 0.05, "num_leaves": 64, "seed": 42, "num_threads": 5, }) model02 = BoostLGBM(default_params={ "learning_rate": 0.05, "num_leaves": 64, "seed": 42, "num_threads": 5, }) pipe0 = LGBSimpleFeatures() pie = NpPermutationImportanceEstimator() pie1 = ModelBasedImportanceEstimator() sel1 = ImportanceCutoffSelector(pipe0, model01, pie1, cutoff=0) sel2 = NpIterativeFeatureSelector(pipe0, model02, pie,
pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task) print("PandasDataset created. Time = {:.3f} sec".format(time.time() - start_time)) # # Print pandas dataset feature roles print("Print pandas dataset feature roles") roles = pd_dataset.roles for role in roles: print("{}: {}".format(role, roles[role])) # # Feature selection part print("Feature selection part") selector_iterator = FoldsIterator(pd_dataset, 1) print("Selection iterator created") model = BoostLGBM() pipe = LGBSimpleFeatures() print("Pipe and model created") model0 = BoostLGBM(default_params={ "learning_rate": 0.05, "num_leaves": 64, "seed": 0, "num_threads": 5, }) mbie = ModelBasedImportanceEstimator() selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10) start_time = time.time() selector.fit(selector_iterator) print("Feature selector fitted. Time = {:.3f} sec".format(time.time() -
data["allnan"] = np.nan data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) data["TARGET"] = np.where( np.random.rand(data.shape[0]) > 0.5, 2, data["TARGET"].values) train, test = train_test_split(data, test_size=2000, random_state=42) # ====================================================================================== print("Create timer...") timer = PipelineTimer(600, mode=2) print("Timer created...") # ====================================================================================== print("Create selector...") timer_gbm = timer.get_task_timer("gbm") feat_sel_0 = LGBSimpleFeatures() mod_sel_0 = BoostLGBM(timer=timer_gbm) imp_sel_0 = ModelBasedImportanceEstimator() selector_0 = ImportanceCutoffSelector( feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0, ) print("Selector created...") # ====================================================================================== print("Create gbms...") feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, output_categories=True, feats_imp=imp_sel_0) timer_gbm_0 = timer.get_task_timer("gbm") timer_gbm_1 = timer.get_task_timer("gbm")
def test_different_losses_and_metrics():
    """Fit a one-model AutoML under several task/loss/metric settings.

    For each ``(task_params, target)`` pair a fresh ``Task``, reader, model
    and AutoML are built, fitted on the train split, scored with the task's
    own metric, then pickled, reloaded and scored again.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Day offsets are turned into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before conversion.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    # (Task keyword arguments, target column) for every scenario.
    task_configs = [
        ({'name': 'binary'}, 'TARGET'),
        ({'name': 'binary', 'metric': roc_auc_score}, 'TARGET'),
        ({'name': 'reg', 'loss': 'mse', 'metric': 'r2'}, 'AMT_CREDIT'),
        ({'name': 'reg', 'loss': 'rmsle', 'metric': 'rmsle'}, 'AMT_CREDIT'),
        (
            {
                'name': 'reg',
                'loss': 'quantile',
                'loss_params': {'q': .9},
                'metric': 'quantile',
                'metric_params': {'q': .9},
            },
            'AMT_CREDIT',
        ),
    ]

    for task_params, target in task_configs:
        logging.debug('Create task..')
        task = Task(**task_params)
        logging.debug('Task created')

        logging.debug('Create reader...')
        reader = PandasToPandasReader(task, cv=5, random_state=1)
        logging.debug('Reader created')

        # pipeline 1 level parts
        logging.debug('Start creation pipeline_1...')
        pipe = LGBSimpleFeatures()

        logging.debug('\t ParamsTuner2 and Model2...')
        model2 = BoostLGBM(default_params={
            'learning_rate': 0.025,
            'num_leaves': 64,
            'seed': 2,
            'num_threads': 5
        })
        logging.debug('\t Tuner2 and model2 created')

        logging.debug('\t Pipeline1...')
        pipeline_lvl1 = MLPipeline(
            [model2],
            pre_selection=None,  # selector,
            features_pipeline=pipe,
            post_selection=None)
        logging.debug('Pipeline1 created')

        logging.debug('Create AutoML pipeline...')
        automl = AutoML(reader, [
            [pipeline_lvl1],
        ], skip_conn=False)
        logging.debug('AutoML pipeline created...')

        logging.debug('Start AutoML pipeline fit_predict...')
        start_time = time.time()
        oof_pred = automl.fit_predict(train_data, roles={'target': target})
        logging.debug(
            'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
                time.time() - start_time))

        test_pred = automl.predict(test_data)
        logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
            test_pred, test_pred.shape))

        logging.debug('Check scores...')
        logging.debug('OOF score: {}'.format(
            task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
        logging.debug('TEST score: {}'.format(
            task.metric_func(test_data[target].values, test_pred.data[:, 0])))

        # Pickle round-trip. Cleanup lives in ``finally`` so a failure in one
        # scenario does not leave the file in the working directory.
        pickle_path = 'automl.pickle'
        try:
            logging.debug('Pickle automl')
            with open(pickle_path, 'wb') as f:
                pickle.dump(automl, f)

            logging.debug('Load pickled automl')
            with open(pickle_path, 'rb') as f:
                automl = pickle.load(f)

            logging.debug('Predict loaded automl')
            test_pred = automl.predict(test_data)
            # Score the reloaded model with this scenario's metric and target.
            # Previously ROC AUC against 'TARGET' was used unconditionally,
            # which is the wrong metric and column for the regression
            # scenarios trained on 'AMT_CREDIT'.
            logging.debug('TEST score, loaded: {}'.format(
                task.metric_func(test_data[target].values, test_pred.data[:, 0])))
        finally:
            if os.path.exists(pickle_path):
                os.remove(pickle_path)
def test_timer_blender_multiclass():
    """Fit a timed, blended multiclass AutoML and round-trip it via pickle.

    The binary ``TARGET`` column is turned into three classes by overwriting
    a random subset of rows with class 2. One level with a boosted pipeline
    and a linear pipeline is fitted under a shared ``PipelineTimer`` and
    combined with ``WeightedBlender``.

    The test has no explicit assertions; it passes when every step runs
    without raising.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Day offsets are turned into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (np.datetime64('2018-01-01') +
                          data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before conversion.
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                        ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns: one constant, one entirely NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Rows where the random draw exceeds 0.5 are relabelled as class 2,
    # giving the three classes 0, 1 and 2.
    data['TARGET'] = np.where(np.random.rand(data.shape[0]) > .5, 2, data['TARGET'].values)

    train, test = train_test_split(data, test_size=2000, random_state=42)
    # ======================================================================================
    logging.debug('Create timer...')
    timer = PipelineTimer(600, mode=2)
    logging.debug('Timer created...')
    # ======================================================================================
    logging.debug('Create selector...')
    timer_gbm = timer.get_task_timer('gbm')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0, )
    logging.debug('Selector created...')
    # ======================================================================================
    logging.debug('Create gbms...')
    # The selector's importance estimator is also handed to the feature pipeline.
    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, output_categories=True, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer('gbm')
    timer_gbm_1 = timer.get_task_timer('gbm')

    gbm_0 = BoostLGBM(timer=timer_gbm_0)
    gbm_1 = BoostLGBM(timer=timer_gbm_1)

    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    # gbm_0 is tuned; gbm_1 is passed without a tuner.
    gbm_lvl0 = MLPipeline([
        (gbm_0, tuner_0),
        gbm_1
    ],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None)
    logging.debug('Gbms created...')
    # ======================================================================================
    logging.debug('Create linear...')
    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe='auto')

    timer_reg = timer.get_task_timer('reg')
    reg_0 = LinearLBFGS(timer=timer_reg)

    reg_lvl0 = MLPipeline([
        reg_0
    ],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=None)
    logging.debug('Linear created...')
    # ======================================================================================
    logging.debug('Create reader...')
    reader = PandasToPandasReader(Task('multiclass', metric='crossentropy',
                                       # metric_params = {'multi_class': 'ovr'}
                                       ),
                                  samples=None, max_nan_rate=1, max_constant_rate=1,
                                  advanced_roles=True, drop_score_co=-1, n_jobs=1)
    logging.debug('Reader created...')
    # ======================================================================================
    logging.debug('Create blender...')
    blender = WeightedBlender()
    logging.debug('Blender created...')
    # ======================================================================================
    logging.debug('Create AutoML...')
    automl = AutoML(reader=reader, levels=[
        [gbm_lvl0, reg_lvl0]
    ], timer=timer, blender=blender, skip_conn=False)
    logging.debug('AutoML created...')
    # ======================================================================================
    logging.debug('Fit predict...')
    oof_pred = automl.fit_predict(train, roles={'target': "TARGET"})
    logging.debug('Finnished fitting...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'
                  .format(test_pred, test_pred.shape))
    # ======================================================================================
    # Score OOF only on rows that have at least one non-NaN prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        log_loss(train['TARGET'].values[not_nan], oof_pred.data[not_nan, :])))
    logging.debug('TEST score: {}'.format(log_loss(test['TARGET'].values, test_pred.data)))
    # ======================================================================================
    # Pickle round-trip. Cleanup lives in ``finally`` so a failure while
    # loading or predicting does not leave the file in the working directory.
    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)
        logging.debug('TEST score, loaded: {}'.format(
            log_loss(test['TARGET'].values, test_pred.data)))
    finally:
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
    # ======================================================================================
    for dat, df, name in zip([oof_pred, test_pred], [train, test], ['train', 'test']):
        logging.debug('Check aucs {0}...'.format(name))
        # Skip rows containing NaN predictions, as the log-loss check above
        # does for OOF; with no NaN rows this mask keeps everything.
        valid = ~np.isnan(dat.data).any(axis=1)
        labels = df['TARGET'].values[valid]
        for c in range(3):
            _sc = roc_auc_score((labels == c).astype(np.float32), dat.data[valid, c])
            logging.debug('Cl {0} auc score: {1}'.format(c, _sc))
    # ======================================================================================