def _create_bankdata_experiment(predefined_kwargs, maker=None, need_test=False, user_kwargs=None):
    """Build a binary-classification experiment over a 2000-row sample of the bank dataset.

    :param predefined_kwargs: base kwargs dict; mutated in place with defaults and fixed options.
    :param maker: optional experiment factory; defaults to ``make_experiment(PlainModel, ...)``.
    :param need_test: when True, pass the hold-out split as ``test_data``.
    :param user_kwargs: optional extra kwargs merged last (highest priority).
    :return: the experiment produced by *maker*.
    """
    target = 'y'
    df = dsutils.load_bank().head(2000)
    df[target] = LabelEncoder().fit_transform(df[target])
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=9527)

    def maker_(*args, **kwargs):
        # Default factory: plain make_experiment over PlainModel.
        return make_experiment(PlainModel, *args, **kwargs)

    default_kwargs = dict(log_level='info')
    predefined_kwargs.update(default_kwargs)
    if maker is None:
        maker = maker_

    # Fixed experiment configuration: LR-only search space + multi-label encoding.
    predefined_kwargs['search_space'] = PlainSearchSpace(enable_lr=True, enable_nn=False,
                                                         enable_dt=False, enable_dtr=False)
    predefined_kwargs['hyper_model_options'] = {'transformer': MultiLabelEncoder}
    if need_test:
        predefined_kwargs['test_data'] = df_test

    # BUGFIX: dict.update(None) raises TypeError; only merge when user_kwargs was supplied.
    if user_kwargs:
        predefined_kwargs.update(user_kwargs)

    return maker(df_train, target=target, task=const.TASK_BINARY, **predefined_kwargs)
def test_experiment_with_blood_full_features():
    """End-to-end run over the blood dataset with (nearly) every pipeline feature enabled."""
    target = 'Class'
    data = dsutils.load_blood()
    train_part, holdout = train_test_split(data, train_size=0.8, random_state=335)
    holdout.pop(target)

    # NOTE(review): the full frame `data` (not `train_part`) is fed to the experiment,
    # mirroring the original test — confirm the overlap with test_data is intended.
    options = dict(
        target=target,
        search_space=PlainSearchSpace(),
        test_data=holdout,
        feature_generation=True,
        collinearity_detection=True,
        drift_detection=True,
        feature_selection=True,
        down_sample_search=True,
        down_sample_search_size=0.2,
        feature_reselection=True,
        pseudo_labeling=True,
        random_state=335,
        early_stopping_time_limit=1200,
        # log_level='info',
    )
    experiment = make_experiment(PlainModel, data, **options)

    model = experiment.run(max_trials=3)
    print(model)
    assert model is not None

    fitted_step_names = [s[0] for s in model.steps]
    assert fitted_step_names == [StepNames.DATA_CLEAN,
                                 StepNames.MULITICOLLINEARITY_DETECTION,
                                 'estimator']
def _create_experiment(predefined_kwargs, maker=None, need_test=False, user_kwargs=None):
    """Build a regression experiment on boston with synthetic Constant/Id/Drifted columns.

    The 'Drifted' column is scaled x100 in the test split to trigger drift detection.

    :param predefined_kwargs: base kwargs dict; mutated in place with defaults and fixed options.
    :param maker: optional experiment factory; defaults to ``make_experiment(PlainModel, ...)``
                  with ``random_state=1234`` when the caller did not set one.
    :param need_test: when True, pass the hold-out split as ``test_data``.
    :param user_kwargs: optional extra kwargs merged last (highest priority).
    :return: the experiment produced by *maker*.
    """
    df = dsutils.load_boston()
    df['Constant'] = [0 for i in range(df.shape[0])]
    df['Id'] = [i for i in range(df.shape[0])]
    target = 'target'
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=1234)
    df_test.pop(target)
    # Train/test 'Drifted' distributions differ by x100 to make drift detectable.
    df_train['Drifted'] = np.random.random(df_train.shape[0])
    df_test['Drifted'] = np.random.random(df_test.shape[0]) * 100

    def maker_(*args, **kwargs):
        # Default factory: pin random_state unless the caller provided one.
        if 'random_state' not in kwargs.keys():
            kwargs['random_state'] = 1234
        return make_experiment(PlainModel, *args, **kwargs)

    default_kwargs = dict(
        log_level='info',
    )
    predefined_kwargs.update(default_kwargs)
    if maker is None:
        maker = maker_

    # Fixed configuration: decision-tree-regressor-only search space.
    predefined_kwargs['search_space'] = PlainSearchSpace(enable_lr=False, enable_nn=False,
                                                         enable_dt=False, enable_dtr=True)
    if need_test:
        predefined_kwargs['test_data'] = df_test

    # BUGFIX: dict.update(None) raises TypeError; only merge when user_kwargs was supplied.
    if user_kwargs:
        predefined_kwargs.update(user_kwargs)

    return maker(df_train, target=target, task=const.TASK_REGRESSION, **predefined_kwargs)
def test_creator(self):
    """Build an experiment through BloodDatasetJobEngine, run two trials, check the report."""
    make_options = dict(
        test_data=dsutils.load_blood(),
        eval_data=dsutils.load_blood(),
        task=const.TASK_BINARY,
        target="Class",
        feature_selection=True,
        feature_selection_strategy="threshold",
        evaluation_metrics="auto",
        evaluation_persist_prediction=True,
        report_render='excel',
        search_space=PlainSearchSpace(),
    )
    job_working_dir = common_util.get_temp_dir_path(prefix="hyn_job_creator_test_")

    exp = BloodDatasetJobEngine().create_experiment_with_params(make_options, job_working_dir)
    assert exp
    assert isinstance(exp, CompeteExperiment)

    exp.run(max_trials=2)
    # The engine should have written an Excel report into the working directory.
    assert (Path(job_working_dir) / "report.xlsx").exists()
def test_experiment_with_blood_simple():
    """Smoke test: minimal PlainModel experiment on the blood dataset."""
    blood_df = dsutils.load_blood()
    exp = make_experiment(PlainModel, blood_df,
                          target='Class',
                          search_space=PlainSearchSpace())
    model = exp.run(max_trials=3)
    print(model)
    assert model is not None
def create_hyper_model(reward_metric='auc', optimize_direction='max'):
    """Return a PlainModel wired to a random searcher over the default search space."""
    searcher = make_searcher('random',
                             search_space_fn=PlainSearchSpace(),
                             optimize_direction=optimize_direction)
    return PlainModel(searcher=searcher, reward_metric=reward_metric, callbacks=[])
def create_plain_model(reward_metric='auc', optimize_direction='max', with_encoder=False, with_dask=False):
    """Build a PlainModel (or DaskPlainModel) over a dt+lr search space.

    :param with_encoder: attach MultiLabelEncoder as the transformer when True.
    :param with_dask: use DaskPlainModel instead of PlainModel when True.
    """
    space = PlainSearchSpace(enable_dt=True, enable_lr=True, enable_nn=False)
    searcher = make_searcher('random',
                             search_space_fn=space,
                             optimize_direction=optimize_direction)
    model_cls = DaskPlainModel if with_dask else PlainModel
    return model_cls(searcher=searcher,
                     reward_metric=reward_metric,
                     callbacks=[SummaryCallback()],
                     transformer=MultiLabelEncoder if with_encoder else None)
def maker(df_train, target, df_eval, file_path):
    """Create a drift-detecting experiment that renders an Excel report to *file_path*."""
    opts = dict(
        target=target,
        test_data=df_eval.copy(),
        drift_detection_threshold=0.4,
        drift_detection_min_features=3,
        drift_detection_remove_size=0.5,
        search_space=PlainSearchSpace(enable_lr=False, enable_nn=False),
        report_render='excel',
        report_render_options={'file_path': file_path},
    )
    return make_experiment(PlainModel, df_train, **opts)
def maker(df_train, target, df_eval, file_path):
    """Create a drift-detecting experiment rendered via an explicit ExcelReportRender."""
    from hypernets.experiment.report import ExcelReportRender

    render = ExcelReportRender(file_path)
    return make_experiment(
        PlainModel, df_train,
        target=target,
        eval_data=df_eval,
        test_data=df_eval.copy(),
        drift_detection_threshold=0.4,
        drift_detection_min_features=3,
        drift_detection_remove_size=0.5,
        search_space=PlainSearchSpace(enable_lr=False, enable_nn=False),
        report_render=render,
    )
def test_regression_task_report():
    """Regression experiment with synthetic drift; verify Excel report and callback metadata."""
    df = dsutils.load_boston()
    n_rows = df.shape[0]
    df['Constant'] = [0] * n_rows
    df['Id'] = list(range(n_rows))
    target = 'target'

    df_train, df_eval = train_test_split(df, test_size=0.2)
    # Eval 'Drifted' values are scaled x100 versus train to trigger drift detection.
    df_train['Drifted'] = np.random.random(df_train.shape[0])
    df_eval['Drifted'] = np.random.random(df_eval.shape[0]) * 100

    file_path = common_util.get_temp_file_path(prefix="report_excel_", suffix=".xlsx")
    print(file_path)

    experiment = make_experiment(
        PlainModel, df_train,
        target=target,
        eval_data=df_eval.copy(),
        test_data=df_eval.copy(),
        drift_detection_threshold=0.4,
        drift_detection_min_features=3,
        drift_detection_remove_size=0.5,
        search_space=PlainSearchSpace(enable_lr=False, enable_nn=False,
                                      enable_dt=False, enable_dtr=True),
        report_render='excel',
        report_render_options={'file_path': file_path},
    )
    estimator = experiment.run(max_trials=3)
    assert estimator is not None

    callbacks = experiment.callbacks
    mlr_callback = next((cb for cb in callbacks if isinstance(cb, MLReportCallback)), None)
    mle_callback = next((cb for cb in callbacks if isinstance(cb, MLEvaluateCallback)), None)

    assert mlr_callback is not None
    _experiment_meta: ExperimentMeta = mlr_callback.experiment_meta_
    assert len(_experiment_meta.resource_usage) > 0
    assert len(_experiment_meta.steps) == 5
    assert os.path.exists(file_path)

    assert mle_callback is not None
    assert _experiment_meta.evaluation_metric is not None
    assert len(_experiment_meta.prediction_stats) == 1
    assert len(_experiment_meta.datasets) == 3
def run_experiment(train_df, check_as_local=True, **kwargs):
    """Run a PlainModel experiment on *train_df*; optionally verify `as_local()` support.

    :param check_as_local: when True, assert the estimator converts to a local
        estimator that itself has no further `as_local` hook.
    """
    exp = make_experiment(PlainModel, train_df, search_space=PlainSearchSpace(), **kwargs)
    fitted = exp.run()
    print(exp.random_state, fitted)
    assert fitted is not None

    if check_as_local:
        assert hasattr(fitted, 'as_local')
        local_estimator = fitted.as_local()
        assert not hasattr(local_estimator, 'as_local')
def test_experiment_with_data_adaption():
    """Force the data-adaption step by capping memory at half the frame's usage."""
    encoded = MultiLabelEncoder().fit_transform(dsutils.load_bank())
    mem_usage = int(encoded.memory_usage().sum())

    experiment = make_experiment(
        PlainModel, encoded,
        target='y',
        search_space=PlainSearchSpace(),
        data_adaption_memory_limit=mem_usage // 2,
        log_level='info',
    )
    model = experiment.run(max_trials=3)
    assert model is not None
    # With the tight memory limit, data adaption must be the pipeline's first step.
    assert model.steps[0][0] == 'data_adaption'
def test_experiment_with_blood_down_sample():
    """Experiment on the blood dataset with down-sampled search enabled."""
    data = dsutils.load_blood()
    experiment = make_experiment(
        PlainModel, data,
        target='Class',
        search_space=PlainSearchSpace(),
        down_sample_search=True,
        down_sample_search_size=0.1,
        down_sample_search_time_limit=300,
        down_sample_search_max_trials=10,
        # log_level='info',
    )
    model = experiment.run(max_trials=3)
    print(model)
    assert model is not None
def main():
    """Train a regression experiment on the boston dataset and render an Excel report."""
    data = dsutils.load_boston()
    train_part, _eval_part = train_test_split(data, test_size=0.2)
    space = PlainSearchSpace(enable_lr=False, enable_nn=False,
                             enable_dt=False, enable_dtr=True)
    experiment = make_experiment(PlainModel, train_part,
                                 target='target',
                                 search_space=space,
                                 report_render='excel')
    estimator = experiment.run(max_trials=3)
    print(estimator)