Example #1
 def start_tuner(self, tuner: Tuner, hdl: dict):
     self.logger.info(f"Start fine tune task, \nwhich HDL(Hyperparams Descriptions Language) is:\n{hdl}")
     self.logger.info(f"which Tuner is:\n{tuner}")
     tuner.set_data_manager(self.data_manager)
     tuner.set_random_state(self.random_state)
     tuner.set_hdl(hdl)  # only to build tuner.shps (the config space)
     if estimate_config_space_numbers(tuner.shps) == 1:
         self.logger.info("HDL(Hyperparams Descriptions Language) is a constant space, using manual modeling.")
         dhp, self.estimator = tuner.shp2model(tuner.shps.sample_configuration())
         self.estimator.fit(self.data_manager.X_train, self.data_manager.y_train)
         return {"is_manual": True}
     n_jobs = tuner.n_jobs
     run_limits = [math.ceil(tuner.run_limit / n_jobs)] * n_jobs
     is_master_list = [False] * n_jobs
     is_master_list[0] = True
     initial_configs_list = get_chunks(
         tuner.design_initial_configs(n_jobs),
         n_jobs)
     random_states = np.arange(n_jobs) + self.random_state
     if n_jobs > 1 and tuner.search_method != "grid":
         sync_dict = Manager().dict()
         sync_dict["exit_processes"] = tuner.exit_processes
     else:
         sync_dict = None
     self.resource_manager.close_trials_db()
     self.resource_manager.clear_pid_list()
     self.resource_manager.close_redis()
     resource_managers = [deepcopy(self.resource_manager) for i in range(n_jobs)]
     tuners = [deepcopy(tuner) for i in range(n_jobs)]
     with joblib.parallel_backend(n_jobs=n_jobs, backend="multiprocessing"):
         joblib.Parallel()(
             joblib.delayed(self.run)
             (tuner, resource_manager, run_limit, initial_configs, is_master, random_state, sync_dict)
             for tuner, resource_manager, run_limit, initial_configs, is_master, random_state in
             zip(tuners, resource_managers, run_limits, initial_configs_list, is_master_list, random_states)
         )
     return {"is_manual": False}
Example #2
 def __init__(
         self,
         tuner: Union[Tuner, List[Tuner], None, dict] = None,
         hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor], None, dict] = None,
         resource_manager: Union[ResourceManager, str, None] = None,
         ensemble_builder: Union[StackEnsembleBuilder, None, bool, int] = None,
         random_state=42
 ):
     # ---logger------------------------------------
     self.logger = get_logger(__name__)
     # ---random_state-----------------------------------
     self.random_state = random_state
     # ---ensemble_builder-----------------------------------
     if ensemble_builder is None:
         self.logger.info("Using default StackEnsembleBuilder.")
         ensemble_builder = StackEnsembleBuilder()
     elif ensemble_builder is False:
         self.logger.info("Not using EnsembleBuilder, will select the best estimator.")
     else:
         ensemble_builder = StackEnsembleBuilder(set_model=ensemble_builder)
     self.ensemble_builder = ensemble_builder
     # ---tuners-----------------------------------
     if not tuner:
         tuner = Tuner()
     if not isinstance(tuner, (list, tuple)):
         tuner = [tuner]
     self.tuners: List[Tuner] = tuner
     # ---hdl_constructors-----------------------------------
     if not hdl_constructor:
         hdl_constructor = HDL_Constructor()
     if not isinstance(hdl_constructor, (list, tuple)):
         hdl_constructor = [hdl_constructor]
     self.hdl_constructors = hdl_constructor
     # ---resource_manager-----------------------------------
     if resource_manager is None:
         resource_manager = ResourceManager()
     self.resource_manager = resource_manager
     # ---member_variable------------------------------------
     self.estimator = None
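A minimal usage sketch of this constructor, assuming the Tuner and HDL_Constructor defaults shown in the other examples; ensemble_builder=False mirrors Example #9 and skips the StackEnsembleBuilder:

estimator = HyperFlowEstimator(
    tuner=Tuner(run_limit=12),
    hdl_constructor=HDL_Constructor(),
    ensemble_builder=False,  # keep the single best estimator instead of stacking
    random_state=42,
)
# a single Tuner / HDL_Constructor is wrapped into a one-element list internally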
Example #3
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=5,
    run_limit=12,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost|scale.standardize",
        "num->target": "reduce.pca|lightgbm"
    })
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
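This example stops before training; assuming the same fit interface used in Examples #4 and #5, the pipeline would be launched like this:

hyperflow_pipeline.fit(
    X=df_train, X_test=df_test, column_descriptions=column_descriptions
)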
Example #4
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=1,
    run_limit=100,
    n_jobs=1,
    search_method_params={"anneal_func": "lambda x:1*(1/(-(3*(x-1))))"}
)
hyperflow_pipeline = HyperFlowEstimator(tuner)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
hyperflow_pipeline.fit(
    X=df_train, X_test=df_test, column_descriptions=column_descriptions
)
Example #5
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
        "lowR_nan->nan": "impute.fill_abnormal",
        "highR_nan->nan": "operate.drop",
        "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "num->target": {
            "_name": "lightgbm",
            "_vanilla": True
        }
    })
tuner = Tuner(run_limit=-1, search_method="grid")
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}

hyperflow_pipeline.fit(X=df_train,
                       X_test=df_test,
                       column_descriptions=column_descriptions)
Example #6
            "lowR_nan->nan": "impute.fill_abnormal",
            "highR_nan->nan": "operate.drop",
            "all->{cat_name=cat,num_name=num}": "operate.split.cat_num",
            "cat->num": "encode.label",
            "num->num": {"_name": "<placeholder>",
                         "_select_percent": {"_type": "quniform", "_value": [1, 100, 0.5],
                                                       "_default": 80}},
            "num->target": {"_name": "lightgbm", "_vanilla": True}
        }
    ),
]

tuners = [
    Tuner(
        run_limit=-1,
        search_method="grid",
        n_jobs=3
    ),
    Tuner(
        run_limit=50,
        initial_runs=10,
        search_method="smac",
        n_jobs=3
    ),
]
hyperflow_pipeline = HyperFlowEstimator(tuners, hdl_constructors)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
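As in the other examples, the pipeline built from these tuners and hdl_constructors would presumably be trained with the same fit call; df_train and df_test are assumed to be prepared as in Examples #3 to #5:

hyperflow_pipeline.fit(
    X=df_train, X_test=df_test, column_descriptions=column_descriptions
)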
Example #7
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../examples/classification/train_classification.csv")
ss = ShuffleSplit(n_splits=1, random_state=0, test_size=0.25)
train_ix, test_ix = next(ss.split(df))
df_train = df.iloc[train_ix, :]
df_test = df.iloc[test_ix, :]

tuner = Tuner(
    initial_runs=30,
    run_limit=0,
)
hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "nan->imp": "impute.fill_abnormal",
        "imp->{cat_name=cat,num_name=num}": "operate.split.cat_num",
        "cat->num": "encode.cat_boost",
        "over_sample": [
            "balance.under_sample.all_knn",
            "balance.under_sample.cluster_centroids",
            "balance.under_sample.condensed_nearest_neighbour",
            "balance.under_sample.edited_nearest_neighbours",
            "balance.under_sample.instance_hardness_threshold",
            "balance.under_sample.near_miss",
            "balance.under_sample.neighbourhood_cleaning_rule",
            "balance.under_sample.one_sided_selection",
Example #8
import pandas as pd

from hyperflow.estimator.base import HyperFlowEstimator
from hyperflow.hdl.hdl_constructor import HDL_Constructor
from hyperflow.tuner.tuner import Tuner

df = pd.read_csv("../data/QSAR.csv")

hdl_constructor = HDL_Constructor(
    DAG_descriptions={
        "num->var": "compress.variance",
        "var->pea": {
            "_name": "compress.pearson",
            "n_jobs": 6
        },
        "pea->target": "logistic_regression"
    })
tuner = Tuner(run_limit=12, initial_runs=12, search_method="smac")
hyperflow_pipeline = HyperFlowEstimator(tuner, hdl_constructor)
column_descriptions = {"id": "Name", "target": "labels"}

hyperflow_pipeline.fit(X=df, column_descriptions=column_descriptions, n_jobs=3)
Example #9
        if ind.any():
            temp = newfeature[:, i]
            a = temp[~np.isnan(temp)].mean()
            newfeature[:, i][np.isnan(temp)] = a

    # Standardize the features
    stdScale = StandardScaler().fit(newfeature)
    newfeaturenorm = stdScale.transform(newfeature)

    # Bin the labels into intervals
    bins = [-9, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 24]
    new_range = pd.cut(df.Label, bins)
    newlabel = np.array(df.Label)
    return newfeaturenorm, newlabel, new_range


x_train, y_train, y_range = data_preprocessing()

tuner = Tuner(
    initial_runs=12,
    run_limit=120,
)
hdl_constructor = HDL_Constructor(DAG_descriptions={"num->target": "lightgbm"})
resource_manager = ResourceManager(os.getcwd() + "/for_hxw_result")
hyperflow_pipeline = HyperFlowEstimator(tuner,
                                        hdl_constructor,
                                        ensemble_builder=False)

hyperflow_pipeline.fit(X=x_train, y=y_train, n_jobs=3)
joblib.dump(hyperflow_pipeline, "hyperflow_pipeline_for_hxw.bz")
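Because the fitted pipeline is persisted with joblib.dump, it can later be restored in another process with the matching joblib call:

loaded_pipeline = joblib.load("hyperflow_pipeline_for_hxw.bz")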