def __init__(self, threshold, n_jobs=1, max_delete=1):
    self.max_delete = max_delete
    self.to_delete = []
    self.threshold = threshold
    self.n_jobs = n_jobs
    self._type = "DataFrame"
    self.logger = get_logger(self)
def init_data(self, random_state, data_manager: DataManager, metric: Scorer, should_calc_all_metric: bool,
              splitter, should_store_intermediate_result: bool, resource_manager: ResourceManager):
    self.random_state = random_state
    if hasattr(splitter, "random_state"):
        setattr(splitter, "random_state", self.random_state)
    self.splitter = splitter
    self.data_manager = data_manager
    self.X_train = self.data_manager.X_train
    self.y_train = self.data_manager.y_train
    self.X_test = self.data_manager.X_test
    self.y_test = self.data_manager.y_test
    self.should_store_intermediate_result = should_store_intermediate_result
    self.metric = metric
    self.ml_task: MLTask = self.data_manager.ml_task
    self.should_calc_all_metric = should_calc_all_metric
    if self.ml_task.mainTask == "regression":
        self.predict_function = self._predict_regression
    else:
        self.predict_function = self._predict_proba
    self.logger = get_logger(self)
    self.resource_manager = resource_manager
def __init__(self):
    self.resource_manager = None
    self.estimator = None
    self.in_feature_groups = None
    self.out_feature_groups = None
    self.hyperparams = {}
    self.logger = get_logger(self)
def __init__(
        self,
        X_train: Union[pd.DataFrame, GenericDataFrame, np.ndarray, None] = None,
        y_train: Union[pd.Series, np.ndarray, str, None] = None,
        X_test: Union[pd.DataFrame, GenericDataFrame, np.ndarray, None] = None,
        y_test: Union[pd.Series, np.ndarray, str, None] = None,
        dataset_metadata: Dict[str, Any] = frozenset(),
        column_descriptions: Dict[str, Union[List[str], str]] = None,
        highR_nan_threshold: float = 0.5,
):
    '''
    Parameters
    ----------
    X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_train: :class:`numpy.ndarray`
    X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_test: :class:`numpy.ndarray`
    dataset_metadata: dict
    column_descriptions: dict
        ``column_descriptions`` is a dict whose keys are ``feature_group`` names and whose values are
        a single column name or a list of column names.

        These are the most frequently-used built-in ``feature_group`` names:

        * ``id`` - id of this table.
        * ``ignore`` - columns which contain irrelevant information.
        * ``target`` - the column in the dataset your model will learn to predict.
        * ``nan`` - Not a Number, a column containing missing values.
        * ``num`` - numerical features, such as [1, 2, 3].
        * ``cat`` - categorical features, such as ["a", "b", "c"].
        * ``num_nan`` - numerical features containing missing values, such as [1, 2, NaN].
        * ``cat_nan`` - categorical features containing missing values, such as ["a", "b", NaN].
        * ``highR_nan`` - high NaN ratio. See the explanation in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
        * ``lowR_nan`` - low NaN ratio. See the explanation in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
        * ``highR_cat`` - categorical features with a high cardinality ratio. See the explanation in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
        * ``lowR_cat`` - categorical features with a low cardinality ratio. See the explanation in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.

    highR_nan_threshold: float
        Threshold above which a column's NaN ratio is considered high. You can find examples and practice
        in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    '''
    self.logger = get_logger(self)
    dataset_metadata = dict(dataset_metadata)
    self.highR_nan_threshold = highR_nan_threshold
    self.dataset_metadata = dataset_metadata
    X_train = deepcopy(X_train)
    y_train = deepcopy(y_train)
    X_test = deepcopy(X_test)
    y_test = deepcopy(y_test)
    X_train, y_train, X_test, y_test, feature_groups, column2feature_groups = self.parse_column_descriptions(
        column_descriptions, X_train, y_train, X_test, y_test)
    self.feature_groups = feature_groups
    self.column2feature_groups = column2feature_groups
    self.ml_task: MLTask = get_ml_task_from_y(y_train)
    self.X_train = GenericDataFrame(X_train, feature_groups=feature_groups)
    self.y_train = y_train
    self.X_test = GenericDataFrame(X_test, feature_groups=feature_groups) if X_test is not None else None
    self.y_test = y_test if y_test is not None else None
    # todo: allow a user-defined validation set, e.g. specified via RandomShuffle or mlxtend
    # fixme: multilabel is not supported
    if len(y_train.shape) > 2:
        raise ValueError('y must not have more than two dimensions, '
                         'but has %d.' % len(y_train.shape))
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError('X and y must have the same number of '
                         'datapoints, but have %d and %d.' % (X_train.shape[0], y_train.shape[0]))
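# --- usage sketch (illustrative, not part of the original source) ------------------------
# Shows how ``column_descriptions`` maps feature groups to column names when building a
# DataManager. It assumes, as the ``target`` feature group suggests, that the label column
# may be supplied inside ``X_train``; the toy table itself is made up for illustration.
import numpy as np
import pandas as pd

X = pd.DataFrame({
    "PassengerId": [1, 2, 3, 4],
    "Sex": ["male", "female", "female", "male"],
    "Age": [22.0, np.nan, 26.0, 35.0],
    "Survived": [0, 1, 1, 0],
})
data_manager = DataManager(
    X_train=X,
    column_descriptions={
        "id": "PassengerId",
        "cat": ["Sex"],
        "num_nan": ["Age"],
        "target": "Survived",
    },
)
print(data_manager.ml_task)         # inferred from y_train by get_ml_task_from_y
print(data_manager.feature_groups)  # feature group assigned to each remaining column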
from copy import deepcopy
from typing import List, Union

import numpy as np
import pandas as pd
from pandas._typing import FrameOrSeries
from pandas.core.generic import bool_t

from autoflow.utils.logging import get_logger

logger = get_logger(__name__)


class GenericDataFrame(pd.DataFrame):
    def __init__(self, *args, **kwargs):
        if "feature_groups" in kwargs:
            feature_groups = kwargs.pop("feature_groups")
        else:
            feature_groups = None
        if "columns_metadata" in kwargs:
            columns_metadata = kwargs.pop("columns_metadata")
        else:
            columns_metadata = None
        super(GenericDataFrame, self).__init__(*args, **kwargs)
        if feature_groups is None:
            logger.debug("feature_groups is None, setting all columns to the 'cat' feature group.")
            feature_groups = ["cat"] * self.shape[1]
        assert len(feature_groups) == self.shape[1]
        self.set_feature_groups(pd.Series(feature_groups))
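# --- usage sketch (illustrative, not part of the original source) ------------------------
# Demonstrates the default shown above: without ``feature_groups`` every column falls into
# the 'cat' group, while an explicit list assigns one group per column (one entry per column,
# per the assert in __init__).
import pandas as pd

raw = pd.DataFrame({"Sex": ["male", "female"], "Age": [22.0, None]})

gdf_default = GenericDataFrame(raw)                                   # every column -> 'cat'
gdf_typed = GenericDataFrame(raw, feature_groups=["cat", "num_nan"])  # one group per column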
def test_pipeline(self):
    self.logger = get_logger(self)
    df = pd.read_csv("../examples/classification/train_classification.csv")
    y = df.pop("Survived").values
    df = df.loc[:, ["Sex", "Cabin", "Age"]]
    feature_groups = ["cat_nan", "cat_nan", "num_nan"]
    df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=10)
    df_train = GenericDataFrame(df_train, feature_groups=feature_groups)
    df_test = GenericDataFrame(df_test, feature_groups=feature_groups)
    cv = KFold(n_splits=5, random_state=10, shuffle=True)
    train_ix, valid_ix = next(cv.split(df_train))
    df_train, df_valid = df_train.split([train_ix, valid_ix])
    y_valid = y_train[valid_ix]
    y_train = y_train[train_ix]

    fill_cat = FillCat()
    fill_cat.in_feature_groups = "cat_nan"
    fill_cat.out_feature_groups = "cat"
    fill_cat.update_hyperparams({"strategy": "<NULL>"})

    fill_num = FillNum()
    fill_num.in_feature_groups = "num_nan"
    fill_num.out_feature_groups = "num"
    fill_num.update_hyperparams({"strategy": "median"})

    ohe = OneHotEncoder()
    ohe.in_feature_groups = "cat"
    ohe.out_feature_groups = "num"

    sgd = SGD()
    sgd.in_feature_groups = "num"
    sgd.update_hyperparams({"loss": "log", "random_state": 10})

    pipeline = GenericPipeline([
        ("fill_cat", fill_cat),
        ("fill_num", fill_num),
        ("ohe", ohe),
        ("sgd", sgd),
    ])
    pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
    pred_train = pipeline.predict(df_train)
    pred_test = pipeline.predict(df_test)
    pred_valid = pipeline.predict(df_valid)
    score_valid = pipeline.predict_proba(df_valid)
    self.logger.info(accuracy_score(y_train, pred_train))
    self.logger.info(accuracy_score(y_valid, pred_valid))
    self.logger.info(accuracy_score(y_test, pred_test))

    result = pipeline.procedure(constants.binary_classification_task, df_train, y_train,
                                df_valid, y_valid, df_test, y_test)
    pred_test = result["pred_test"]
    pred_valid = result["pred_valid"]
    self.logger.info(accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
    self.logger.info(accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

    pipeline = GenericPipeline([
        ("fill_cat", fill_cat),
        ("fill_num", fill_num),
        ("ohe", ohe),
    ])
    pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
    ret1 = pipeline.transform(df_train, df_valid, df_test)
    ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid, df_test, y_test)
    for key in ["X_train", "X_valid", "X_test"]:
        assert np.all(ret1[key] == ret2[key])

    pipeline = GenericPipeline([
        ("sgd", sgd),
    ])
    result = pipeline.procedure(constants.binary_classification_task, ret1["X_train"], y_train,
                                ret1["X_valid"], y_valid, ret1["X_test"], y_test)
    pred_test = result["pred_test"]
    pred_valid = result["pred_valid"]
    self.logger.info(accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
    self.logger.info(accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))
def __init__(self,
             store_path="~/autoflow",
             file_system="local",
             file_system_params=frozendict(),
             db_type="sqlite",
             db_params=frozendict(),
             redis_params=frozendict(),
             max_persistent_estimators=50,
             persistent_mode="fs",
             compress_suffix="bz2"):
    '''
    Parameters
    ----------
    store_path: str
        A path to store files belonging to AutoFlow, such as metadata, model files and database files.
    file_system: str
        Indicator string for which file system or storage system will be used.

        Available options are listed below:

        * ``local``
        * ``hdfs``
        * ``s3``

        ``local`` is the default value.
    file_system_params: dict
        Specific file_system configuration.
    db_type: str
        Indicator string for which database will be used.

        Available options are listed below:

        * ``sqlite``
        * ``postgresql``
        * ``mysql``

        ``sqlite`` is the default value.
    db_params: dict
        Specific database configuration.
    redis_params: dict
        Redis configuration.
    max_persistent_estimators: int
        Maximum number of models that can be persisted for a single task.

        If this number is exceeded, the worst-performing model file will be deleted,
        and the corresponding database record will also be deleted.
    persistent_mode: str
        Indicator string for which persistent mode will be used.

        Available options are listed below:

        * ``db`` - serialize the entity to bytes and store it in the database directly.
        * ``fs`` - serialize the entity to bytes, form a pickle file, and upload it to the storage system or save it locally.
    compress_suffix: str
        Suffix of the compressed file, default is ``bz2``.
    '''
    # --logger-------------------
    self.logger = get_logger(self)
    # --preprocessing------------
    file_system_params = dict(file_system_params)
    db_params = dict(db_params)
    redis_params = dict(redis_params)
    # ---file_system------------
    directory = os.path.split(generic_fs.__file__)[0]
    file_system2cls = find_components(generic_fs.__package__, directory, FileSystem)
    self.file_system_type = file_system
    if file_system not in file_system2cls:
        raise Exception(f"Invalid file_system {file_system}")
    self.file_system: FileSystem = file_system2cls[file_system](**file_system_params)
    if self.file_system_type == "local":
        store_path = os.path.expandvars(os.path.expanduser(store_path))
    self.store_path = store_path
    # ---data_base------------
    assert db_type in ("sqlite", "postgresql", "mysql")
    self.db_type = db_type
    self.db_params = dict(db_params)
    if db_type == "sqlite":
        assert self.file_system_type == "local"
    # ---redis----------------
    self.redis_params = redis_params
    # ---max_persistent_model---
    self.max_persistent_estimators = max_persistent_estimators
    # ---persistent_mode-------
    self.persistent_mode = persistent_mode
    assert self.persistent_mode in ("fs", "db")
    # ---compress_suffix------------
    self.compress_suffix = compress_suffix
    # ---post_process------------
    self.store_path = store_path
    self.file_system.mkdir(self.store_path)
    self.is_init_experiments_db = False
    self.is_init_tasks_db = False
    self.is_init_hdls_db = False
    self.is_init_trials_db = False
    self.is_init_redis = False
    self.is_master = False
    # --some specific path based on file_system---
    self.datasets_dir = self.file_system.join(self.store_path, "datasets")
    self.databases_dir = self.file_system.join(self.store_path, "databases")
    self.parent_trials_dir = self.file_system.join(self.store_path, "trials")
    self.parent_experiments_dir = self.file_system.join(self.store_path, "experiments")
    for dir_path in [self.datasets_dir, self.databases_dir,
                     self.parent_experiments_dir, self.parent_trials_dir]:
        self.file_system.mkdir(dir_path)
    # --db-----------------------------------------
    self.Datebase = get_db_class_by_db_type(self.db_type)
    # --JSONField-----------------------------------------
    if self.db_type == "sqlite":
        from playhouse.sqlite_ext import JSONField
        self.JSONField = JSONField
    elif self.db_type == "postgresql":
        from playhouse.postgres_ext import JSONField
        self.JSONField = JSONField
    elif self.db_type == "mysql":
        from playhouse.mysql_ext import JSONField
        self.JSONField = JSONField
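# --- configuration sketch (illustrative, not part of the original source) ----------------
# Two ways of configuring a ResourceManager based on the signature above. The PostgreSQL
# ``db_params`` keys shown here are an assumption about what the underlying database driver
# accepts, not something taken from this code.
rm_local = ResourceManager()  # defaults: sqlite database + local file system under ~/autoflow

rm_pg = ResourceManager(
    store_path="/data/autoflow",
    file_system="local",
    db_type="postgresql",
    db_params={"user": "autoflow", "password": "secret", "host": "127.0.0.1", "port": 5432},
    max_persistent_estimators=100,
    persistent_mode="fs",
)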
def __init__(self):
    self.ml_task = None
    self.logger = get_logger(__name__)
def __init__(self,
             tuner: Union[Tuner, List[Tuner], None, dict] = None,
             hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor], None, dict] = None,
             resource_manager: Union[ResourceManager, str] = None,
             random_state=42,
             log_file: str = None,
             log_config: Optional[dict] = None,
             highR_nan_threshold=0.5,
             highR_cat_threshold=0.5,
             **kwargs):
    '''
    Parameters
    ----------
    tuner: :class:`autoflow.tuner.tuner.Tuner` or None
        ``Tuner`` is the class that drives the abstract search process.
    hdl_constructor: :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or None
        ``HDL`` is an abbreviation of Hyper-parameter Descriptions Language.
        It describes an abstract hyperparameter space that is independent of any concrete implementation.

        ``HDL_Constructor`` is the class responsible for translating a dict-type ``DAG-workflow`` into ``HDL``.
    resource_manager: :class:`autoflow.manager.resource_manager.ResourceManager` or None
        ``ResourceManager`` is the class that manages computational resources such as the ``file_system`` and the ``data_base``.
    random_state: int
        random state
    log_file: path
        Which file to store the log in. If None, ``autoflow.log`` will be used.
    log_config: dict
        logging configuration
    highR_nan_threshold: float
        Threshold of high NaN ratio. You can find examples and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    highR_cat_threshold: float
        Threshold of high cardinality ratio for categorical features. You can find examples and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    kwargs: dict
        If parameters like ``tuner``, ``hdl_constructor`` or ``resource_manager`` are passed as None,
        you can pass kwargs to configure them implicitly. See the following example.

    Examples
    ---------
    In this example, you can see a trick to seed kwargs parameters without explicitly initializing
    :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or other classes.

    In the following example, the user passes ``DAG_workflow`` and ``hdl_bank`` as keyword arguments,
    and we can see that hdl_constructor is instantiated from kwargs implicitly.

    >>> from autoflow import AutoFlowClassifier
    >>> classifier = AutoFlowClassifier(DAG_workflow={"num->target":["lightgbm"]},
    ...     hdl_bank={"classification":{"lightgbm":{"boosting_type": {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
    AutoFlowClassifier(hdl_constructor=HDL_Constructor(
        DAG_workflow={'num->target': ['lightgbm']}
        hdl_bank_path=None
        hdl_bank={'classification': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
        included_classifiers=('adaboost', 'catboost', 'decision_tree', 'extra_trees', 'gaussian_nb', 'k_nearest_neighbors', 'liblinear_svc', 'lib...
    '''
    self.log_config = log_config
    self.highR_nan_threshold = highR_nan_threshold
    self.highR_cat_threshold = highR_cat_threshold
    # ---logger------------------------------------
    self.log_file = log_file
    setup_logger(self.log_file, self.log_config)
    self.logger = get_logger(self)
    # ---random_state-----------------------------------
    self.random_state = random_state
    # ---tuner-----------------------------------
    tuner = instancing(tuner, Tuner, kwargs)
    # ---tuners-----------------------------------
    self.tuners = sequencing(tuner, Tuner)
    self.tuner = self.tuners[0]
    # ---hdl_constructor--------------------------
    hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
    # ---hdl_constructors-------------------------
    self.hdl_constructors = sequencing(hdl_constructor, HDL_Constructor)
    self.hdl_constructor = self.hdl_constructors[0]
    # ---resource_manager-----------------------------------
    self.resource_manager = instancing(resource_manager, ResourceManager, kwargs)
    # ---member_variable------------------------------------
    self.estimator = None
    self.ensemble_estimator = None
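# --- usage sketch (illustrative, not part of the original source) ------------------------
# The same configuration as the docstring example, but with the components instantiated
# explicitly instead of relying on the kwargs trick. All constructor arguments and import
# paths follow the signatures and class references shown in this section.
from autoflow import AutoFlowClassifier
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.manager.resource_manager import ResourceManager
from autoflow.tuner.tuner import Tuner

tuner = Tuner(search_method="smac", run_limit=50, initial_runs=10)
hdl_constructor = HDL_Constructor(DAG_workflow={"num->target": ["lightgbm"]})
resource_manager = ResourceManager(store_path="~/autoflow")

classifier = AutoFlowClassifier(
    tuner=tuner,
    hdl_constructor=hdl_constructor,
    resource_manager=resource_manager,
    random_state=42,
)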
def __init__(
        self,
        evaluator: Union[Callable, str] = "TrainEvaluator",
        search_method: str = "smac",
        run_limit: int = 100,
        initial_runs: int = 20,
        search_method_params: dict = frozendict(),
        n_jobs: int = 1,
        exit_processes: Optional[int] = None,
        limit_resource: bool = True,
        per_run_time_limit: float = 60,
        per_run_memory_limit: float = 3072,
        time_left_for_this_task: float = None,
        debug=False
):
    '''
    Parameters
    ----------
    evaluator: callable, str
        ``evaluator`` is a function, a callable class (implementing the magic method ``__call__``), or a string indicator.

        ``evaluator`` receives a shp (SMAC Hyper Param, :class:`ConfigSpace.ConfigurationSpace`) and returns a dict
        which contains the following keys:

        * ``loss``, which you can think of as a negative reward.
        * ``status``, a string; ``SUCCESS`` means fine, ``FAILED`` means crashed.

        By default, "TrainEvaluator" is the string indicator of :class:`autoflow.evaluation.train_evaluator.TrainEvaluator`.
    search_method: str
        Specific searching method; ``random``, ``smac`` and ``grid`` are available.

        * ``random`` Random Search Algorithm,
        * ``grid``   Grid Search Algorithm,
        * ``smac``   Bayes Search by the SMAC Algorithm.
    run_limit: int
        Limit on the number of running steps.
    initial_runs: int
        If you choose the ``smac`` algorithm, be aware that SMAC has an initialization procedure:
        the algorithm needs enough initial runs to gather enough experience.

        This param is ignored if ``random`` or ``grid`` is selected.
    search_method_params: dict
        Configuration for the specific search method.
    n_jobs: int
        ``n_jobs`` searching processes will be started.
    exit_processes: int
    limit_resource: bool
        If ``limit_resource = True``, a searching trial will be killed if it uses too much CPU time or memory.
    per_run_time_limit: float
        Active only if ``limit_resource = True``.

        A searching trial will be killed if its CPU time exceeds ``per_run_time_limit``.
    per_run_memory_limit: float
        Active only if ``limit_resource = True``.

        A searching trial will be killed if its memory usage exceeds ``per_run_memory_limit``.
    time_left_for_this_task: float
        Active only if ``limit_resource = True``.

        The searching task will be killed if its total run time exceeds ``time_left_for_this_task``.
    debug: bool
        Debug mode. Exceptions will be re-raised if ``debug = True``.
    '''
    self.debug = debug
    self.per_run_memory_limit = per_run_memory_limit
    self.time_left_for_this_task = time_left_for_this_task
    self.per_run_time_limit = per_run_time_limit
    self.limit_resource = limit_resource
    self.logger = get_logger(self)
    if self.debug and self.limit_resource:
        self.logger.warning(
            "Tuner.debug and Tuner.limit_resource cannot both be True; setting Tuner.limit_resource to False.")
        self.limit_resource = False
    search_method_params = dict(search_method_params)
    if isinstance(evaluator, str):
        if evaluator == "TrainEvaluator":
            evaluator = TrainEvaluator
        elif evaluator == "EnsembleEvaluator":
            evaluator = EnsembleEvaluator
        else:
            raise NotImplementedError
    assert callable(evaluator)
    self.evaluator_prototype = evaluator
    if inspect.isfunction(evaluator):
        self.evaluator = evaluator
    else:
        self.evaluator = evaluator()
        self.evaluator.debug = self.debug
    self.search_method_params = search_method_params
    assert search_method in ("smac", "grid", "random")
    if search_method in ("grid", "random"):
        initial_runs = 0
    self.initial_runs = initial_runs
    self.run_limit = run_limit
    self.search_method = search_method
    self.random_state = 0
    self.addition_info = {}
    self.resource_manager = None
    self.ml_task = None
    self.data_manager = None
    self.n_jobs = parse_n_jobs(n_jobs)
    if exit_processes is None:
        exit_processes = max(self.n_jobs // 3, 1)
    self.exit_processes = exit_processes
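# --- evaluator sketch (illustrative, not part of the original source) --------------------
# A minimal callable evaluator of the shape described in the docstring above: it receives a
# configuration (shp) and returns a dict with "loss" and "status". The scikit-learn model and
# dataset used for scoring are purely illustrative, and converting the configuration via
# get_dictionary()/dict() is an assumption about the shp object.
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)

def my_evaluator(shp):
    config = shp.get_dictionary() if hasattr(shp, "get_dictionary") else dict(shp)
    try:
        model = SGDClassifier(alpha=config.get("alpha", 1e-4), random_state=0)
        score = cross_val_score(model, X, y, cv=3).mean()
        return {"loss": 1.0 - score, "status": "SUCCESS"}
    except Exception:
        return {"loss": 1.0, "status": "FAILED"}

tuner = Tuner(evaluator=my_evaluator, search_method="random", run_limit=30, n_jobs=1)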
def __init__(
        self,
        DAG_workflow: Union[str, Dict[str, Any]] = "generic_recommend",
        hdl_bank_path=None,
        hdl_bank=None,
        included_classifiers=(
                "adaboost", "catboost", "decision_tree", "extra_trees", "gaussian_nb", "k_nearest_neighbors",
                "liblinear_svc", "libsvm_svc", "lightgbm", "logistic_regression", "random_forest", "sgd"),
        included_regressors=(
                "adaboost", "bayesian_ridge", "catboost", "decision_tree", "elasticnet", "extra_trees",
                "gaussian_process", "k_nearest_neighbors", "kernel_ridge", "liblinear_svr", "lightgbm",
                "random_forest", "sgd"),
        included_highR_nan_imputers=("operate.drop", {"_name": "operate.merge", "__rely_model": "boost_model"}),
        included_cat_nan_imputers=("impute.fill_cat", {"_name": "impute.fill_abnormal", "__rely_model": "boost_model"}),
        included_num_nan_imputers=("impute.fill_num", {"_name": "impute.fill_abnormal", "__rely_model": "boost_model"}),
        included_highR_cat_encoders=("operate.drop", "encode.label", "encode.cat_boost"),
        included_lowR_cat_encoders=("encode.one_hot", "encode.label", "encode.cat_boost"),
):
    '''
    Parameters
    ----------
    DAG_workflow: str or dict, default="generic_recommend"
        Directed acyclic graph (DAG) workflow describing the machine-learning procedure.

        By default, this value is "generic_recommend", which means HDL_Constructor will analyze
        the training data to recommend a valid DAG workflow.

        If you want to design the DAG workflow yourself, you can pass a dict.
    hdl_bank_path: str, default=None
        ``hdl_bank`` is a json file which contains all the hyper-parameters of the algorithms.

        ``hdl_bank_path`` is this file's path. If it is None, ``autoflow/hdl/hdl_bank.json`` will be chosen.
    hdl_bank: dict, default=None
        If you pass ``hdl_bank_path=None`` and pass ``hdl_bank`` as a dict, the program will not load
        ``hdl_bank.json``; it uses the passed ``hdl_bank`` directly.
    included_classifiers: list or tuple
        Active if ``DAG_workflow="generic_recommend"``; all of the following params are also active in that situation.

        It decides which **classifiers** will be considered in algorithm selection.
    included_regressors: list or tuple
        It decides which **regressors** will be considered in algorithm selection.
    included_highR_nan_imputers: list or tuple
        ``highR_nan`` is a feature_group which means ``NaN`` has a high ratio in a column,

        for example:

        >>> from numpy import NaN
        >>> column = [1, 2, NaN, NaN, NaN]  # nan ratio is 60% , more than 50% (default highR_nan_threshold)

        ``highR_nan_imputers`` algorithms will handle such columns containing a high ratio of missing values.
    included_cat_nan_imputers: list or tuple
        ``cat_nan`` is a feature_group which means a categorical feature column contains ``NaN`` values,

        for example:

        >>> column = ["a", "b", "c", "d", NaN]

        ``cat_nan_imputers`` algorithms will handle such columns.
    included_num_nan_imputers: list or tuple
        ``num_nan`` is a feature_group which means a numerical feature column contains ``NaN`` values,

        for example:

        >>> column = [1, 2, 3, 4, NaN]

        ``num_nan_imputers`` algorithms will handle such columns.
    included_highR_cat_encoders: list or tuple
        ``highR_cat`` is a feature_group which means a categorical feature column has a high cardinality ratio,

        for example:

        >>> import numpy as np
        >>> column = ["a", "b", "c", "d", "a"]
        >>> rows = len(column)
        >>> np.unique(column).size / rows  # result is 0.8 , higher than 0.5 (default highR_cat_ratio)
        0.8

        ``highR_cat_encoders`` algorithms will handle such columns.
    included_lowR_cat_encoders: list or tuple
        ``lowR_cat`` is a feature_group which means a categorical feature column has a low cardinality ratio,

        for example:

        >>> import numpy as np
        >>> column = ["a", "a", "a", "d", "a"]
        >>> rows = len(column)
        >>> np.unique(column).size / rows  # result is 0.4 , lower than 0.5 (default lowR_cat_ratio)
        0.4

        ``lowR_cat_encoders`` algorithms will handle such columns.

    Attributes
    ----------
    random_state: int

    ml_task: :class:`autoflow.utils.ml_task.MLTask`

    data_manager: :class:`autoflow.manager.data_manager.DataManager`

    hdl: dict
        constructed by :meth:`run`

    Examples
    ----------
    >>> import numpy as np
    >>> from autoflow.manager.data_manager import DataManager
    >>> from autoflow.hdl.hdl_constructor import HDL_Constructor
    >>> hdl_constructor = HDL_Constructor(DAG_workflow={"num->target":["lightgbm"]},
    ...     hdl_bank={"classification":{"lightgbm":{"boosting_type": {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
    >>> data_manager = DataManager(X_train=np.random.rand(3,3), y_train=np.arange(3))
    >>> hdl_constructor.run(data_manager, 42, 0.5)
    >>> hdl_constructor.hdl
    {'preprocessing': {}, 'estimating(choice)': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
    '''
    self.included_lowR_cat_encoders = included_lowR_cat_encoders
    self.included_highR_cat_encoders = included_highR_cat_encoders
    self.included_num_nan_imputers = included_num_nan_imputers
    self.included_cat_nan_imputers = included_cat_nan_imputers
    self.included_highR_nan_imputers = included_highR_nan_imputers
    self.included_regressors = included_regressors
    self.included_classifiers = included_classifiers
    self.logger = get_logger(self)
    self.hdl_bank_path = hdl_bank_path
    self.DAG_workflow = DAG_workflow
    if hdl_bank is None:
        if hdl_bank_path:
            hdl_bank = get_hdl_bank(hdl_bank_path)
        else:
            hdl_bank = get_default_hdl_bank()
    if hdl_bank is None:
        hdl_bank = {}
        self.logger.warning("No hdl_bank, will use DAG_descriptions only.")
    self.hdl_bank = hdl_bank
    self.random_state = 42
    self.ml_task = None
    self.data_manager = None
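# --- DAG_workflow sketch (illustrative, not part of the original source) ------------------
# Designing the workflow manually instead of using "generic_recommend". The "A->B" edge
# syntax and the feature-group names follow the docstrings in this section; the specific
# combination of imputers, encoders and estimators chosen here is only an example.
hdl_constructor = HDL_Constructor(
    DAG_workflow={
        "num_nan->num": "impute.fill_num",
        "cat_nan->cat": "impute.fill_cat",
        "cat->num": "encode.one_hot",
        "num->target": ["lightgbm", "random_forest"],
    }
)
# hdl_constructor.run(data_manager, 42, 0.5) would then translate this workflow into HDL,
# as in the doctest above.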