Example #1
 def __init__(self,
              estimator=None,
              max_depth=7,
              n_estimators="auto",
              perc=100,
              alpha=0.05,
              two_step=True,
              max_iter=10,
              random_state=42,
              verbose=1,
              budget=10,
              weak=True,
              n_jobs=-1,
              imp_mask=None):
     self.imp_mask = imp_mask
     self.n_jobs = n_jobs
     self.max_depth = max_depth
     self.weak = weak
     self.budget = budget
     self.estimator = estimator
     self.n_estimators = n_estimators
     self.perc = perc
     self.alpha = alpha
     self.two_step = two_step
     self.max_iter = max_iter
     self.random_state = random_state
     self.verbose = verbose
     self.__version__ = '0.3'
     self._is_lightgbm = 'lightgbm' in str(type(self.estimator))
     self.logger = get_logger(self)
     self.logging_level = 20 if self.verbose > 0 else 10
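Every example on this page reduces to the same call, `self.logger = get_logger(self)`. The helper itself is not shown in any snippet; below is a minimal sketch, assuming it is a thin wrapper around `logging.getLogger` that names the logger after the owning class (the levels 20 and 10 used above correspond to `logging.INFO` and `logging.DEBUG`).

import logging


def get_logger(obj_or_name):
    # Sketch only: accept either a ready-made name (e.g. __name__ or a custom
    # string) or any object, in which case the logger is named after its class.
    if isinstance(obj_or_name, str):
        name = obj_or_name
    else:
        name = obj_or_name.__class__.__name__
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid attaching duplicate handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
    return logger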
Example #2
 def __init__(self):
     self.resource_manager = None
     self.estimator = None
     self.in_feature_groups = None
     self.out_feature_groups = None
     self.hyperparams = {}
     self.logger = get_logger(self)
Example #3
 def __init__(self, threshold, n_jobs=1, max_delete=1):
     self.max_delete = max_delete
     self.to_delete = []
     self.threshold = threshold
     self.n_jobs = n_jobs
     self._type = "DataFrame"
     self.logger = get_logger(self)
Example #4
    def init_data(
            self,
            random_state,
            data_manager: DataManager,
            metric: Scorer,
            should_calc_all_metric: bool,
            splitter,
            should_store_intermediate_result: bool,
            resource_manager: ResourceManager,
            should_finally_fit: bool
    ):
        self.random_state = random_state
        if hasattr(splitter, "random_state"):
            setattr(splitter, "random_state", self.random_state)
        self.splitter = splitter
        self.data_manager = data_manager
        self.X_train = self.data_manager.X_train
        self.y_train = self.data_manager.y_train
        self.X_test = self.data_manager.X_test
        self.y_test = self.data_manager.y_test
        self.should_store_intermediate_result = should_store_intermediate_result
        self.metric = metric
        self.ml_task: MLTask = self.data_manager.ml_task

        self.should_calc_all_metric = should_calc_all_metric

        if self.ml_task.mainTask == "regression":
            self.predict_function = self._predict_regression
        else:
            self.predict_function = self._predict_proba

        self.logger = get_logger(self)
        self.resource_manager = resource_manager
        self.should_finally_fit = should_finally_fit
Example #5
 def __init__(self, **kwargs):
     self.resource_manager = None
     self.component = None
     self.in_feature_groups = None
     self.out_feature_groups = None
     self.hyperparams = kwargs
     self.set_inside_dict(kwargs)
     self.logger = get_logger(self)
Example #6
 def __init__(self,
              categorical_feature=None,
              numerical_feature=None,
              copy=True,
              missing_rate=0.4):
     self.missing_rate = missing_rate
     self.numerical_feature = numerical_feature
     self.copy = copy
     self.categorical_feature = categorical_feature
     self.logger = get_logger(self)
Example #7
    def __init__(
            self,
            run_id,
            nameserver=None,
            nameserver_port=None,
            host=None,
            worker_id=None,
            timeout=None,
            debug=False
    ):
        """

        Parameters
        ----------
        run_id: anything with a __str__ method
            unique id to identify individual HpBandSter run
        nameserver: str
            hostname or IP of the nameserver
        nameserver_port: int
            port of the nameserver
        host: str
            hostname for this worker process
        worker_id: anything with a __str__ method
            if multiple workers are started in the same process, you MUST provide a unique id for each of them via the `worker_id` argument.
        timeout: int or float or None
            specifies the timeout a worker will wait for a new job after finishing a computation before shutting down.
            Towards the end of a long run with multiple workers, this helps to shut down idling workers. We recommend
            a timeout that is roughly half the time it would take for the second largest budget to finish.
            The default (None) means that the worker will wait indefinitely and never shut down on its own.
        """
        self.debug = debug
        self.run_id = run_id
        self.host = host
        self.nameserver = nameserver
        self.nameserver_port = nameserver_port
        self.worker_id = "opt.run_%s.worker.%s.%i" % (self.run_id, socket.gethostname(), os.getpid())
        self.manifest_id = uuid4().hex[-8:]
        self.timeout = timeout
        self.timer = None
        if worker_id is not None:
            worker_id = str(worker_id)
            self.worker_id += f".{worker_id}"
            self.manifest_id = worker_id

        self.thread = None

        self.logger = get_logger(f"Worker[{self.manifest_id}]")  # 分布式环境下的命名问题

        self.busy = False
        self.thread_cond = threading.Condition(threading.Lock())
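The docstring above only covers the constructor arguments. A hedged usage sketch follows; the class name `Worker` and all concrete values are assumptions, not shown in the snippet:

# hypothetical instantiation of the worker defined above
worker = Worker(
    run_id="run_2024",        # shared by every worker of one optimization run
    nameserver="127.0.0.1",   # where the Pyro4 nameserver is reachable
    nameserver_port=9090,
    worker_id=0,              # required when several workers share one process
    timeout=120,              # shut down after two idle minutes
)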
Example #8
    def __init__(self,
                 new_result_callback,
                 run_id='0',
                 ping_interval=10,
                 nameserver='localhost',
                 nameserver_port=None,
                 host=None,
                 queue_callback=None):
        """
        Parameters
        ----------
        new_result_callback: function
            function that will be called with a `Job instance <opt.core.dispatcher.Job>`_ as argument.
            From the `Job` the result can be read and e.g. logged.
        run_id: str
            unique run_id associated with the HPB run
        ping_interval: int
            how often to ping for workers (in seconds)
        nameserver: str
            address of the Pyro4 nameserver
        nameserver_port: int
            port of Pyro4 nameserver
        host: str
            ip (or name that resolves to that) of the network interface to use
        queue_callback: function
            gets called with the number of workers in the pool on every update-cycle
        """

        self.new_result_callback = new_result_callback
        self.queue_callback = queue_callback
        self.run_id = run_id
        self.nameserver = nameserver
        self.nameserver_port = nameserver_port
        self.host = host
        self.ping_interval = int(ping_interval)
        self.shutdown_all_threads = False

        self.logger = get_logger(self)

        self.worker_pool = {}

        self.waiting_jobs = queue.Queue()
        self.running_jobs = {}
        self.idle_workers = set()

        self.thread_lock = threading.Lock()
        self.runner_cond = threading.Condition(self.thread_lock)
        self.discover_cond = threading.Condition(self.thread_lock)

        self.pyro_id = "opt.run_%s.dispatcher" % self.run_id
Example #9
 def __init__(self,
              method="tsvd",
              n_components="auto",
              problem_type=None,
              random_state=0,
              budget=10,
              n_jobs=-1):
     self.budget = budget
     self.n_components = n_components
     self.n_jobs = n_jobs
     self.random_state = random_state
     self.problem_type = problem_type
     self.method = method
     self.logger = get_logger(self)
Example #10
 def __init__(
     self,
     top_n_percent=15,
     bandwidth_factor=3,
     min_bandwidth=1e-3,
     bw_estimation="normal_reference",
     min_points_in_kde=2,
 ):
     self.min_points_in_kde = min_points_in_kde
     self.bw_estimation = bw_estimation
     self.min_bandwidth = min_bandwidth
     self.bandwidth_factor = bandwidth_factor
     self.top_n_percent = top_n_percent
     self.config_transformer: Optional[ConfigurationTransformer] = None
     self.logger = get_logger(self)
Example #11
 def __init__(
         self,
         n_estimators=2048,
         objective=None,
         boosting_type="gbdt",
         # objective="binary",
         learning_rate=0.01,
         max_depth=31,
         num_leaves=31,
         feature_fraction=0.8,
         bagging_fraction=0.8,
         bagging_freq=1,
         random_state=0,
         # cat_smooth=35,
         lambda_l1=0.1,
         lambda_l2=0.2,
         subsample_for_bin=40000,
         # min_data_in_leaf=4,
         min_child_weight=0.01,
         early_stopping_rounds=256,
         verbose=-1,
         n_jobs=1,
         warm_start=True):
     self.warm_start = warm_start
     assert self.is_classification is not None, NotImplementedError
     self.n_jobs = n_jobs
     self.objective = objective
     self.verbose = verbose
     self.early_stopping_rounds = early_stopping_rounds
     self.min_child_weight = min_child_weight
     self.subsample_for_bin = subsample_for_bin
     self.lambda_l2 = lambda_l2
     self.lambda_l1 = lambda_l1
     self.random_state = random_state
     self.bagging_freq = bagging_freq
     self.feature_fraction = feature_fraction
     self.bagging_fraction = bagging_fraction
     self.num_leaves = num_leaves
     self.max_depth = max_depth
     self.learning_rate = learning_rate
     self.boosting_type = boosting_type
     self.n_estimators = n_estimators
     self.model = None
     self.current_iterations = 0
     self.early_stopped = False
     self.logger = get_logger(self)
Example #12
    def __init__(self):
        """
        Parameters
        ----------

        directory: string
            where the results are logged
        logger: opt.utils.result_logger_v??
            the logger to store the data, defaults to v1
        overwrite: bool
            whether or not existing data will be overwritten
        logger: logging.logger
            for some debug output

        """

        self.logger = get_logger(self)
Example #13
 def __init__(self,
              budget2epm,
              budget,
              acq_func="EI",
              acq_func_params=frozendict()):
     self.acq_func_params = dict(acq_func_params)
     # todo: support loading acquisition functions by importing a package
     if acq_func == "EI":
         acq_func_cls = EI
     elif acq_func == "LogEI":
         acq_func_cls = LogEI
     else:
         raise NotImplementedError
     self.acq_func = acq_func_cls(**self.acq_func_params)
     self.budget2weight = None
     self.budget = budget
     self.budget2epm = budget2epm
     self.logger = get_logger(self)
Example #14
 def __init__(self, steps, should_store_intermediate_result=False, resource_manager=None):
     self.config_id = None
     self.config = None
     self.logger = get_logger(self)
     if resource_manager is None:
         from autoflow import ResourceManager
         self.logger.warning(
             "In ML_Workflow __init__, resource_manager is None, create a default local resource_manager.")
         resource_manager = ResourceManager()
     self.resource_manager = resource_manager
     self.should_store_intermediate_result = should_store_intermediate_result
     self.steps = steps
     self.memory = None
     self.verbose = False
     self._validate_steps()
     self.intermediate_result = {}
     self.fitted = False
     self.budget = 0
Example #15
 def __init__(
     self,
     meta_learner=None,
     use_features_in_secondary=False,
 ):
     self.use_features_in_secondary = use_features_in_secondary
     assert self.mainTask in ("classification", "regression")
     if not meta_learner:
         if self.mainTask == "classification":
             meta_learner = LogisticRegression(penalty='elasticnet',
                                               solver="saga",
                                               l1_ratio=0.5,
                                               C=1.0,
                                               fit_intercept=False)
         elif self.mainTask == "regression":
             meta_learner = ElasticNet(fit_intercept=False, random_state=10)
     self.meta_learner = meta_learner
     self.logger = get_logger(self)
Example #16
 def __init__(self,
              n_uniques: np.ndarray,
              A=10,
              B=5,
              dropout1=0.1,
              dropout2=0.1,
              dropout3=0.1,
              n_class=2):
     super(EntityEmbeddingNN, self).__init__()
     self.dropout3 = dropout3
     self.logger = get_logger(self)
     self.epoch = 0
     self.n_class = n_class
     self.dropout2 = dropout2
     self.dropout1 = dropout1
     self.n_uniques = n_uniques
     self.A = A
     self.B = B
     self.embed_dims = self.get_embed_dims(n_uniques)
     sum_ = np.log(self.embed_dims).sum()
     self.n_layer1 = min(1000, int(A * (n_uniques.size**0.5) * sum_ + 1))
     self.n_layer2 = int(self.n_layer1 / B) + 2
     self.embedding_blocks = nn.ModuleList([
         nn.Embedding(int(n_unique), int(embed_dim))
         for n_unique, embed_dim in zip(self.n_uniques, self.embed_dims)
     ])
     embed_dims_size = self.embed_dims.sum()
     layer1 = self.get_block(embed_dims_size, self.n_layer1, False,
                             dropout1, "leaky_relu")
     layer2 = self.get_block(self.n_layer1, self.n_layer2, False, dropout2,
                             "leaky_relu")
     layer3 = self.get_block(self.n_layer2, self.n_class, False, dropout3,
                             "leaky_relu")
     self.deep_net = nn.Sequential(layer1, layer2, layer3)
     self.wide_net = self.get_block(embed_dims_size, self.n_class, False,
                                    dropout3, "leaky_relu")
     output_modules = []
     if self.n_class > 1:
         output_modules.append(nn.Softmax(dim=1))
     self.output_layer = nn.Sequential(*output_modules)
     self.initializing_modules(
         chain(self.deep_net.modules(), self.wide_net.modules(),
               self.output_layer.modules(),
               self.embedding_blocks.modules()))
Example #17
 def init_variables(self):
     self.scaler = StandardScaler(copy=True)
     self.rng = np.random.RandomState(self.random_state)
     self.logger = get_logger(self)
     self.model = None
     self.learning_curve = [
         [],  # train_sizes_abs [0]
         [],  # train_scores    [1]
         [],  # test_scores     [2]
     ]
     self.performance_history = np.full(self.early_stopping_rounds, -np.inf)
     self.iteration_history = np.full(self.early_stopping_rounds, 0, dtype="int32")
     N = len(self.performance_history)
     self.best_estimators = np.zeros([N], dtype="object")
     if self.is_classification:
         self.score_func = accuracy_score
     else:
         self.score_func = r2_score
     self.early_stopped = False
     self.best_iteration = 0
Example #18
 def __init__(self,
              lr=1e-2,
              max_epoch=25,
              n_class=None,
              nn_params=frozendict(),
              random_state=1000,
              batch_size=1024,
              optimizer="adam",
              n_jobs=-1,
              class_weight=None):
     self.class_weight = class_weight
     self.n_jobs = check_n_jobs(n_jobs)
     self.optimizer = optimizer
     self.batch_size = batch_size
     self.random_state = random_state
     self.nn_params = nn_params
     self.n_class = n_class
     self.max_epoch = max_epoch
     self.lr = lr
     self.rng = check_random_state(random_state)
     self.logger = get_logger(self)
Example #19
 def __init__(self,
              percentage=20,
              feats_must_less_than_rows=True,
              lgbm_w=0.5,
              et_iters=100,
              lgbm_iters=100,
              et_budget=1.5,
              lgbm_budget=1.5,
              step=10,
              n_jobs=-1,
              random_state=42):
     self.feats_must_less_than_rows = feats_must_less_than_rows
     self.random_state = random_state
     self.lgbm_w = lgbm_w
     self.lgbm_budget = lgbm_budget
     self.et_budget = et_budget
     self.n_jobs = n_jobs
     self.step = step
     self.lgbm_iters = lgbm_iters
     self.et_iters = et_iters
     self.percentage = float(np.clip(percentage, 0, 100))
     self.logger = get_logger(self)
Example #20
 def __init__(self,
              dataset_source="",
              dataset_path=None,
              dataset_instance=None,
              dataset_id=None,
              resource_manager=None,
              dataset_metadata=frozendict(),
              upload_type="fs"):
     self.upload_type = upload_type
     self.dataset_id = None
     self.dataset_source = dataset_source
     self.dataset_metadata = dict(dataset_metadata)
     self.dataset_metadata.update(dataset_source=dataset_source)
     self.uploaded_hash = None
     from autoflow.resource_manager.base import ResourceManager
     self.logger = get_logger(self)
     if resource_manager is None:
         self.logger.warning(
             "In DataContainer __init__, resource_manager is None, create a default local resource_manager."
         )
         resource_manager = ResourceManager()
     self.resource_manager: ResourceManager = resource_manager
     data_indicators = [dataset_path, dataset_instance, dataset_id]
     data_indicators = np.array(list(
         map(lambda x: x is not None, data_indicators)),
                                dtype='int32')
     assert data_indicators.sum() == 1
     if dataset_path is not None:
         data = self.read_local(dataset_path)
         self.data = self.process_dataset_instance(data)
     elif dataset_instance is not None:
         assert isinstance(dataset_instance, self.VALID_INSTANCE)
         self.data = self.process_dataset_instance(dataset_instance)
     elif dataset_id is not None:
         self.download(dataset_id)
     else:
         raise NotImplementedError
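The `data_indicators` check above enforces that exactly one of `dataset_path`, `dataset_instance`, or `dataset_id` is supplied. A hedged sketch of the three mutually exclusive construction paths, assuming a concrete subclass named `DataFrameContainer` whose `VALID_INSTANCE` is `pd.DataFrame` (assumptions, not shown in this snippet):

import pandas as pd

df = pd.DataFrame({"age": [23, 41], "label": [0, 1]})

# 1) from an in-memory instance (must match cls.VALID_INSTANCE)
c1 = DataFrameContainer("TrainSet", dataset_instance=df)

# 2) from a local file; read_local() loads it before processing
c2 = DataFrameContainer("TrainSet", dataset_path="/path/to/train.csv")

# 3) from a previously uploaded dataset, fetched through the resource manager
c3 = DataFrameContainer("TrainSet", dataset_id="a1b2c3d4")  # illustrative id

# passing two of the indicators at once violates `data_indicators.sum() == 1`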
Example #21
 def __init__(
         self,
         budget_per_trial=1,
         budget=10,
         # n_jobs=-1,
         verbose=0,
         random_state=42,
         cv=3,
         lr_iter_step=10,
         lr_max_iter=100,
         lr_es_round=4,
         problem_type=None):
     self.lr_es_round = lr_es_round
     self.lr_max_iter = lr_max_iter
     self.lr_iter_step = lr_iter_step
     self.problem_type = problem_type
     self.cv = cv
     self.random_state = random_state
     self.verbose = verbose
     # self.n_jobs = check_n_jobs(n_jobs)
     self.budget = budget
     self.budget_per_trial = budget_per_trial
     self.logging_level = 20 if verbose > 0 else 10
     self.logger = get_logger(self)
Example #22
 def __init__(
     self,
     base_model="lgbm",  # ["lgbm", "et", "ridge"] are recommended
     n_jobs=-1,
     random_state=42,
     max_dichotomy=None,
     cv=3,
     cv_budget=2,
     test_size=0.33,
     model_params=frozendict()):
     if max_dichotomy is None:
         if base_model in ("et", "lgbm"):
             max_dichotomy = 10
         else:
             max_dichotomy = 5
     self.model_params = model_params
     self.cv_budget = cv_budget
     self.test_size = test_size
     self.cv = cv
     self.max_dichotomy = max_dichotomy
     self.random_state = random_state
     self.n_jobs = n_jobs
     self.base_model = base_model
     self.logger = get_logger(self)
Example #23
    def __init__(
        self,
        X_train: Union[pd.DataFrame, GenericDataFrame, np.ndarray,
                       None] = None,
        y_train: Union[pd.Series, np.ndarray, str, None] = None,
        X_test: Union[pd.DataFrame, GenericDataFrame, np.ndarray, None] = None,
        y_test: Union[pd.Series, np.ndarray, str, None] = None,
        dataset_metadata: Dict[str, Any] = frozendict(),
        column_descriptions: Dict[str, Union[List[str], str]] = None,
        highR_nan_threshold: float = 0.5,
    ):
        '''

        Parameters
        ----------
        X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_train: :class:`numpy.ndarray`
        X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_test: :class:`numpy.ndarray`
        dataset_metadata: dict
        column_descriptions: dict
            ``column_descriptions`` is a dict whose keys are ``feature_group`` names,

            and whose values are a column (column name) or columns (list of column names).

            This is a list of some frequently-used built-in ``feature_group`` names:
                * ``id``       - id of this table.
                * ``ignore``   - columns which contain irrelevant information.
                * ``target``   - the column in the dataset that your model will learn to predict.
                * ``nan``      - Not a Number, a column containing missing values.
                * ``num``      - numerical features, such as [1, 2, 3].
                * ``cat``      - categorical features, such as ["a", "b", "c"].
                * ``num_nan``  - numerical features containing missing values, such as [1, 2, NaN].
                * ``cat_nan``  - categorical features containing missing values, such as ["a", "b", NaN].
                * ``highR_nan``  - high NaN ratio. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``lowR_nan``   - low NaN ratio. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``highR_cat``  - high cardinality-ratio categorical. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``lowR_cat``   - low cardinality-ratio categorical. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        highR_nan_threshold: float
            high NaN ratio threshold; examples and usage can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        '''
        self.logger = get_logger(self)
        dataset_metadata = dict(dataset_metadata)
        self.highR_nan_threshold = highR_nan_threshold
        self.dataset_metadata = dataset_metadata
        X_train = deepcopy(X_train)
        y_train = deepcopy(y_train)
        X_test = deepcopy(X_test)
        y_test = deepcopy(y_test)
        X_train, y_train, X_test, y_test, feature_groups, column2feature_groups = self.parse_column_descriptions(
            column_descriptions, X_train, y_train, X_test, y_test)
        self.feature_groups = feature_groups
        self.column2feature_groups = column2feature_groups
        self.ml_task: MLTask = get_ml_task_from_y(y_train)
        self.X_train = GenericDataFrame(X_train, feature_groups=feature_groups)
        self.y_train = y_train
        self.X_test = GenericDataFrame(
            X_test,
            feature_groups=feature_groups) if X_test is not None else None
        self.y_test = y_test if y_test is not None else None

        # todo: a user-defined validation set could be specified via RandomShuffle or mlxtend
        # fixme: multilabel is not supported
        if len(y_train.shape) > 2:
            raise ValueError('y must not have more than two dimensions, '
                             'but has %d.' % len(y_train.shape))

        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError('X and y must have the same number of '
                             'datapoints, but have %d and %d.' %
                             (X_train.shape[0], y_train.shape[0]))
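To make the ``column_descriptions`` convention above concrete, here is a hedged sketch; the table, the column names, and the assumption that the surrounding class is the ``DataManager`` shown above are all illustrative:

import pandas as pd

train_df = pd.DataFrame({
    "PassengerId": [1, 2, 3],
    "Sex": ["male", "female", "female"],
    "Age": [22.0, 38.0, None],
    "Survived": [0, 1, 1],
})

column_descriptions = {
    "id": "PassengerId",    # a single column name
    "cat": ["Sex"],         # a list of column names
    "num_nan": ["Age"],     # numerical column containing missing values
    "target": "Survived",   # what the model will learn to predict
}

dm = DataManager(X_train=train_df,
                 column_descriptions=column_descriptions,
                 highR_nan_threshold=0.5)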
Example #24
    def __init__(self,
                 tuner: Union[Tuner, List[Tuner], None, dict] = None,
                 hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor],
                                        None, dict] = None,
                 resource_manager: Union[ResourceManager, str] = None,
                 random_state=42,
                 log_file: str = None,
                 log_config: Optional[dict] = None,
                 highR_nan_threshold=0.5,
                 highR_cat_threshold=0.5,
                 should_store_intermediate_result=False,
                 should_finally_fit=False,
                 should_calc_all_metrics=True,
                 **kwargs):
        '''
        Parameters
        ----------
        tuner: :class:`autoflow.tuner.tuner.Tuner` or None
            ``Tuner`` is a class that drives an abstract search process.

        hdl_constructor: :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or None
            ``HDL`` is an abbreviation of Hyper-parameter Descriptions Language.

            It describes an abstract hyper-parameter space that is independent of any concrete implementation.

            ``HDL_Constructor`` is a class responsible for translating a dict-type ``DAG-workflow`` into ``H.D.L``.

        resource_manager: :class:`autoflow.manager.resource_manager.ResourceManager` or None
            ``ResourceManager`` is a class that manages computer resources such as ``file_system`` and ``data_base``.

        random_state: int
            random state

        log_file: path
            which file to store the log in; if None, ``autoflow.log`` will be used.

        log_config: dict
            logging configuration

        highR_nan_threshold: float
            high NaN ratio threshold; examples and usage can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        highR_cat_threshold: float
            threshold on a categorical feature's cardinality ratio; examples and usage can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        kwargs
            if parameters like ``tuner``, ``hdl_constructor``, or ``resource_manager`` are passed as None,

            you can pass keyword arguments to configure them implicitly. See the following example.

        Examples
        ---------
        In this example, you can see a trick to seed parameters through kwargs without initializing
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or other classes directly.

        In the following example, the user passes ``DAG_workflow`` and ``hdl_bank`` as keyword arguments,
        and ``hdl_constructor`` is instantiated from those kwargs implicitly.

        >>> from autoflow import AutoFlowClassifier
        >>> classifier = AutoFlowClassifier(DAG_workflow={"num->target":["lightgbm"]},
        ...   hdl_bank={"classification":{"lightgbm":{"boosting_type":  {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
        AutoFlowClassifier(hdl_constructor=HDL_Constructor(
            DAG_workflow={'num->target': ['lightgbm']}
            hdl_bank_path=None
            hdl_bank={'classification': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
            included_classifiers=('adaboost', 'catboost', 'decision_tree', 'extra_trees', 'gaussian_nb', 'k_nearest_neighbors', 'liblinear_svc', 'lib...
        '''
        self.should_finally_fit = should_finally_fit
        self.should_store_intermediate_result = should_store_intermediate_result
        self.should_calc_all_metrics = should_calc_all_metrics
        self.log_config = log_config
        self.highR_nan_threshold = highR_nan_threshold
        self.highR_cat_threshold = highR_cat_threshold

        # ---logger------------------------------------
        self.log_file = log_file
        setup_logger(self.log_file, self.log_config)
        self.logger = get_logger(self)
        # ---random_state-----------------------------------
        self.random_state = random_state
        # ---tuner-----------------------------------
        tuner = instancing(tuner, Tuner, kwargs)
        # ---tuners-----------------------------------
        self.tuners = sequencing(tuner, Tuner)
        self.tuner = self.tuners[0]
        # ---hdl_constructor--------------------------
        hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
        # ---hdl_constructors-------------------------
        self.hdl_constructors = sequencing(hdl_constructor, HDL_Constructor)
        self.hdl_constructor = self.hdl_constructors[0]
        # ---resource_manager-----------------------------------
        self.resource_manager = instancing(resource_manager, ResourceManager,
                                           kwargs)
        # ---member_variable------------------------------------
        self.estimator = None
        self.ensemble_estimator = None
Example #25
from copy import deepcopy
from fractions import Fraction
from typing import Dict, Optional, Union, List

import numpy as np
from ConfigSpace import ConfigurationSpace, Constant, CategoricalHyperparameter, Configuration
from ConfigSpace.util import deactivate_inactive_hyperparameters
from scipy.spatial.distance import euclidean
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder, StandardScaler

from autoflow.constants import ERR_LOSS
from autoflow.utils.logging_ import get_logger

inc_logger = get_logger("incumbent trajectory")
logger = get_logger(__name__)


def is_top_level_activated(config_space, config, hp_name, hp_value=None):
    parent_conditions = config_space.get_parent_conditions_of(hp_name)
    if len(parent_conditions):
        parent_condition = parent_conditions[0]
        parent_value = parent_condition.value
        parent_name = parent_condition.parent.name
        return is_top_level_activated(config_space, config, parent_name,
                                      parent_value)
    # no parent conditions: this hyperparameter is itself a top-level (parent) hyperparameter
    if hp_value is None:
        return True
    return config[hp_name] == hp_value
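A hedged sketch of how `is_top_level_activated` behaves on a conditional search space; the space below is made up for illustration only:

from ConfigSpace import ConfigurationSpace, CategoricalHyperparameter
from ConfigSpace.conditions import EqualsCondition

cs = ConfigurationSpace(seed=42)
model = CategoricalHyperparameter("model", ["lightgbm", "liblinear_svc"])
penalty = CategoricalHyperparameter("penalty", ["l1", "l2"])
cs.add_hyperparameters([model, penalty])
# "penalty" only exists when its parent "model" equals "liblinear_svc"
cs.add_condition(EqualsCondition(penalty, model, "liblinear_svc"))

config = cs.sample_configuration()
# walks up the condition chain: True only if config["model"] == "liblinear_svc"
print(is_top_level_activated(cs, config, "penalty"))
# a hyperparameter without parent conditions is always considered activated
print(is_top_level_activated(cs, config, "model"))  # True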
Example #26
    def __init__(self,
                 store_path="~/autoflow",
                 file_system="local",
                 file_system_params=frozendict(),
                 db_type="sqlite",
                 db_params=frozendict(),
                 redis_params=frozendict(),
                 max_persistent_estimators=50,
                 compress_suffix="bz2"):
        '''

        Parameters
        ----------
        store_path: str
            A path to store files that belong to AutoFlow, such as metadata, model files, and database files.
        file_system: str
            Indicator-string about which file system or storage system will be used.

            Available options list below:
                * ``local``
                * ``hdfs``
                * ``s3``

            ``local`` is the default value.
        file_system_params: dict
            Specific file_system configuration.
        db_type: str
            Indicator-string about which database will be used.

            Available options list below:
                * ``sqlite``
                * ``postgresql``
                * ``mysql``

            ``sqlite`` is the default value.
        db_params: dict
            Specific database configuration.
        redis_params: dict
            Redis configuration.
        max_persistent_estimators: int
            Maximum number of models that can be persisted for a single task.

            If this number is exceeded, the worst performing model file will be deleted,

            and the corresponding database record will also be deleted.
        compress_suffix: str
            compressed-file suffix; the default is ``bz2``.
        '''
        # --logger-------------------
        self.logger = get_logger(self)
        # --preprocessing------------
        file_system_params = dict(file_system_params)
        db_params = dict(db_params)
        redis_params = dict(redis_params)
        # ---file_system------------
        self.file_system_type = file_system
        self.file_system: FileSystem = get_file_system(file_system)(
            **file_system_params)
        if self.file_system_type == "local":
            store_path = os.path.expandvars(os.path.expanduser(store_path))
        self.store_path = store_path
        # ---data_base------------
        assert db_type in ("sqlite", "postgresql", "mysql")
        self.db_type = db_type
        self.db_params = dict(db_params)
        if db_type == "sqlite":
            assert self.file_system_type == "local"
        # ---redis----------------
        self.redis_params = dict(redis_params)
        # ---max_persistent_model---
        self.max_persistent_estimators = max_persistent_estimators
        # ---compress_suffix------------
        self.compress_suffix = compress_suffix
        # ---post_process------------
        self.store_path = store_path
        self.file_system.mkdir(self.store_path)
        self.is_init_experiments_db = False
        self.is_init_tasks_db = False
        self.is_init_hdls_db = False
        self.is_init_trials_db = False
        self.is_init_redis = False
        self.is_master = False
        # --some specific path based on file_system---
        self.datasets_dir = self.file_system.join(self.store_path, "datasets")
        self.databases_dir = self.file_system.join(self.store_path,
                                                   "databases")
        self.parent_trials_dir = self.file_system.join(self.store_path,
                                                       "trials")
        self.parent_experiments_dir = self.file_system.join(
            self.store_path, "experiments")
        for dir_path in [
                self.datasets_dir, self.databases_dir,
                self.parent_experiments_dir, self.parent_trials_dir
        ]:
            self.file_system.mkdir(dir_path)
        # --db-----------------------------------------
        self.Datebase = get_db_class_by_db_type(self.db_type)
        # --JSONField-----------------------------------------
        self.JSONField = get_JSONField(self.db_type)
        # --database_name---------------------------------
        # None means the database has not been created yet
        self._meta_records_db_name = None  # meta records database
        self._tasks_db_name = None
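A hedged usage sketch for the constructor above: the first call relies on the local/SQLite defaults, the second shows a hypothetical PostgreSQL setup (host, port, credentials, and the exact `db_params` keys are placeholders and depend on the database driver in use):

# default: everything lives under ~/autoflow on the local file system, with SQLite
rm_local = ResourceManager()

# hypothetical shared setup: files stay local, but metadata goes to PostgreSQL
rm_pg = ResourceManager(
    store_path="~/autoflow",
    file_system="local",
    db_type="postgresql",
    db_params={
        "host": "127.0.0.1",
        "port": 5432,
        "user": "autoflow",
        "password": "secret",
    },
    max_persistent_estimators=100,
)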
Example #27
 def __init__(
     self,
     DAG_workflow: Union[str, Dict[str, Any]] = "generic_recommend",
     hdl_bank_path=None,
     hdl_bank=None,
     hdl_metadata=frozendict(),
     balance_strategies=("weight", "None"),
     included_classifiers=("extra_trees", "lightgbm", "logistic_regression",
                           "random_forest", "gbt_lr", "tabular_nn"),
     included_regressors=("extra_trees", "lightgbm", "elasticnet",
                          "random_forest", "gbt_lr", "tabular_nn"),
     included_imputers=("impute.simple", "impute.gbt"),
     included_highC_cat_encoders=("encode.entity", "encode.ordinal",
                                  "encode.cat_boost"),
     combine_rare=True,
     included_cat_encoders=("encode.one_hot", "encode.ordinal"),
     num2normed_workflow=frozendict({
         "num->normed": ["scale.standard",
                         "operate.keep_going"],  # "scale.adaptive",
     }),
     text2normed_workflow=frozendict({
         "text->tokenized":
         "text.tokenize.simple",
         "tokenized->normed": [
             "text.topic.tsvd",
             "text.topic.lsi",
             "text.topic.nmf",
         ]
     }),
     date2normed_workflow=frozendict({}),
     normed2final_workflow=frozendict({
         "normed->final":
         ["operate.keep_going", "select.boruta", "generate.autofeat"]
     })):
     self.combine_rare = combine_rare
     self.balance_strategies = balance_strategies
     self.date2normed_workflow = date2normed_workflow
     self.text2normed_workflow = text2normed_workflow
     self.normed2final_workflow = normed2final_workflow
     self.num2normed_workflow = num2normed_workflow
     self.hdl_metadata = dict(hdl_metadata)
     self.included_cat_encoders = included_cat_encoders
     self.included_highC_cat_encoders = included_highC_cat_encoders
     self.included_imputers = included_imputers
     self.included_regressors = included_regressors
     self.included_classifiers = included_classifiers
     self.logger = get_logger(self)
     self.hdl_bank_path = hdl_bank_path
     self.DAG_workflow = DAG_workflow
     if hdl_bank is None:
         if hdl_bank_path:
             hdl_bank = get_hdl_bank(hdl_bank_path)
         else:
             hdl_bank = get_default_hdl_bank()
     if hdl_bank is None:
         hdl_bank = {}
         self.logger.warning("No hdl_bank, will use DAG_descriptions only.")
     self.hdl_bank = hdl_bank
     self.random_state = 42
     self.ml_task = None
     self.data_manager = None
Example #28
 def __init__(self):
     self.ml_task = None
     self.logger = get_logger(__name__)
Example #29
    def __init__(self,
                 resource_manager=None,
                 X_train: Union[pd.DataFrame, DataFrameContainer, np.ndarray,
                                None, str] = None,
                 y_train: Union[pd.Series, np.ndarray, None] = None,
                 X_test: Union[pd.DataFrame, DataFrameContainer, np.ndarray,
                               None, str] = None,
                 y_test: Union[pd.Series, np.ndarray, None] = None,
                 dataset_metadata: Dict[str, Any] = frozendict(),
                 column_descriptions: Dict[str, Union[List[str],
                                                      str]] = frozendict(),
                 highR_nan_threshold: float = 0.5,
                 highC_cat_threshold: int = 4,
                 consider_ordinal_as_cat=False,
                 upload_type="fs"):
        '''

        Parameters
        ----------
        X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_train: :class:`numpy.ndarray`
        X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_test: :class:`numpy.ndarray`
        dataset_metadata: dict
        column_descriptions: dict
            ``column_descriptions`` is a dict whose keys are ``feature_group`` names,

            and whose values are a column (column name) or columns (list of column names).

            This is a list of some frequently-used built-in ``feature_group`` names:
                * ``id``       - id of this table.
                * ``ignore``   - columns which contain irrelevant information.
                * ``target``   - the column in the dataset that your model will learn to predict.
                * ``nan``      - Not a Number, a column containing missing values.
                * ``num``      - numerical features, such as [1, 2, 3].
                * ``cat``      - categorical features, such as ["a", "b", "c"].
                * ``num_nan``  - numerical features containing missing values, such as [1, 2, NaN].
                * ``cat_nan``  - categorical features containing missing values, such as ["a", "b", NaN].
                * ``highR_nan``  - high NaN ratio. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``lowR_nan``   - low NaN ratio. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``highC_cat``  - high cardinality-ratio categorical. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``lowR_cat``   - low cardinality-ratio categorical. An explanation can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        highR_nan_threshold: float
            high NaN ratio threshold; examples and usage can be found in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        '''
        self.upload_type = upload_type
        from autoflow.resource_manager.base import ResourceManager
        self.logger = get_logger(self)
        if resource_manager is None:
            self.logger.warning(
                "In DataManager __init__, resource_manager is None, create a default local resource_manager."
            )
            resource_manager = ResourceManager()
        self.resource_manager: ResourceManager = resource_manager
        self.highC_cat_threshold = highC_cat_threshold
        self.consider_ordinal_as_cat = consider_ordinal_as_cat
        dataset_metadata = dict(dataset_metadata)
        self.highR_nan_threshold = highR_nan_threshold
        self.dataset_metadata = dataset_metadata
        self.column_descriptions = dict(column_descriptions)
        # --load data to container---------------------------------
        self.X_test, self.input_test_hash = self.parse_data_container(
            "TestSet", X_test, y_test)
        # the train set is parsed last, so its column_descriptions take precedence
        self.X_train, self.input_train_hash = self.parse_data_container(
            "TrainSet", X_train, y_train)
        # --migrate column descriptions------------------------------
        # if X is a dataset_id, the remote data_container's column_descriptions will be assigned to final_column_descriptions
        if self.final_column_descriptions is not None:
            self.column_descriptions = deepcopy(self.final_column_descriptions)
        # --column descriptions------------------------------
        self.parse_column_descriptions()
        # note: at this point feature_groups and columns no longer correspond one-to-one; auxiliary feature groups have been dropped
        # ---check target-----------------------------------------------------
        assert "target" in self.column_descriptions
        self.target_col_name = self.column_descriptions["target"]
        # todo: handle the case of predicting on the test set
        # --final column descriptions------------------------------
        # neither user-defined column descriptions nor remotely downloaded ones should contain nan-related entries
        # update `column2essential_feature_groups` to `final_column_descriptions`
        if self.final_column_descriptions is None:
            final_column_descriptions = defaultdict(list)
            final_column_descriptions.update(self.column_descriptions)
            # first, normalize non-unique feature groups to lists
            for feat_grp, cols in final_column_descriptions.items():
                if feat_grp not in UNIQUE_FEATURE_GROUPS:
                    if isinstance(cols, str):
                        final_column_descriptions[feat_grp] = [cols]
            # then start updating
            for column, essential_feature_group in self.column2feature_groups.items(
            ):
                if column not in final_column_descriptions[
                        essential_feature_group]:
                    final_column_descriptions[essential_feature_group].append(
                        column)
            self.final_column_descriptions = final_column_descriptions
        self.final_column_descriptions = dict(self.final_column_descriptions)
        # ---set column descriptions, upload to dataset-----------------------------------------------------
        if self.X_train is not None:
            self.X_train.set_column_descriptions(
                self.final_column_descriptions)
            self.X_train.upload(self.upload_type)
            self.logger.info(
                f"TrainSet's DataSet ID = {self.X_train.dataset_id}")
        if self.X_test is not None:
            self.X_test.set_column_descriptions(self.final_column_descriptions)
            self.X_test.upload(self.upload_type)
            self.logger.info(
                f"TestSet's DataSet ID = {self.X_test.dataset_id}")
        # ---origin hash-----------------------------------------------------
        self.train_set_id = self.X_train.get_hash(
        ) if self.X_train is not None else ""
        self.test_set_id = self.X_test.get_hash(
        ) if self.X_test is not None else ""
        if self.input_train_hash:
            assert self.input_train_hash == self.train_set_id
        if self.input_test_hash:
            assert self.input_test_hash == self.test_set_id
        # ---pop auxiliary columns-----------------------------------------------------
        y_train, y_test = self.pop_auxiliary_feature_groups()
        # --verify that X_train and X_test have the same columns
        if self.X_test is not None and self.X_train is not None:
            assert self.X_train.shape[1] == self.X_test.shape[1]
            assert np.all(self.X_train.columns == self.X_test.columns)
        # --set feature_groups--
        if self.X_train is not None:
            self.X_train.set_feature_groups(self.feature_groups)
        if self.X_test is not None:
            self.X_test.set_feature_groups(self.feature_groups)
        # --set parameters--
        y_train = to_array(y_train)
        y_test = to_array(y_test)
        # encode label
        assert y_train is not None, ValueError(
            f"{self.target_col_name} does not exist!")
        self.label_encoder = None
        if is_target_need_label_encode(y_train):
            self.label_encoder = LabelEncoder()
            y_train = self.label_encoder.fit_transform(y_train)
            y_test = self.encode_label(y_test)
        if y_train is not None:
            y_train = NdArrayContainer("TrainLabel",
                                       dataset_instance=y_train,
                                       resource_manager=self.resource_manager)
            y_train.upload()
        if y_test is not None:
            y_test = NdArrayContainer("TestLabel",
                                      dataset_instance=y_test,
                                      resource_manager=self.resource_manager)
            y_test.upload()
        self.ml_task: MLTask = get_ml_task_from_y(y_train.data)
        self.y_train = y_train
        self.y_test = y_test
        self.train_label_id = self.y_train.get_hash(
        ) if self.y_train is not None else ""
        self.test_label_id = self.y_test.get_hash(
        ) if self.y_test is not None else ""
        if self.X_train is not None:
            self.columns = self.X_train.columns
        else:
            self.columns = self.X_test.columns
Example #30
    def __init__(
        self,
        url=None,
        email=None,
        password=None,
        user_id=None,
        user_token=None,
    ):

        if url is None:
            # url = "http://192.168.1.182:9901"
            url = os.getenv("XENON_URL", "https://xacs.nitrogen.fun:9090")
        # todo: add an encrypt field
        self.url = url
        self.user_token = user_token
        self.user_id = user_id
        self.password = password
        self.email = email
        self.db_params = {
            "http_client": True,
            "url": url,
            "headers": {
                'Content-Type': 'application/json',
                'accept': 'application/json',
            }
        }
        token_dir = f"{os.getenv('HOME')}/autoflow/auth"
        token_file = f"{token_dir}/config.json"
        self.login_logger = get_logger("Login")
        if email is None or password is None:
            self.login_logger.info(
                "'email' or 'password' is None, try to "
                "verify User Authentication by 'user_id' and 'user_token'.")
            if user_id is None or user_token is None:
                self.login_logger.info(
                    "'user_id' and 'user_token' is None, "
                    f"try to load token file '{token_file}'")
                if not Path(token_file).exists():
                    self.login_logger.error(
                        f"user_token file '{token_file} do not exists! AutoFlow-SDK will exit..."
                    )
                    sys.exit(-1)
                config_data = json.loads(Path(token_file).read_text())
                if "user_token" not in config_data or "user_id" not in config_data:
                    self.login_logger.error(
                        f"'user_token' and 'user_id' did not exist in '{token_file}'! AutoFlow-SDK will exit..."
                    )
                    sys.exit(-1)
                self.user_token = config_data["user_token"]
                self.user_id = config_data["user_id"]
            self.db_params["headers"].update({
                "user_id": str(self.user_id),
                "user_token": self.user_token
            })
        else:
            self.db_params["user"] = self.email
            self.db_params["password"] = self.password
            self.user_id, self.user_token = self.login()
            Path(token_dir).mkdir(parents=True, exist_ok=True)
            Path(token_file).write_text(
                json.dumps({
                    "user_id": self.user_id,
                    "user_token": self.user_token
                }))
        super(HttpResourceManager,
              self).__init__(store_path="xenon",
                             db_params=self.db_params,
                             user_id=self.user_id,
                             file_system="nitrogen",
                             file_system_params={"db_params": self.db_params},
                             del_local_log_path=False)