Example #1
 def _get_logger(self, name):
     logger_name = 'SolnML-%s(%d)' % (name, self.seed)
     setup_logger(
         os.path.join(self.output_dir, '%s.log' % str(logger_name)),
         self.logging_config,
     )
     return get_logger(logger_name)
Example #2
 def _get_logger(self, name):
     import os
     logger_name = 'solnml-%s-%d:%s' % (self.task_id, self._seed, name)
     setup_logger(os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
                  self.logging_config,
                  )
     return get_logger(logger_name)
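Both logger factories above delegate to two helpers, setup_logger and get_logger, that none of the examples define. Below is a minimal sketch of what they might look like on top of the standard logging module; the file-handler setup and the exact signatures here are assumptions, not the library's actual code:

    import logging
    import os

    def setup_logger(output_path, logging_config=None):
        # Hypothetical: route all records to a per-run log file.
        log_dir = os.path.dirname(output_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        handler = logging.FileHandler(output_path)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
        root = logging.getLogger()
        root.addHandler(handler)
        root.setLevel(logging.DEBUG)

    def get_logger(name):
        # Named loggers form a hierarchy and inherit the root handlers.
        return logging.getLogger(name)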
Example #3
    def __init__(self,
                 estimator_id,
                 scorer=None,
                 data_node=None,
                 task_type=REGRESSION,
                 resampling_strategy='cv',
                 resampling_params=None,
                 timestamp=None,
                 output_dir=None,
                 seed=1,
                 if_imbal=False):
        self.resampling_strategy = resampling_strategy
        self.resampling_params = resampling_params

        self.estimator_id = estimator_id
        self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
        self.task_type = task_type
        self.data_node = data_node
        self.output_dir = output_dir
        self.seed = seed
        self.onehot_encoder = None
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.continue_training = False

        self.train_node = data_node.copy_()
        self.val_node = data_node.copy_()

        self.timestamp = timestamp
        # TODO: Top-k k?
        self.topk_model_saver = CombinedTopKModelSaver(
            k=60, model_dir=self.output_dir, identifier=timestamp)
Example #4
    def __init__(self,
                 evaluator: _BaseEvaluator,
                 config_space,
                 name,
                 timestamp,
                 eval_type,
                 output_dir=None,
                 seed=None):
        self.evaluator = evaluator
        self.config_space = config_space

        assert name in ['hpo', 'fe']
        self.name = name
        self.seed = np.random.random_integers(
            MAX_INT) if seed is None else seed
        self.start_time = time.time()
        self.timing_list = list()
        self.incumbent = None
        self.eval_type = eval_type
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.init_hpo_iter_num = None
        self.early_stopped_flag = False
        self.timestamp = timestamp
        self.output_dir = output_dir
        self.topk_saver = CombinedTopKModelSaver(k=50,
                                                 model_dir=self.output_dir,
                                                 identifier=self.timestamp)
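One caveat with `assert name in ['hpo', 'fe']` above: assertions are stripped when Python runs with -O, so the guard silently disappears in optimized mode. If the check matters at runtime, an explicit exception is safer; a small alternative sketch:

    def _check_name(name):
        # Unlike assert, this check survives python -O.
        if name not in ('hpo', 'fe'):
            raise ValueError("name must be 'hpo' or 'fe', got %r" % (name,))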
Example #5
 def __init__(self,
              clf_config,
              task_type,
              model_dir='data/dl_models/',
              max_epoch=150,
              scorer=None,
              dataset=None,
              continue_training=True,
              device='cpu',
              seed=1,
              timestamp=None,
              **kwargs):
     self.hpo_config = clf_config
     self.task_type = task_type
     self.max_epoch = max_epoch
     self.scorer = scorer if scorer is not None else accuracy_scorer
     self.dataset = copy.deepcopy(dataset)
     self.continue_training = continue_training
     self.seed = seed
     self.timestamp = timestamp
     self.eval_id = 0
     self.onehot_encoder = None
     self.topk_model_saver = TopKModelSaver(k=20,
                                            model_dir=model_dir,
                                            identifier=timestamp)
     self.model_dir = model_dir
     self.device = device
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     if task_type == IMG_CLS:
         self.image_size = kwargs['image_size']
Example #6
    def __init__(self,
                 fixed_config=None,
                 scorer=None,
                 data_node=None,
                 task_type=0,
                 resampling_strategy='cv',
                 resampling_params=None,
                 timestamp=None,
                 output_dir=None,
                 seed=1,
                 if_imbal=False):
        self.resampling_strategy = resampling_strategy
        self.resampling_params = resampling_params

        self.fixed_config = fixed_config
        self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
        self.if_imbal = if_imbal
        self.task_type = task_type
        self.data_node = data_node
        self.output_dir = output_dir
        self.seed = seed
        self.onehot_encoder = None
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.continue_training = False

        self.train_node = data_node.copy_()
        self.val_node = data_node.copy_()

        self.timestamp = timestamp
Example #7
    def __init__(self, clf_config, fe_config, estimator_id, if_imbal=False, scorer=None, data_node=None, name=None,
                 resampling_strategy='cv', resampling_params=None, seed=1,
                 timestamp=None, output_dir=None):
        self.resampling_strategy = resampling_strategy
        self.resampling_params = resampling_params
        self.hpo_config = clf_config

        # TODO: Optimize: Fit the same transformers only once
        self.fe_config = fe_config
        self.estimator_id = estimator_id
        self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
        self.if_imbal = if_imbal
        self.data_node = data_node
        self.name = name
        self.seed = seed
        self.onehot_encoder = None
        self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

        self.output_dir = output_dir
        self.timestamp = timestamp

        self.train_node = data_node.copy_()
        self.val_node = data_node.copy_()

        self.continue_training = False

        self.topk_model_saver = BanditTopKModelSaver(k=60, model_dir=self.output_dir, identifier=timestamp)
Example #8
    def __init__(self, eval_func, config_space,
                 seed=1, R=81, eta=3, n_jobs=1):
        self.eval_func = eval_func
        self.config_space = config_space
        self.n_workers = n_jobs

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.incumbent_configs = list()
        self.incumbent_perfs = list()
        self.evaluation_stats = dict()
        self.evaluation_stats['timestamps'] = list()
        self.evaluation_stats['val_scores'] = list()
        self.global_start_time = time.time()
        self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

        # Parameters in Hyperband framework.
        self.restart_needed = True
        self.R = R
        self.eta = eta
        self.seed = seed
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.B = (self.s_max + 1) * self.R
        self.s_values = list(reversed(range(self.s_max + 1)))
        self.inner_iter_id = 0

        # Parameters in MFSE-HB.
        self.weight_update_id = 0
        self.iterate_r = []
        self.target_x = dict()
        self.target_y = dict()
        self.exp_output = dict()
        for index, item in enumerate(np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
            r = int(item)
            self.iterate_r.append(r)
            self.target_x[r] = list()
            self.target_y[r] = list()

        types, bounds = get_types(self.config_space)
        self.num_config = len(bounds)
        init_weight = [1. / self.s_max] * self.s_max + [0.]
        self.weighted_surrogate = WeightedRandomForestCluster(types, bounds, self.s_max,
                                                              self.eta, init_weight, 'gpoe')
        self.weight_changed_cnt = 0
        self.hist_weights = list()

        self.weighted_acquisition_func = EI(model=self.weighted_surrogate)
        self.weighted_acq_optimizer = RandomSampling(self.weighted_acquisition_func,
                                                     self.config_space,
                                                     n_samples=2000,
                                                     rng=np.random.RandomState(seed))
        self.eval_dict = dict()
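The Hyperband bookkeeping is easier to follow with the defaults plugged in. With R=81 and eta=3, s_max is the largest s with eta**s <= R, so s_max = 4, B = (s_max + 1) * R = 405, and the resource ladder iterate_r is [1, 3, 9, 27, 81]. A standalone sketch of the same derivation in integer arithmetic (the snippet computes s_max as int(log(R)/log(eta)), which gives the same value up to floating-point care):

    R, eta = 81, 3

    # Largest s such that eta**s <= R.
    s_max = 0
    while eta ** (s_max + 1) <= R:
        s_max += 1

    B = (s_max + 1) * R                               # total budget over all brackets
    s_values = list(reversed(range(s_max + 1)))       # brackets, most aggressive first
    iterate_r = [eta ** i for i in range(s_max + 1)]  # resource levels per rung

    print(s_max, B, s_values, iterate_r)
    # 4 405 [4, 3, 2, 1, 0] [1, 3, 9, 27, 81]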
Example #9
 def __init__(self, name, task_type, datanode, seed=1):
     self.name = name
     self._seed = seed
     self.root_node = datanode.copy_()
     self.incumbent = self.root_node
     self.task_type = task_type
     self.graph = TransformationGraph()
     self.graph.add_node(self.root_node)
     self.time_budget = None
     self.maximum_evaluation_num = None
     self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
Example #10
 def __init__(self, evaluator: _BaseEvaluator, config_space, seed=None):
     self.evaluator = evaluator
     self.config_space = config_space
     self.seed = np.random.random_integers(
         MAX_INT) if seed is None else seed
     self.start_time = time.time()
     self.timing_list = list()
     self.incumbent = None
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     self.init_hpo_iter_num = None
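Note that np.random.random_integers (used here and in Example #4 for the default seed) has been deprecated since NumPy 1.11 and warns or fails on recent releases. A hedged drop-in using randint; the only behavioral difference is that randint's upper bound is exclusive and its range starts at 0 rather than 1, which is harmless for seeding:

    import numpy as np

    MAX_INT = 2 ** 31 - 1  # assumption: stand-in for the library's MAX_INT constant

    def default_seed(seed=None):
        # random_integers(MAX_INT) drew from [1, MAX_INT];
        # randint(MAX_INT) draws from [0, MAX_INT).
        return np.random.randint(MAX_INT) if seed is None else seed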
Example #11
    def __init__(self,
                 n_algorithm=3,
                 task_type=None,
                 metric='acc',
                 rep=3,
                 total_resource=20,
                 meta_algorithm='lightgbm',
                 exclude_datasets=None,
                 meta_dir=None):
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.n_algorithm = n_algorithm
        self.n_algo_candidates = len(_buildin_algorithms)
        self.task_type = task_type
        self.meta_algo = meta_algorithm
        if task_type in CLS_TASKS:
            if metric not in ['acc', 'bal_acc']:
                self.logger.info(
                    'Meta information about metric-%s does not exist, use accuracy instead.'
                    % str(metric))
                metric = 'acc'
        elif task_type in REG_TASKS:
            raise NotImplementedError()
        else:
            raise ValueError('Invalid task type: %s.' % str(task_type))

        self.metric = metric
        self.rep = rep
        self.total_resource = total_resource
        self.exclude_datasets = exclude_datasets
        self.meta_learner = None
        buildin_loc = os.path.dirname(__file__) + '/../meta_resource/'
        self.meta_dir = meta_dir if meta_dir is not None else buildin_loc

        if self.exclude_datasets is None:
            self.hash_id = 'none'
        else:
            self.exclude_datasets = list(set(exclude_datasets))
            exclude_str = ','.join(sorted(self.exclude_datasets))
            md5 = hashlib.md5()
            md5.update(exclude_str.encode('utf-8'))
            self.hash_id = md5.hexdigest()
        # Normalize the directory before building paths from it.
        if not self.meta_dir.endswith('/'):
            self.meta_dir += '/'
        meta_datasets = set()
        meta_runs_dir = self.meta_dir + 'meta_runs/%s/' % self.metric
        for _record in os.listdir(meta_runs_dir):
            if _record.endswith('.pkl') and _record.find('-') != -1:
                meta_name = '-'.join(_record.split('-')[:-4])
                if self.exclude_datasets is not None and meta_name in self.exclude_datasets:
                    continue
                meta_datasets.add(meta_name)
        self._buildin_datasets = list(meta_datasets)
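The hash_id logic gives a stable cache key for a set of excluded datasets: deduplicating and sorting the names before hashing makes the digest independent of input order. The same idea as a standalone helper:

    import hashlib

    def exclusion_hash(exclude_datasets):
        # 'none' when nothing is excluded; otherwise a digest that is stable
        # under reordering and duplication of the input names.
        if exclude_datasets is None:
            return 'none'
        exclude_str = ','.join(sorted(set(exclude_datasets)))
        return hashlib.md5(exclude_str.encode('utf-8')).hexdigest()

    assert exclusion_hash(['b', 'a']) == exclusion_hash(['a', 'b', 'a'])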
Example #12
 def __init__(self, clf_config, scorer=None, data_node=None, name=None,
              resampling_strategy='cv', resampling_params=None, seed=1):
     self.resampling_strategy = resampling_strategy
     self.resampling_params = resampling_params
     self.hpo_config = clf_config
     self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
     self.data_node = data_node
     self.name = name
     self.seed = seed
     self.eval_id = 0
     self.onehot_encoder = None
     self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
Example #13
 def __init__(self, name, task_type, datanode, seed=1):
     self.name = name
     self._seed = seed
     self.root_node = datanode.copy_()
     self.incumbent = self.root_node
     self.task_type = task_type
     self.graph = TransformationGraph()
     self.graph.add_node(self.root_node)
     self.time_budget = None
     self.maximum_evaluation_num = None
     logger_name = '%s(%d)' % (self.name, self._seed)
     self.logger = get_logger(logger_name)
Example #14
 def __init__(self, reg_config, scorer=None, data_node=None, name=None,
              resampling_strategy='holdout', resampling_params=None, seed=1,
              estimator=None):
     self.hpo_config = reg_config
     self.scorer = scorer
     self.data_node = data_node
     self.name = name
     self.estimator = estimator
     self.resampling_strategy = resampling_strategy
     self.resampling_params = resampling_params
     self.seed = seed
     self.eval_id = 0
     self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
Example #15
 def __init__(self,
              n_algorithm=3,
              task_type=None,
              metric='acc',
              exclude_datasets=None):
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     super().__init__(n_algorithm,
                      task_type,
                      metric=metric,
                      meta_algorithm='lightgbm',
                      exclude_datasets=exclude_datasets)
     self.model = None
Example #16
 def __init__(self,
              rep=3,
              metric='acc',
              n_algorithm=3,
              task_type=None,
              total_resource=1200,
              exclude_datasets=None,
              meta_dir=None):
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     super().__init__(n_algorithm, task_type, metric, rep, total_resource,
                      'ranknet', exclude_datasets, meta_dir)
     self.model = None
Example #17
    def __init__(self,
                 eval_func,
                 config_space,
                 seed=1,
                 R=81,
                 eta=3,
                 n_jobs=1,
                 output_dir='./'):
        self.eval_func = eval_func
        self.config_space = config_space
        self.n_workers = n_jobs

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.incumbent_configs = list()
        self.incumbent_perfs = list()
        self.evaluation_stats = dict()
        self.evaluation_stats['timestamps'] = list()
        self.evaluation_stats['val_scores'] = list()
        self.global_start_time = time.time()
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)

        # Parameters in Hyperband framework.
        self.restart_needed = True
        self.R = R
        self.eta = eta
        self.seed = seed
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.B = (self.s_max + 1) * self.R
        self.s_values = list(reversed(range(self.s_max + 1)))
        self.inner_iter_id = 0

        # Parameters in MFSE-HB.
        self.iterate_r = []
        self.target_x = dict()
        self.target_y = dict()
        self.exp_output = dict()
        for index, item in enumerate(
                np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
            r = int(item)
            self.iterate_r.append(r)
            self.target_x[r] = list()
            self.target_y[r] = list()

        self.mf_advisor = MFBatchAdvisor(config_space, output_dir=output_dir)
        self.eval_dict = dict()
Example #18
    def __init__(self,
                 task_type,
                 architectures,
                 time_limit,
                 sampling_strategy='uniform',
                 R=27,
                 eta=3,
                 N=9,
                 n_jobs=1):
        self.architectures = architectures
        self.time_limit = time_limit
        self.task_type = task_type
        self.n_jobs = n_jobs
        self.R = R
        self.eta = eta
        self.N = N
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.sampling_strategy = sampling_strategy
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)

        from solnml.components.models.img_classification import _classifiers as _img_estimators, _addons as _img_addons
        from solnml.components.models.text_classification import _classifiers as _text_estimators, \
            _addons as _text_addons
        from solnml.components.models.object_detection import _classifiers as _od_estimators, _addons as _od_addons

        self.elimination_strategy = 'bandit'
        # Runtime stats.
        self.evaluation_stats = dict()

        self.update_cs = dict()

        if task_type == IMG_CLS:
            self._estimators = _img_estimators
            self._addons = _img_addons
        elif task_type == TEXT_CLS:
            self._estimators = _text_estimators
            self._addons = _text_addons
        elif task_type == OBJECT_DET:
            self._estimators = _od_estimators
            self._addons = _od_addons
        else:
            raise ValueError("Unknown task type %s" % task_type)
        self.eval_hist_configs = dict()
        self.eval_hist_perfs = dict()

        self.tpe_config_gen = dict()
        self.mfse_config_gen = dict()
Example #19
    def __init__(self,
                 scorer=None,
                 data_node=None,
                 task_type=0,
                 resampling_strategy='cv',
                 resampling_params=None,
                 seed=1):
        self.resampling_strategy = resampling_strategy
        self.resampling_params = resampling_params
        self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
        self.data_node = data_node
        self.seed = seed
        self.eval_id = 0
        self.onehot_encoder = None
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.continue_training = False

        tmp_evaluator = ClassificationEvaluator(None)
        self.tmp_bo = AnotherBayesianOptimizationOptimizer(
            task_type, data_node, tmp_evaluator, 'adaboost', 1, 1, 1)
Example #20
    def __init__(self,
                 stats,
                 ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 max_epoch: int,
                 metric: _BaseScorer,
                 timestamp: float,
                 output_dir=None,
                 device='cpu'):
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.max_epoch = max_epoch
        self.metric = metric
        self.output_dir = output_dir
        self.device = device

        self.seed = 1
        self.timestamp = str(timestamp)
        logger_name = 'EnsembleBuilder'
        self.logger = get_logger(logger_name)
Example #21
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               task_type,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    func_cls = [
        'NumberOfClasses', 'LogNumberOfFeatures', 'ClassProbabilityMin',
        'ClassProbabilityMax', 'ClassProbabilityMean', "ClassProbabilitySTD",
        'ClassEntropy', 'LandmarkLDA', 'LandmarkNaiveBayes',
        'LandmarkDecisionTree', 'LandmarkDecisionNodeLearner',
        'LandmarkRandomNodeLearner', 'LandmarkWorstNodeLearner', 'Landmark1NN'
    ]

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue
        if name in func_cls and task_type not in CLS_TASKS:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)

                imputer = SimpleImputer(strategy='most_frequent', copy=False)
                X_transformed = imputer.fit_transform(X.copy())
                if any(categorical):
                    categorical_idx = [
                        idx for idx, i in enumerate(categorical) if i
                    ]
                    ohe = ColumnTransformer(
                        [('one-hot', OneHotEncoder(), categorical_idx)],
                        remainder="passthrough")
                    X_transformed = ohe.fit_transform(X_transformed)

                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.debug("%s: Going to calculate: %s", dataset_name,
                             dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.debug("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_, task_type=task_type)
    return mf_
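The deque loop above resolves metafeature dependencies lazily: when a name depends on a metafeature that has not been computed yet, the name is pushed back with appendleft and retried after the rest of the queue. A stripped-down sketch of that requeue pattern (the names and the single-dependency map are illustrative, not the library's):

    from collections import deque

    deps = {'a': None, 'b': 'a', 'c': 'b'}  # each item's single dependency
    computed = {}

    to_visit = deque(['a', 'b', 'c'])
    while to_visit:
        name = to_visit.pop()
        dep = deps[name]
        if dep is not None and dep not in computed:
            # Not ready yet: requeue and retry after the rest of the queue.
            # (Beware: a dependency cycle would loop forever.)
            to_visit.appendleft(name)
            continue
        computed[name] = True

    print(list(computed))  # ['a', 'b', 'c']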
Example #22
    def __init__(self,
                 stats,
                 ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 data_node,
                 output_dir=None):
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.metric = metric
        self.output_dir = output_dir
        self.node = data_node

        self.predictions = []
        self.train_labels = None
        self.timestamp = str(time.time())
        logger_name = 'EnsembleBuilder'
        self.logger = get_logger(logger_name)

        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (_, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model, _ = pkl.load(f)
                _node = self.node.copy_()
                _node = construct_node(_node, op_list)

                # TODO: Test size
                test_size = 0.33
                X, y = _node.data

                if self.task_type in CLS_TASKS:
                    ss = StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=1)
                else:
                    ss = ShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=1)

                for train_index, val_index in ss.split(X, y):
                    X_valid = X[val_index]
                    y_valid = y[val_index]

                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid

                if self.task_type in CLS_TASKS:
                    y_valid_pred = model.predict_proba(X_valid)
                else:
                    y_valid_pred = model.predict(X_valid)
                self.predictions.append(y_valid_pred)

        if len(self.predictions) < self.ensemble_size:
            self.ensemble_size = len(self.predictions)

        if ensemble_method == 'ensemble_selection':
            return

        if task_type in CLS_TASKS:
            self.base_model_mask = choose_base_models_classification(
                np.array(self.predictions), self.ensemble_size)
        else:
            self.base_model_mask = choose_base_models_regression(
                np.array(self.predictions), np.array(y_valid),
                self.ensemble_size)
        self.ensemble_size = sum(self.base_model_mask)
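Since n_splits=1, the for loop over ss.split(X, y) executes exactly once and only the validation indices are kept. The same split can be taken with next(), which makes the single-split intent explicit; a small equivalent sketch with toy data:

    import numpy as np
    from sklearn.model_selection import StratifiedShuffleSplit

    X = np.arange(20).reshape(10, 2)
    y = np.array([0, 1] * 5)

    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=1)
    train_index, val_index = next(ss.split(X, y))  # single split, no loop needed
    X_valid, y_valid = X[val_index], y[val_index]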
Example #23
    def __init__(self,
                 n_algorithm=3,
                 task_type=None,
                 metric='bal_acc',
                 rep=3,
                 total_resource=1200,
                 meta_algorithm='lightgbm',
                 exclude_datasets=None,
                 meta_dir=None):
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.n_algorithm = n_algorithm
        self.n_algo_candidates = len(_cls_builtin_algorithms)
        self.task_type = task_type
        self.meta_algo = meta_algorithm
        self.rep = rep
        self.metric = metric
        if task_type in CLS_TASKS:
            self.algorithms = _cls_builtin_algorithms
            self.n_algo_candidates = len(_cls_builtin_algorithms)
            if metric not in ['acc', 'bal_acc']:
                self.logger.info(
                    'Meta information about metric-%s does not exist, use accuracy instead.'
                    % str(metric))
                metric = 'acc'
        elif task_type in RGS_TASKS:
            self.algorithms = _rgs_builtin_algorithms
            self.n_algo_candidates = len(_rgs_builtin_algorithms)
            if metric not in ['mse']:
                self.logger.info(
                    'Meta information about metric-%s does not exist, use mse instead.'
                    % str(metric))
                metric = 'mse'
        else:
            raise ValueError('Invalid task type: %s.' % str(task_type))

        self.total_resource = total_resource
        self.exclude_datasets = exclude_datasets

        builtin_loc = os.path.dirname(__file__)
        builtin_loc = os.path.join(builtin_loc, '..')
        builtin_loc = os.path.join(builtin_loc, 'meta_resource')
        self.meta_dir = meta_dir if meta_dir is not None else builtin_loc

        if self.exclude_datasets is None:
            self.hash_id = 'none'
        else:
            self.exclude_datasets = list(set(exclude_datasets))
            exclude_str = ','.join(sorted(self.exclude_datasets))
            md5 = hashlib.md5()
            md5.update(exclude_str.encode('utf-8'))
            self.hash_id = md5.hexdigest()
        meta_datasets = set()
        _folder = os.path.join(self.meta_dir, 'meta_dataset_vec')

        if task_type in CLS_TASKS:
            task_prefix = 'cls'
        else:
            task_prefix = 'rgs'

        embedding_path = os.path.join(
            _folder, '%s_meta_dataset_embedding.pkl' % task_prefix)
        with open(embedding_path, 'rb') as f:
            d = pkl.load(f)
            meta_datasets = d['task_ids']

        self._builtin_datasets = sorted(list(meta_datasets))

        self.metadata_manager = MetaDataManager(self.meta_dir,
                                                self.algorithms,
                                                self._builtin_datasets,
                                                metric,
                                                total_resource,
                                                task_type=task_type,
                                                rep=rep)
        self.meta_learner = None
Example #24
    def __init__(self,
                 node_list,
                 node_index,
                 task_type,
                 timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 trial_num=0,
                 time_limit=None,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        # Tree setting
        self.node_list = node_list
        self.node_index = node_index

        # Set up backend.
        self.dataset_name = dataset_name
        self.trial_num = trial_num
        self.time_limit = time_limit
        self.per_run_time_limit = per_run_time_limit
        self.start_time = time.time()
        self.logger = get_logger('Soln-ml: %s' % dataset_name)

        # Basic settings.
        self.eval_type = eval_type
        self.resampling_params = resampling_params
        self.task_type = task_type
        self.timestamp = timestamp
        self.fe_config_space = fe_config_space
        self.cash_config_space = cash_config_space
        self.fixed_config = fixed_config
        self.original_data = data.copy_()
        self.metric = get_metric(metric)
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.n_jobs = n_jobs
        self.seed = seed
        self.output_dir = output_dir

        self.early_stop_flag = False
        self.timeout_flag = False
        self.incumbent_perf = -float("INF")
        self.incumbent = None
        self.eval_dict = dict()

        if self.task_type in CLS_TASKS:
            self.if_imbal = is_imbalanced_dataset(self.original_data)
        else:
            self.if_imbal = False

        self.es = None
Example #25
    def __init__(self,
                 eval_func,
                 config_space,
                 config_generator='tpe',
                 seed=1,
                 R=27,
                 eta=3,
                 n_jobs=1):
        self.eval_func = eval_func
        self.config_space = config_space
        self.config_generator = config_generator
        self.n_workers = n_jobs

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.incumbent_configs = list()
        self.incumbent_perfs = list()
        self.global_start_time = time.time()
        self.time_ticks = list()
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)

        # Parameters in Hyperband framework.
        self.restart_needed = True
        self.R = R
        self.eta = eta
        self.seed = seed
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.B = (self.s_max + 1) * self.R
        self.s_values = list(reversed(range(self.s_max + 1)))
        self.inner_iter_id = 0

        # Parameters in BOHB.
        self.iterate_r = list()
        self.target_x = dict()
        self.target_y = dict()
        self.exp_output = dict()
        for index, item in enumerate(
                np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
            r = int(item)
            self.iterate_r.append(r)
            self.target_x[r] = list()
            self.target_y[r] = list()

        types, bounds = get_types(self.config_space)
        self.num_config = len(bounds)
        self.surrogate = RandomForestWithInstances(types, bounds)

        # self.executor = ParallelEvaluator(self.eval_func, n_worker=n_jobs)
        # self.executor = ParallelProcessEvaluator(self.eval_func, n_worker=n_jobs)
        self.acquisition_func = EI(model=self.surrogate)
        self.acq_optimizer = RandomSampling(self.acquisition_func,
                                            self.config_space,
                                            n_samples=2000,
                                            rng=np.random.RandomState(seed))

        self.config_gen = TPE(config_space)

        self.eval_dict = dict()
Example #26
 def __init__(self):
     self.logger = get_logger(__name__)
Example #27
    def __init__(self,
                 task_type,
                 trial_num,
                 classifier_ids: List[str],
                 data: DataNode,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=10,
                 per_run_time_limit=300,
                 output_dir=None,
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 share_feature=False,
                 inner_opt_algorithm='rb',
                 fe_algo='bo',
                 time_limit=None,
                 n_jobs=1,
                 seed=1):
        """
        :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb',
        'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
        'random_forest','sgd'}
        """
        self.timestamp = time.time()
        self.task_type = task_type
        self.metric = get_metric(metric)
        self.original_data = data.copy_()
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.trial_num = trial_num
        self.n_jobs = n_jobs
        self.alpha = 4
        self.B = 0.01
        self.seed = seed
        self.shared_mode = share_feature
        self.output_dir = output_dir
        np.random.seed(self.seed)

        # Best configuration.
        self.optimal_algo_id = None
        self.nbest_algo_ids = None
        self.best_lower_bounds = None
        self.es = None

        # Set up backend.
        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.start_time = time.time()
        self.logger = get_logger('Soln-ml: %s' % dataset_name)

        # Bandit settings.
        self.incumbent_perf = -1.
        self.arms = classifier_ids
        self.include_algorithms = classifier_ids
        self.rewards = dict()
        self.sub_bandits = dict()
        self.evaluation_cost = dict()
        self.fe_datanodes = dict()
        self.eval_type = eval_type
        self.fe_algo = fe_algo
        self.inner_opt_algorithm = inner_opt_algorithm
        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()
            self.fe_datanodes[arm] = list()
            self.sub_bandits[arm] = SecondLayerBandit(
                self.task_type,
                arm,
                self.original_data,
                metric=self.metric,
                output_dir=output_dir,
                per_run_time_limit=per_run_time_limit,
                share_fe=self.shared_mode,
                seed=self.seed,
                eval_type=eval_type,
                dataset_id=dataset_name,
                n_jobs=self.n_jobs,
                fe_algo=fe_algo,
                mth=inner_opt_algorithm,
            )

        self.action_sequence = list()
        self.final_rewards = list()
        self.start_time = time.time()
        self.time_records = list()
Example #28
    def __init__(self,
                 configspace,
                 min_points_in_model=None,
                 top_n_percent=15,
                 num_samples=64,
                 random_fraction=1 / 3,
                 bandwidth_factor=3,
                 min_bandwidth=1e-3,
                 **kwargs):
        """
            Fits for each given budget a kernel density estimator on the best N percent of the
            evaluated configurations on this budget.


            Parameters:
            -----------
            configspace: ConfigSpace
                Configuration space object
            top_n_percent: int
                Determines the percentile of configurations that will be used as training data
                for the kernel density estimator, e.g if set to 10 the 10% best configurations will be considered
                for training.
            min_points_in_model: int
                minimum number of datapoints needed to fit a model
            num_samples: int
                number of samples drawn to optimize EI via sampling
            random_fraction: float
                fraction of random configurations returned
            bandwidth_factor: float
                widens the bandwidth for contiuous parameters for proposed points to optimize EI
            min_bandwidth: float
                to keep diversity, even when all (good) samples have the same value for one of the parameters,
                a minimum bandwidth (Default: 1e-3) is used instead of zero.

        """
        super().__init__(**kwargs)
        self.top_n_percent = top_n_percent
        self.configspace = configspace
        self.bw_factor = bandwidth_factor
        self.min_bandwidth = min_bandwidth

        n_hps = len(self.configspace.get_hyperparameters())
        self.min_points_in_model = min_points_in_model
        if min_points_in_model is None or min_points_in_model < n_hps + 1:
            self.min_points_in_model = n_hps + 1

        self.num_samples = num_samples
        self.random_fraction = random_fraction

        hps = self.configspace.get_hyperparameters()

        self.kde_vartypes = ""
        self.vartypes = []

        for h in hps:
            if hasattr(h, 'sequence'):
                raise RuntimeError(
                    'This version of BOHB does not support ordinal hyperparameters. Please encode %s as an integer parameter!'
                    % (h.name))

            if hasattr(h, 'choices'):
                self.kde_vartypes += 'u'
                self.vartypes += [len(h.choices)]
            else:
                self.kde_vartypes += 'c'
                self.vartypes += [0]

        self.vartypes = np.array(self.vartypes, dtype=int)

        # store precomputed probs for the categorical parameters
        self.cat_probs = []

        self.configs = dict()
        self.losses = dict()
        self.good_config_rankings = dict()
        self.kde_models = dict()
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
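The kde_vartypes/vartypes bookkeeping encodes each hyperparameter for the downstream kernel density estimator: 'u' (unordered discrete) plus the number of choices for categoricals, and 'c' (continuous) plus 0 for everything else. A toy sketch of the same encoding, assuming the ConfigSpace package and the accessor names used above:

    import numpy as np
    from ConfigSpace import ConfigurationSpace
    from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                             UniformFloatHyperparameter)

    cs = ConfigurationSpace()
    cs.add_hyperparameter(CategoricalHyperparameter('kernel', ['rbf', 'poly', 'linear']))
    cs.add_hyperparameter(UniformFloatHyperparameter('C', 0.01, 100.0))

    kde_vartypes, vartypes = '', []
    for h in cs.get_hyperparameters():
        if hasattr(h, 'choices'):        # categorical -> unordered discrete
            kde_vartypes += 'u'
            vartypes.append(len(h.choices))
        else:                            # numeric -> continuous
            kde_vartypes += 'c'
            vartypes.append(0)

    vartypes = np.array(vartypes, dtype=int)
    print(kde_vartypes, vartypes)  # one marker and one count per hyperparameter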
Example #29
    def __init__(self,
                 stats,
                 ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 base_save=False,
                 output_dir=None):
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.metric = metric
        self.output_dir = output_dir

        self.train_predictions = []
        self.config_list = []
        self.train_data_dict = {}
        self.train_labels = None
        self.seed = self.stats['split_seed']
        self.timestamp = str(time.time())
        logger_name = 'EnsembleBuilder'
        self.logger = get_logger(logger_name)
        model_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                X, y = node.data

                # TODO: Hyperparameter
                test_size = 0.33

                if self.task_type in CLS_TASKS:
                    ss = StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=self.seed)
                else:
                    ss = ShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=self.seed)

                for train_index, test_index in ss.split(X, y):
                    X_train, X_valid = X[train_index], X[test_index]
                    y_train, y_valid = y[train_index], y[test_index]

                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid

                estimator = fetch_predict_estimator(
                    self.task_type,
                    config,
                    X_train,
                    y_train,
                    weight_balance=node.enable_balance,
                    data_balance=node.data_balance)
                if base_save:  # For ensemble selection
                    with open(
                            os.path.join(
                                self.output_dir,
                                '%s-model%d' % (self.timestamp, model_cnt)),
                            'wb') as f:
                        pkl.dump(estimator, f)

                if self.task_type in CLS_TASKS:
                    y_valid_pred = estimator.predict_proba(X_valid)
                else:
                    y_valid_pred = estimator.predict(X_valid)
                self.train_predictions.append(y_valid_pred)
                model_cnt += 1

        if len(self.train_predictions) < self.ensemble_size:
            self.ensemble_size = len(self.train_predictions)

        if ensemble_method == 'ensemble_selection':
            return

        if task_type in CLS_TASKS:
            self.base_model_mask = choose_base_models_classification(
                np.array(self.train_predictions), self.ensemble_size)
        else:
            self.base_model_mask = choose_base_models_regression(
                np.array(self.train_predictions), np.array(y_valid),
                self.ensemble_size)
        self.ensemble_size = sum(self.base_model_mask)
Example #30
    def __init__(self,
                 task_type,
                 trial_num,
                 classifier_ids: List[str],
                 data: DataNode,
                 include_preprocessors=None,
                 time_limit=None,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 inner_opt_algorithm='fixed',
                 enable_fe=True,
                 fe_algo='bo',
                 n_jobs=1,
                 seed=1):
        """
        :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb',
        'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
        'random_forest','sgd'}
        """
        self.timestamp = time.time()
        self.task_type = task_type
        self.include_preprocessors = include_preprocessors
        self.metric = get_metric(metric)
        self.original_data = data.copy_()
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.trial_num = trial_num
        self.n_jobs = n_jobs
        self.alpha = 4
        self.seed = seed
        self.output_dir = output_dir
        self.early_stop_flag = False
        # np.random.seed(self.seed)

        # Best configuration.
        self.optimal_algo_id = None
        self.nbest_algo_ids = None
        self.best_lower_bounds = None
        self.es = None

        # Set up backend.
        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.start_time = time.time()
        self.logger = get_logger('Soln-ml: %s' % dataset_name)

        # Bandit settings.
        self.incumbent_perf = -float("INF")
        self.arms = classifier_ids
        self.include_algorithms = classifier_ids
        self.rewards = dict()
        self.sub_bandits = dict()
        self.evaluation_cost = dict()
        self.eval_type = eval_type
        self.enable_fe = enable_fe
        self.fe_algo = fe_algo
        self.inner_opt_algorithm = inner_opt_algorithm

        if not (self.time_limit is None) ^ (self.trial_num is None):
            raise ValueError('Please set exactly one of time_limit or trial_num.')

        # Record the execution cost for each arm.
        self.arm_cost_stats = dict()
        for _arm in self.arms:
            self.arm_cost_stats[_arm] = list()

        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()
            self.sub_bandits[arm] = SecondLayerBandit(
                self.task_type,
                arm,
                self.original_data,
                include_preprocessors=self.include_preprocessors,
                metric=self.metric,
                output_dir=output_dir,
                per_run_time_limit=per_run_time_limit,
                seed=self.seed,
                eval_type=eval_type,
                dataset_id=dataset_name,
                n_jobs=self.n_jobs,
                fe_algo=fe_algo,
                mth=self.inner_opt_algorithm,
                timestamp=self.timestamp)

        self.action_sequence = list()
        self.final_rewards = list()
        self.start_time = time.time()
        self.time_records = list()
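The XOR guard in this constructor enforces that exactly one of time_limit and trial_num is provided: (a is None) ^ (b is None) is true only when one is None and the other is not. A minimal standalone version of the same check:

    def check_budget(time_limit=None, trial_num=None):
        # Exactly one of the two budget knobs must be set.
        if not (time_limit is None) ^ (trial_num is None):
            raise ValueError('Please set exactly one of time_limit or trial_num.')

    check_budget(time_limit=3600)   # ok
    check_budget(trial_num=100)     # ok
    # check_budget()                # raises: neither is set
    # check_budget(3600, 100)       # raises: both are set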