Exemplo n.º 1
0
 def test_procedure(self):
     """Smoke-test several categorical encoders inside an ML_Workflow regression pipeline."""
     encoder_classes = (
         EntityEncoder,
         TargetEncoder,
         BinaryEncoder,
         CatBoostEncoder,
         OrdinalEncoder,
         LeaveOneOutEncoder,
         OneHotEncoder,
         # WOEEncoder,  # does not support regression
     )
     banner = "========================="
     for encoder_cls in encoder_classes:
         print(banner)
         print(encoder_cls.__name__)
         print(banner)
         tic = time()
         default_hp = get_default_hp_of_cls(encoder_cls)
         encoder = encoder_cls(in_feature_groups="cat",
                               out_feature_groups="num",
                               **default_hp)
         workflow = ML_Workflow(
             steps=[("encoder", encoder),
                    ("rf", RandomForestRegressor(random_state=0))],
             resource_manager=self.mock_resource_manager)
         workflow.fit(X_train=self.X_train,
                      X_valid=self.X_test,
                      y_train=self.y_train,
                      y_valid=self.y_test)
         prediction = workflow.predict(self.X_test)
         print("r2 = ", r2_score(self.y_test.data, prediction))
         print("time = ", time() - tic)
         print("\n" * 2)
Exemplo n.º 2
0
 def test_procedure(self):
     """Smoke-test several numeric scalers inside an ML_Workflow regression pipeline."""
     scaler_classes = (
         MinMaxScaler,
         StandardScaler,
         Normalizer,
         QuantileTransformer,
         RobustScaler,
         KeepGoing,
         # WOEEncoder,  # does not support regression
     )
     banner = "========================="
     for scaler_cls in scaler_classes:
         print(banner)
         print(scaler_cls.__name__)
         print(banner)
         # KeepGoing takes no tunable hyper-parameters
         default_hp = {} if scaler_cls == KeepGoing \
             else get_default_hp_of_cls(scaler_cls)
         tic = time()
         scaler = scaler_cls(in_feature_groups="num",
                             out_feature_groups="scaled",
                             **default_hp)
         workflow = ML_Workflow(
             steps=[("scaler", scaler),
                    ("rf", LinearSVR(random_state=0))],
             resource_manager=self.mock_resource_manager)
         workflow.fit(X_train=self.X_train,
                      X_valid=self.X_test,
                      y_train=self.y_train,
                      y_valid=self.y_test)
         prediction = workflow.predict(self.X_test)
         print("r2 = ", r2_score(self.y_test.data, prediction))
         print("time = ", time() - tic)
         print("\n" * 2)
    def test_classifier(self):
        """Run each text-vectorizing transformer through a tokenizer -> transformer -> RF pipeline on titanic names."""
        train_df = datasets.load("titanic")[["Name", "Survived"]]
        y = np.array(train_df.pop("Survived"))

        X_train, X_test, y_train, y_test = train_test_split(
            train_df, y, test_size=0.2, random_state=0)
        # wrap raw frames/arrays into project containers
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer(
            "TrainLabel",
            dataset_instance=y_train,
            resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer(
            "TestLabel",
            dataset_instance=y_test,
            resource_manager=self.mock_resource_manager)
        X_train.set_feature_groups(["text"])
        X_test.set_feature_groups(["text"])
        transformer_classes = (
            TsvdTransformer,
            NmfTransformer,
            LsiTransformer,
            LdaTransformer,
            RpTransformer,
        )
        banner = "========================="
        for transformer_cls in transformer_classes:
            print(banner)
            print(transformer_cls.__name__)
            print(banner)
            tokenizer = SimpleTokenlizer(
                **get_default_hp_of_cls(SimpleTokenlizer))
            tokenizer.in_feature_groups = "text"
            tokenizer.out_feature_groups = "token"
            transformer = transformer_cls(
                **get_default_hp_of_cls(transformer_cls))
            transformer.in_feature_groups = "token"
            transformer.out_feature_groups = "num"
            classifier = RandomForestClassifier(
                **get_default_hp_of_cls(RandomForestClassifier))
            pipeline = ML_Workflow(
                [("tokenizer", tokenizer),
                 ("transformer", transformer),
                 ("classifier", classifier)],
                resource_manager=self.mock_resource_manager)
            tic = time()
            pipeline.fit(X_train, y_train, X_test, y_test)
            y_pred = pipeline.predict(X_test)
            score = accuracy_score(y_test.data, y_pred)
            toc = time()
            print("score:", score)
            print("time:", toc - tic)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
Exemplo n.º 4
0
 def create_preprocessor(self, dhp: Dict) -> Optional[ML_Workflow]:
     """Build the preprocessing ML_Workflow described by ``dhp[PHASE1]``.

     Each key of the PHASE1 dict looks like ``"cat->num"`` and maps to a
     single-component sub-dict.  Some components force a specific output
     feature-group via ``self.specific_out_feature_groups_mapper``.

     Returns None when no preprocessing steps are produced.
     """
     preprocessing_dict: dict = dhp[PHASE1]
     pipeline_list = []
     for name, sub_dict in preprocessing_dict.items():
         # BUGFIX: the None-guard must run BEFORE sub_dict is dereferenced;
         # the original called list(sub_dict.keys()) first, which raised on
         # a None sub_dict instead of skipping it.
         if sub_dict is None:
             continue
         in_feature_groups, out_feature_groups = self.parse_key(name)
         component_name = list(sub_dict.keys())[0]
         # override the output group when the component mandates one
         specific_out_feature_groups = \
             self.specific_out_feature_groups_mapper.get(component_name)
         if specific_out_feature_groups is not None:
             out_feature_groups = specific_out_feature_groups
         preprocessor = self.create_component(
             sub_dict,
             PHASE1,
             name,
             in_feature_groups,
             out_feature_groups,
         )
         pipeline_list.extend(preprocessor)
     if pipeline_list:
         return ML_Workflow(pipeline_list,
                            self.should_store_intermediate_result,
                            self.resource_manager)
     else:
         return None
Exemplo n.º 5
0
    def test_under_sample(self):
        """Smoke-test imblearn under-sampling balancers inside an ML_Workflow."""
        under_sampler_classes = (
            AllKNN,
            ClusterCentroids,
            CondensedNearestNeighbour,
            EditedNearestNeighbours,
            InstanceHardnessThreshold,
            NearMiss,
            NeighbourhoodCleaningRule,
            OneSidedSelection,
            RandomUnderSampler,
            RepeatedEditedNearestNeighbours,
            TomekLinks,
        )
        banner = "========================="
        for sampler_cls in under_sampler_classes:
            print(banner)
            print(sampler_cls.__name__)
            print(banner)
            balancer = sampler_cls(**get_default_hp_of_cls(sampler_cls))
            classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
            # store intermediate results so the balanced labels can be fetched
            pipeline = ML_Workflow(
                [("balancer", balancer),
                 ("classifier", classifier)],
                resource_manager=self.mock_resource_manager,
                should_store_intermediate_result=True)
            tic = time()
            pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
            balanced_y_train = NdArrayContainer(
                dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
                resource_manager=self.mock_resource_manager)
            print("y_train:")
            print(Counter(self.y_train.data))
            print("balanced y_train:")
            print(Counter(balanced_y_train.data))

            y_pred = pipeline.predict(self.X_test)
            score = accuracy_score(self.y_test.data, y_pred)
            toc = time()
            print("score:", score)
            print("time:", toc - tic)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
Exemplo n.º 6
0
    def test_over_sample(self):
        """Smoke-test imblearn over-sampling balancers inside an ML_Workflow."""
        over_sampler_classes = (
            RandomOverSampler,
            # ADASYN,
            BorderlineSMOTE,
            KMeansSMOTE,
            SMOTE,
            SVMSMOTE,
        )
        banner = "========================="
        for sampler_cls in over_sampler_classes:
            print(banner)
            print(sampler_cls.__name__)
            print(banner)
            balancer = sampler_cls(**get_default_hp_of_cls(sampler_cls))
            classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
            # store intermediate results so the balanced labels can be fetched
            pipeline = ML_Workflow(
                [("balancer", balancer),
                 ("classifier", classifier)],
                resource_manager=self.mock_resource_manager,
                should_store_intermediate_result=True)
            tic = time()
            pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
            balanced_y_train = NdArrayContainer(
                dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
                resource_manager=self.mock_resource_manager)
            print("y_train:")
            print(Counter(self.y_train.data))
            print("balanced y_train:")
            print(Counter(balanced_y_train.data))

            y_pred = pipeline.predict(self.X_test)
            score = accuracy_score(self.y_test.data, y_pred)
            toc = time()
            print("score:", score)
            print("time:", toc - tic)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
Exemplo n.º 7
0
def concat_pipeline(*args) -> Optional[ML_Workflow]:
    """Merge the steps of every ML_Workflow in ``args`` into one workflow.

    Non-workflow arguments are ignored.  The resource manager and the
    ``should_store_intermediate_result`` flag are taken from the last
    workflow encountered.  Returns None when no steps were collected.
    """
    merged_steps = []
    resource_manager = None
    should_store_intermediate_result = False
    for workflow in (a for a in args if isinstance(a, ML_Workflow)):
        merged_steps += workflow.steps
        resource_manager = workflow.resource_manager
        should_store_intermediate_result = workflow.should_store_intermediate_result
    if not merged_steps:
        return None
    return ML_Workflow(merged_steps, should_store_intermediate_result,
                       resource_manager)
Exemplo n.º 8
0
 def create_estimator(self, dhp: Dict) -> ML_Workflow:
     """Build an estimator ML_Workflow from the PHASE2 hyper-parameters."""
     estimator_steps = self.create_component(dhp[PHASE2], PHASE2,
                                             self.ml_task.role)
     return ML_Workflow(estimator_steps,
                        self.should_store_intermediate_result,
                        self.resource_manager)
Exemplo n.º 9
0
    def evaluate(self, config_id, model: ML_Workflow, X, y, X_test, y_test,
                 budget, dhp, config):
        """Cross-validate ``model`` under ``budget`` and return an info dict.

        Splits (X, y) with ``self.splitter``, fits a copy of ``model`` per
        fold (honouring budget-based subsampling / iteration limits and the
        balance strategy from ``dhp``), optionally refits on the full data,
        and returns a dict of losses, scores, models and predictions.

        NOTE(review): X / y / X_test / y_test appear to be project containers
        (DataFrameContainer / NdArrayContainer) judging from the
        ``.data`` / ``.sub_sample`` usage below — confirm against callers.
        """
        warning_info = StringIO()
        additional_info = {}
        final_model = model[-1]
        final_model_name = final_model.__class__.__name__
        support_early_stopping = getattr(final_model, "support_early_stopping",
                                         False)
        budget_mode = self.algo2budget_mode[final_model_name]
        is_iter_algo = self.algo2iter.get(final_model_name) is not None
        max_iter = -1
        # if final model is iterative algorithm, max_iter should be specified
        if is_iter_algo:
            if budget_mode == ITERATIONS_BUDGET_MODE:
                fraction = min(1, budget)
            else:
                fraction = 1
            max_iter = max(round(self.algo2iter[final_model_name] * fraction),
                           1)
        # balance strategy, e.g. {"weight": {...}}; absent/non-dict disables it
        balance_strategy: Optional[dict] = dhp.get("strategies",
                                                   {}).get("balance")
        if isinstance(balance_strategy, dict):
            balance_strategy_name: str = list(balance_strategy.keys())[0]
            balance_strategy_params: dict = balance_strategy[
                balance_strategy_name]
        else:
            balance_strategy_name: str = "None"
            balance_strategy_params: dict = {}
        if balance_strategy_name == "weight":
            algo_name = model[-1].__class__.__name__
            if algo_name not in self.algo2weight_mode:
                self.logger.warning(
                    f"Algorithm '{algo_name}' not in self.algo2weight_mode !")
            weight_mode = self.algo2weight_mode.get(algo_name)
        else:
            weight_mode = None
        if weight_mode == "class_weight":
            model[-1].update_hyperparams({"class_weight": "balanced"})
        # capture library warnings emitted on stderr during fitting
        with redirect_stderr(warning_info):
            losses = []
            models = []
            y_true_indexes = []
            y_preds = []
            y_test_preds = []
            all_scores = []
            status = "SUCCESS"
            failed_info = ""
            intermediate_results = []
            start_time = datetime.datetime.now()
            confusion_matrices = []
            best_iterations = []
            component_infos = []
            cost_times = []
            for fold_ix, (train_index, valid_index) in enumerate(
                    self.splitter.split(X.data, y.data, self.groups)):
                cloned_model = model.copy()
                X: DataFrameContainer
                X_train = X.sub_sample(train_index)
                X_valid = X.sub_sample(valid_index)
                y_train = y.sub_sample(train_index)
                y_valid = y.sub_sample(valid_index)
                # subsamples budget_mode.
                if fold_ix == 0 and budget_mode == SUBSAMPLES_BUDGET_MODE and budget < 1:
                    X_train, y_train, (X_valid,
                                       X_test) = implement_subsample_budget(
                                           X_train, y_train, [X_valid, X_test],
                                           budget, self.random_state)
                cache_key = self.get_cache_key(config_id, X_train, y_train)
                cached_model = self.resource_manager.cache.get(cache_key)
                if cached_model is not None:
                    cloned_model = cached_model
                # in "iterations" budget mode, a unified interface adjusts max_iter
                # todo: in the future, cache the ML_Workflow and only re-train
                # the final estimator
                if weight_mode == "sample_weight":  # sample_weight balance
                    cloned_model[-1].set_inside_dict({
                        "sample_weight":
                        self.calc_balanced_sample_weight(y_train.data)
                    })
                if self.debug:
                    procedure_result = cloned_model.procedure(
                        self.ml_task, X_train, y_train, X_valid, y_valid,
                        X_test, y_test, max_iter, budget,
                        (budget == self.max_budget))
                else:
                    try:
                        procedure_result = cloned_model.procedure(
                            self.ml_task, X_train, y_train, X_valid, y_valid,
                            X_test, y_test, max_iter, budget,
                            (budget == self.max_budget))
                    except Exception as e:
                        self.logger.error(str(e))
                        self.logger.error(str(config))
                        failed_info = get_trance_back_msg()
                        status = "FAILED"  # todo: implement timeout / memory-out handling
                        self.logger.error("re-raise exception")
                        break
                # save model as cache
                if (budget_mode == ITERATIONS_BUDGET_MODE and budget <= 1) or \
                        (budget == 1):  # and isinstance(final_model, AutoFlowIterComponent)
                    self.resource_manager.cache.set(cache_key, cloned_model)
                intermediate_results.append(cloned_model.intermediate_result)
                models.append(cloned_model)
                y_true_indexes.append(valid_index)
                y_pred = procedure_result["pred_valid"]
                y_test_pred = procedure_result["pred_test"]
                if self.ml_task.mainTask == "classification":
                    confusion_matrices.append(
                        calculate_confusion_matrix(y_valid.data, y_pred))
                if support_early_stopping:
                    estimator = cloned_model.steps[-1][1]
                    # todo: refactor LGBM etc.
                    best_iterations.append(
                        getattr(estimator, "best_iteration_", -1))
                cost_times.append(cloned_model.time_cost_list)
                component_info = {}
                # collect per-component additional_info (only non-empty ones)
                for step_name, component in cloned_model.steps:
                    component_name = component.__class__.__name__
                    component_additional_info = component.additional_info
                    if bool(component_additional_info):
                        component_info[
                            component_name] = component.additional_info
                component_infos.append(component_info)
                y_preds.append(y_pred)
                if y_test_pred is not None:
                    y_test_preds.append(y_test_pred)
                loss, all_score = self.loss(
                    y_valid.data, y_pred)  # todo: user-defined evaluator for non-1d-array targets
                losses.append(float(loss))
                all_scores.append(all_score)
                # when  budget  <= 1 , hold out validation
                if fold_ix == 0 and budget <= 1:
                    break
                # when  budget  > 1 , budget will be interpreted as kfolds num by 'budget2kfold'
                # for example, budget = 4 , budget2kfold = {4: 10}, we only do 10 times cross-validation,
                # so we break when fold_ix == 10 - 1 == 9
                if isinstance(self.budget2kfold,
                              dict) and budget in self.budget2kfold:
                    if budget > 1 and fold_ix == self.budget2kfold[budget] - 1:
                        break
            if self.ml_task.mainTask == "classification":
                additional_info["confusion_matrices"] = confusion_matrices
            if support_early_stopping:
                additional_info["best_iterations"] = best_iterations
            additional_info["cost_times"] = cost_times
            additional_info["component_infos"] = component_infos
            end_time = datetime.datetime.now()
            # finally fit
            if status == "SUCCESS" and self.should_finally_fit:
                # make sure have resource_manager to do things like connect redis
                model.resource_manager = self.resource_manager
                finally_fit_model = model.fit(X,
                                              y,
                                              X_test=X_test,
                                              y_test=y_test)
                if self.ml_task.mainTask == "classification":
                    y_test_pred_by_finally_fit_model = model.predict_proba(
                        X_test)
                else:
                    y_test_pred_by_finally_fit_model = model.predict(X_test)
                model.resource_manager = None
            else:
                finally_fit_model = None
                y_test_pred_by_finally_fit_model = None

            if len(losses) > 0:
                final_loss = float(np.array(losses).mean())
            else:
                # train and validation is failed.
                final_loss = ERR_LOSS
            if len(all_scores) > 0 and all_scores[0]:
                # average every metric name across folds
                all_score = defaultdict(list)
                for cur_all_score in all_scores:
                    if isinstance(cur_all_score, dict):
                        for key, value in cur_all_score.items():
                            all_score[key].append(value)
                    else:
                        self.logger.warning(
                            f"TypeError: cur_all_score is not dict.\ncur_all_score = {cur_all_score}"
                        )
                for key in all_score.keys():
                    all_score[key] = float(np.mean(all_score[key]))
            else:
                all_score = {}
                all_scores = []
            info = {
                "loss": final_loss,
                "losses": losses,
                "all_score": all_score,
                "all_scores": all_scores,
                "models": models,
                "finally_fit_model": finally_fit_model,
                "y_true_indexes": y_true_indexes,
                "y_preds": y_preds,
                "intermediate_results": intermediate_results,
                "status": status,
                "failed_info": failed_info,
                "start_time": start_time,
                "end_time": end_time,
                "additional_info": additional_info
            }
            # todo
            if y_test is not None:
                # combine the fold models' predictions to predict the test set
                if self.should_finally_fit:
                    y_test_pred = y_test_pred_by_finally_fit_model
                else:
                    if self.ml_task.mainTask == "classification":
                        y_test_pred = vote_predicts(y_test_preds)
                    else:
                        y_test_pred = mean_predicts(y_test_preds)
                test_loss, test_all_score = self.loss(y_test.data, y_test_pred)
                # todo: user-defined evaluator for non-1d-array targets
                info.update({
                    "test_loss": test_loss,
                    "test_all_score": test_all_score,
                    # "y_test_true": y_test,
                    "y_test_pred": y_test_pred
                })
        info["warning_info"] = warning_info.getvalue()
        return info