Пример #1
0
 def core_fit(self,
              estimator,
              X,
              y=None,
              X_valid=None,
              y_valid=None,
              X_test=None,
              y_test=None,
              feature_groups=None,
              columns_metadata=None):
     """Fit ``self.estimator`` on ``X``/``y`` with an optional validation set.

     The categorical feature indices are computed from ``X`` as passed in
     (before it is converted by ``to_array``) together with
     ``columns_metadata``.  ``(X_valid, y_valid)`` is forwarded as
     ``eval_set`` only when both parts are available; otherwise ``eval_set``
     is ``None``.

     The ``estimator``, ``X_test``, ``y_test`` and ``feature_groups``
     arguments are accepted but not used in this method; fitting always
     goes through ``self.estimator``.

     Returns
     -------
     Whatever ``self.estimator.fit`` returns.
     """
     cat_indices = get_categorical_features_indices(X, columns_metadata)
     X = to_array(X)
     X_valid = to_array(X_valid)
     # A validation pair is only usable when both halves are present.
     validation_missing = X_valid is None or y_valid is None
     eval_set = None if validation_missing else (X_valid, y_valid)
     fit_options = {
         "categorical_feature": cat_indices,
         "eval_set": eval_set,
         "verbose": False,
         # ``.get`` yields None when the hyper-parameter is not configured.
         "early_stopping_rounds":
         self.hyperparams.get("early_stopping_rounds"),
     }
     return self.estimator.fit(X, y, **fit_options)
Пример #2
0
 def fit(self, X, y=None):
     """Fit one label encoder per column of ``X``.

     ``X`` is converted with ``to_array``; column ``j`` of the result is
     used to fit the encoder stored at ``self.encoders[j]``.  ``y`` is
     accepted but ignored.

     Returns
     -------
     self
     """
     data = to_array(X)
     n_columns = data.shape[1]
     # Every column gets its own, independently fitted encoder.
     self.encoders = [
         SklearnLabelEncoder().fit(data[:, j]) for j in range(n_columns)
     ]
     return self
Пример #3
0
 def transform(self, X, y=None):
     """Encode every column of ``X`` with its fitted encoder.

     Column labels and the row index are taken from ``X`` when it is a
     ``pandas.DataFrame``; otherwise string positions (``"0"``, ``"1"``,
     ...) are used as column names and ``range(n_rows)`` as the index.
     ``y`` is accepted but ignored.

     Returns
     -------
     pandas.DataFrame
         One encoded column per input column, in the original order.

     Raises
     ------
     AssertionError
         If the number of columns differs from the number of encoders.
     """
     if not isinstance(X, pd.DataFrame):
         columns = [str(j) for j in range(X.shape[1])]
         index = range(X.shape[0])
     else:
         columns = X.columns
         index = X.index
     data = to_array(X)
     n_columns = data.shape[1]
     assert n_columns == len(self.encoders)
     encoded_columns = [
         self.encoders[j].transform(data[:, j]) for j in range(n_columns)
     ]
     # vstack puts each encoded column on a row; transpose to restore layout.
     stacked = np.vstack(encoded_columns).T
     return pd.DataFrame(stacked, columns=columns, index=index)
Пример #4
0
 def parse_column_descriptions(self, column_descriptions, X_train, y_train,
                               X_test, y_test):
     """Separate target/id/ignored columns and assign a feature group to
     every remaining column.

     Parameters
     ----------
     column_descriptions : dict or None
         Maps a feature-group name to a column name or a list of column
         names.  The keys ``"target"``, ``"id"`` and ``"ignore"`` are
         handled specially; every other key is used as the feature group
         of the listed columns.  ``None`` is treated as an empty dict.
     X_train, X_test
         Train / test tables; each is first passed through
         ``self.type_check``.  At least one of them must not be ``None``
         afterwards.
     y_train, y_test
         Labels.  When the label of the primary table (the train table if
         present, else the test table) is a ``str`` it is interpreted as
         the name of the target column.

     Returns
     -------
     tuple
         ``(X_train, y_train, X_test, y_test, feature_groups,
         column2feature_groups)`` where ``feature_groups`` lists the group
         of each column of the stacked table in column order.

     Raises
     ------
     ValueError
         If both ``X_train`` and ``X_test`` are ``None``.
     AssertionError
         If both tables are given but their columns differ after the
         target/id/ignore columns have been removed.

     Notes
     -----
     ``X_train`` / ``X_test`` are modified in place: columns are popped
     and the row indexes are replaced by consecutive integers.
     """
     # TODO: verify that X does not contain duplicated column names
     X_train = self.type_check(X_train)
     X_test = self.type_check(X_test)
     if X_train is None and X_test is None:
         self.logger.error(
             "X_train and X_test are both None, it is invalide.")
         raise ValueError("X_train and X_test are both None.")
     both_set = X_train is not None and X_test is not None
     if both_set:
         self.logger.info("X_train and X_test are both set.")
     # The train label decides how the target is located whenever a train
     # table exists; the test label is only consulted for test-only input.
     y = y_train if X_train is not None else y_test
     if column_descriptions is None:
         column_descriptions = {}
         # FIXME: DataManager may be asked to manage X only (no label),
         # so y cannot be asserted to be present here.
     # -- determine target --
     if isinstance(y, str) or "target" in column_descriptions:
         # An explicit column name passed as y wins over the description.
         if isinstance(y, str):
             target_col = y
         else:
             target_col = column_descriptions["target"]
         # pop_if_exists is called with a possibly-None table, exactly as
         # before; presumably it tolerates None -- confirm in the helper.
         y_train = pop_if_exists(X_train, target_col)
         y_test = pop_if_exists(X_test, target_col)
     # -- determine id --
     if "id" in column_descriptions:
         id_col = column_descriptions["id"]
         self.id_seq = pop_if_exists(X_train, id_col)
         self.test_id_seq = pop_if_exists(X_test, id_col)
     # -- determine ignore --
     if "ignore" in column_descriptions:
         ignore_cols = column_descriptions["ignore"]
         # A str is itself a Sequence, so it has to be checked explicitly;
         # otherwise a single column name would be iterated per character.
         if isinstance(ignore_cols, str) or not isinstance(
                 ignore_cols, Sequence):
             ignore_cols = [ignore_cols]
         for ignore_col in ignore_cols:
             pop_if_exists(X_train, ignore_col)
             pop_if_exists(X_test, ignore_col)
     # -- the columns of X_train and X_test must be identical --
     if both_set:
         assert X_train.shape[1] == X_test.shape[1]
         assert np.all(X_train.columns == X_test.columns)
     # -- determine the user-declared groups of the other columns --
     column2feature_groups = {}
     for key, values in column_descriptions.items():
         if key in ("id", "target", "ignore"):
             continue
         if isinstance(values, str):
             values = [values]
         for value in values:
             column2feature_groups[value] = key
     # ---- stack X_train and X_test together, then parse the result ----
     X = stack_Xs(X_train, None, X_test)
     # ---- columns without a declared group get one inferred by
     # ---- parse_feature_groups (e.g. nan, highR_nan, cat, num)
     for column in X.columns:
         if column not in column2feature_groups:
             feature_group = self.parse_feature_groups(X[column])
             column2feature_groups[column] = feature_group
     feature_groups = [
         column2feature_groups[column] for column in X.columns
     ]
     # Re-index so that train rows are 0..n_train-1 and test rows follow
     # immediately after.  X_train may be None (test-only input), so it
     # must be guarded just like X_test.
     n_train = X_train.shape[0] if X_train is not None else 0
     if X_test is not None:
         n_test = X_test.shape[0]
         X_test.index = range(n_train, n_train + n_test)
     if X_train is not None:
         X_train.index = range(n_train)
     y_train = to_array(y_train)
     y_test = to_array(y_test)
     return X_train, y_train, X_test, y_test, feature_groups, column2feature_groups
Пример #5
0
 def before_pred_X(self, X):
     """Return ``X`` converted by ``to_array``; hook applied before predicting."""
     converted = to_array(X)
     return converted
Пример #6
0
    def __init__(self,
                 resource_manager=None,
                 X_train: Union[pd.DataFrame, DataFrameContainer, np.ndarray,
                                None, str] = None,
                 y_train: Union[pd.Series, np.ndarray, None] = None,
                 X_test: Union[pd.DataFrame, DataFrameContainer, np.ndarray,
                               None, str] = None,
                 y_test: Union[pd.Series, np.ndarray, None] = None,
                 dataset_metadata: Dict[str, Any] = frozendict(),
                 column_descriptions: Dict[str, Union[List[str],
                                                      str]] = frozendict(),
                 highR_nan_threshold: float = 0.5,
                 highC_cat_threshold: int = 4,
                 consider_ordinal_as_cat: bool = False,
                 upload_type: str = "fs"):
        '''
        Load the train/test data into containers, resolve column
        descriptions and feature groups, upload the datasets and prepare
        the (optionally label-encoded) targets.

        Parameters
        ----------
        resource_manager:
            Resource manager used for the label containers. If ``None``, a
            warning is logged and a default ``ResourceManager()`` is created.
        X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
            Train set; handed to ``parse_data_container("TrainSet", ...)``
            together with ``y_train``. According to the annotation it may
            also be a ``DataFrameContainer``, a ``str`` or ``None``.
        y_train: :class:`numpy.ndarray`
        X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
            Test set; handed to ``parse_data_container("TestSet", ...)``
            together with ``y_test``.
        y_test: :class:`numpy.ndarray`
        dataset_metadata: dict
            Copied into a plain ``dict`` and stored on the instance.
        column_descriptions: dict
            ``column_descriptions`` is a dict, key is ``feature_group``,

            value is column (column name) or columns (list of column names).

            It must (after merging with any descriptions already attached to
            the data) contain the ``target`` key.

            This is a list of some frequently-used built-in ``feature_group``
                * ``id``       - id of this table.
                * ``ignore``   - some columns which contains irrelevant information.
                * ``target``   - column in the dataset is what your model will learn to predict.
                * ``nan``      - Not a Number, a column contain missing values.
                * ``num``      - numerical features, such as [1, 2, 3].
                * ``cat``      - categorical features, such as ["a", "b", "c"].
                * ``num_nan``  - numerical features contains missing values. such as [1, 2, NaN].
                * ``cat_nan``  - categorical features contains missing values. such as ["a", "b", NaN].
                * ``highR_nan``  - highly ratio NaN. You can find explain in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``lowR_nan``   - lowly ratio NaN. You can find explain in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``highC_cat``  - highly cardinality ratio categorical. You can find explain in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
                * ``lowR_cat``  -  lowly cardinality ratio categorical. You can find explain in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        highR_nan_threshold: float
            high ratio NaN threshold, you can find examples and practice in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        highC_cat_threshold: int
            Stored on the instance; presumably the cardinality threshold for
            ``highC_cat`` columns (it is not used inside this method).
        consider_ordinal_as_cat: bool
            Stored on the instance; not used inside this method.
        upload_type: str
            Passed to ``upload`` of the train and test containers.

        Raises
        ------
        AssertionError
            If no ``target`` entry is available, if a given input hash does
            not match the uploaded data's hash, if train and test columns
            differ, or if no train label could be extracted.
        '''
        self.upload_type = upload_type
        # Imported locally; presumably to avoid a circular import -- confirm.
        from autoflow.resource_manager.base import ResourceManager
        self.logger = get_logger(self)
        if resource_manager is None:
            self.logger.warning(
                "In DataManager __init__, resource_manager is None, create a default local resource_manager."
            )
            resource_manager = ResourceManager()
        # NOTE(review): the next two statements assign the same value; the
        # first one only adds the type annotation.
        self.resource_manager: ResourceManager = resource_manager
        self.resource_manager = resource_manager
        self.highC_cat_threshold = highC_cat_threshold
        self.consider_ordinal_as_cat = consider_ordinal_as_cat
        # Copy so that the (frozendict) default / caller's mapping is not shared.
        dataset_metadata = dict(dataset_metadata)
        self.highR_nan_threshold = highR_nan_threshold
        self.dataset_metadata = dataset_metadata
        self.column_descriptions = dict(column_descriptions)
        # --load data to container---------------------------------
        self.X_test, self.input_test_hash = self.parse_data_container(
            "TestSet", X_test, y_test)
        # The train set is parsed last so that the train set's
        # column_descriptions take precedence.
        self.X_train, self.input_train_hash = self.parse_data_container(
            "TrainSet", X_train, y_train)
        # --migrate column descriptions------------------------------
        # if X is dataset_id , remote data_container's column_descriptions will assigned to  final_column_descriptions
        # NOTE(review): final_column_descriptions is not assigned earlier in
        # this method; presumably parse_data_container or a class-level
        # default provides it -- confirm.
        if self.final_column_descriptions is not None:
            self.column_descriptions = deepcopy(self.final_column_descriptions)
        # --column descriptions------------------------------
        self.parse_column_descriptions()
        # Note: at this point feature_groups and columns do not match
        # one-to-one, because the auxiliary feature groups were removed.
        # ---check target-----------------------------------------------------
        assert "target" in self.column_descriptions
        self.target_col_name = self.column_descriptions["target"]
        # todo: handle the case of predicting on the test set
        # --final column descriptions------------------------------
        # Neither the user-defined column descriptions nor the ones
        # downloaded from remote should contain nan-related content.
        # update `column2essential_feature_groups` to `final_column_descriptions`
        if self.final_column_descriptions is None:
            final_column_descriptions = defaultdict(list)
            final_column_descriptions.update(self.column_descriptions)
            # First turn every non-unique feature group into a list.
            # (Only values of existing keys are replaced, so the dict size
            # does not change while iterating.)
            for feat_grp, cols in final_column_descriptions.items():
                if feat_grp not in UNIQUE_FEATURE_GROUPS:
                    if isinstance(cols, str):
                        final_column_descriptions[feat_grp] = [cols]
            # Then add every column to the group it was assigned to.
            # NOTE(review): column2feature_groups is not assigned in this
            # method; presumably set by parse_column_descriptions -- confirm.
            for column, essential_feature_group in self.column2feature_groups.items(
            ):
                if column not in final_column_descriptions[
                        essential_feature_group]:
                    final_column_descriptions[essential_feature_group].append(
                        column)
            self.final_column_descriptions = final_column_descriptions
        # Convert to a plain dict (drops the defaultdict behaviour).
        self.final_column_descriptions = dict(self.final_column_descriptions)
        # ---set column descriptions, upload to dataset-----------------------------------------------------
        if self.X_train is not None:
            self.X_train.set_column_descriptions(
                self.final_column_descriptions)
            self.X_train.upload(self.upload_type)
            self.logger.info(
                f"TrainSet's DataSet ID = {self.X_train.dataset_id}")
        if self.X_test is not None:
            self.X_test.set_column_descriptions(self.final_column_descriptions)
            self.X_test.upload(self.upload_type)
            self.logger.info(
                f"TestSet's DataSet ID = {self.X_test.dataset_id}")
        # ---origin hash-----------------------------------------------------
        # An empty string is used as the id of a missing set.
        self.train_set_id = self.X_train.get_hash(
        ) if self.X_train is not None else ""
        self.test_set_id = self.X_test.get_hash(
        ) if self.X_test is not None else ""
        # When parse_data_container returned a (truthy) input hash, it must
        # match the hash of the container after upload.
        if self.input_train_hash:
            assert self.input_train_hash == self.train_set_id
        if self.input_test_hash:
            assert self.input_test_hash == self.test_set_id
        # ---pop auxiliary columns-----------------------------------------------------
        y_train, y_test = self.pop_auxiliary_feature_groups()
        # -- the columns of X_train and X_test must be identical --
        if self.X_test is not None and self.X_train is not None:
            assert self.X_train.shape[1] == self.X_test.shape[1]
            assert np.all(self.X_train.columns == self.X_test.columns)
        # -- set feature_groups --
        # NOTE(review): self.feature_groups is not assigned in this method;
        # presumably set by parse_column_descriptions -- confirm.
        if self.X_train is not None:
            self.X_train.set_feature_groups(self.feature_groups)
        if self.X_test is not None:
            self.X_test.set_feature_groups(self.feature_groups)
        # -- set parameters --
        y_train = to_array(y_train)
        y_test = to_array(y_test)
        # encode label
        # NOTE(review): the ValueError instance is only used as the assert
        # message; what is raised is an AssertionError (and nothing at all
        # under ``python -O``).
        assert y_train is not None, ValueError(
            f"{self.target_col_name} does not exist!")
        self.label_encoder = None
        if is_target_need_label_encode(y_train):
            # The encoder is fitted on the train labels only; the test
            # labels go through self.encode_label.
            self.label_encoder = LabelEncoder()
            y_train = self.label_encoder.fit_transform(y_train)
            y_test = self.encode_label(y_test)
        # Wrap the labels in containers and upload them.
        if y_train is not None:
            y_train = NdArrayContainer("TrainLabel",
                                       dataset_instance=y_train,
                                       resource_manager=self.resource_manager)
            y_train.upload()
        if y_test is not None:
            y_test = NdArrayContainer("TestLabel",
                                      dataset_instance=y_test,
                                      resource_manager=self.resource_manager)
            y_test.upload()
        # The ML task is derived from the train labels held by the container.
        self.ml_task: MLTask = get_ml_task_from_y(y_train.data)
        self.y_train = y_train
        self.y_test = y_test
        # An empty string is used as the id of a missing label set.
        self.train_label_id = self.y_train.get_hash(
        ) if self.y_train is not None else ""
        self.test_label_id = self.y_test.get_hash(
        ) if self.y_test is not None else ""
        # Column names come from the train set when present, else the test set.
        if self.X_train is not None:
            self.columns = self.X_train.columns
        else:
            self.columns = self.X_test.columns