def core_fit(self, estimator, X, y=None, X_valid=None, y_valid=None,
             X_test=None, y_test=None, feature_groups=None,
             columns_metadata=None):
    # note: the `estimator` argument is accepted for interface compatibility,
    # but it is the wrapped `self.estimator` that gets fitted
    categorical_features_indices = get_categorical_features_indices(
        X, columns_metadata)
    X = to_array(X)
    X_valid = to_array(X_valid)
    # only build an eval_set when a complete validation pair is available
    if (X_valid is not None) and (y_valid is not None):
        eval_set = (X_valid, y_valid)
    else:
        eval_set = None
    return self.estimator.fit(
        X, y,
        categorical_feature=categorical_features_indices,
        eval_set=eval_set,
        verbose=False,
        early_stopping_rounds=self.hyperparams.get("early_stopping_rounds"))
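# A minimal sketch (hypothetical, for illustration only) of what the
# `get_categorical_features_indices` helper used above might do, assuming
# `columns_metadata` is a dict carrying categorical column names under a
# "categorical_columns" key; the real layout may differ:
def _example_get_categorical_features_indices(X, columns_metadata):
    import pandas as pd
    if columns_metadata is None or not isinstance(X, pd.DataFrame):
        return "auto"  # let LightGBM infer categorical features itself
    categorical = set(columns_metadata.get("categorical_columns", []))
    return [i for i, col in enumerate(X.columns) if col in categorical]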
def fit(self, X, y=None):
    X = to_array(X)
    encoders = []
    # fit one sklearn LabelEncoder per column
    for i in range(X.shape[1]):
        cur = X[:, i]
        encoder = SklearnLabelEncoder().fit(cur)  # [cur != -999]
        encoders.append(encoder)
    self.encoders = encoders
    return self
def transform(self, X, y=None):
    # preserve the original columns and index when the input is a DataFrame
    if isinstance(X, pd.DataFrame):
        columns = X.columns
        index = X.index
    else:
        columns = [str(i) for i in range(X.shape[1])]
        index = range(X.shape[0])
    X = to_array(X)
    arrs = []
    assert X.shape[1] == len(self.encoders)
    # transform each column with the encoder that was fitted for it
    for i in range(X.shape[1]):
        cur = X[:, i]
        # arr = np.zeros_like(cur)
        encoder = self.encoders[i]
        arr = encoder.transform(cur)
        # arr[cur == -999] = -999
        arrs.append(arr)
    return pd.DataFrame(np.vstack(arrs).T, columns=columns, index=index)
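# Usage sketch for the per-column encoder above, assuming the enclosing class
# is this module's multi-column `LabelEncoder` wrapper (name assumed) and
# using hypothetical toy data:
def _example_multicolumn_label_encode():
    import pandas as pd
    df = pd.DataFrame({"a": ["x", "y", "x"], "b": ["p", "p", "q"]})
    enc = LabelEncoder().fit(df)  # one SklearnLabelEncoder per column
    # returns a DataFrame of integer codes with the original columns and index
    return enc.transform(df)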
def parse_column_descriptions(self, column_descriptions, X_train, y_train,
                              X_test, y_test):
    # todo: check whether X contains duplicated column names
    X_train = self.type_check(X_train)
    X_test = self.type_check(X_test)
    both_set = False
    if X_train is not None and X_test is None:
        X = X_train
        y = y_train
    elif X_train is None and X_test is not None:
        X = X_test
        y = y_test
    elif X_train is not None and X_test is not None:
        both_set = True
        X = X_train
        y = y_train
        self.logger.info("X_train and X_test are both set.")
    else:
        self.logger.error("X_train and X_test are both None, which is invalid.")
        raise ValueError("X_train and X_test are both None.")
    if column_descriptions is None:
        column_descriptions = {}
    # fixme: DataManager may hold only X, without a target
    # assert y is not None
    # -- determine the target column --
    if isinstance(y, str) or "target" in column_descriptions:
        if isinstance(y, str):
            target_col = y
        elif "target" in column_descriptions:
            target_col = column_descriptions["target"]
        else:
            raise NotImplementedError
        y_train = pop_if_exists(X_train, target_col)
        y_test = pop_if_exists(X_test, target_col)
    # -- determine the id column --
    if "id" in column_descriptions:
        id_col = column_descriptions["id"]
        self.id_seq = pop_if_exists(X_train, id_col)
        self.test_id_seq = pop_if_exists(X_test, id_col)
    # -- determine ignored columns --
    if "ignore" in column_descriptions:
        ignore_cols = column_descriptions["ignore"]
        if not isinstance(ignore_cols, Sequence):
            ignore_cols = [ignore_cols]
        for ignore_col in ignore_cols:
            pop_if_exists(X_train, ignore_col)
            pop_if_exists(X_test, ignore_col)
    # -- verify that X_train and X_test have the same columns --
    if both_set:
        assert X_train.shape[1] == X_test.shape[1]
        assert np.all(X_train.columns == X_test.columns)
    # -- determine the remaining columns --
    column2feature_groups = {}
    for key, values in column_descriptions.items():
        if key in ("id", "target", "ignore"):
            continue
        if isinstance(values, str):
            values = [values]
        for value in values:
            column2feature_groups[value] = key
    # ---- stack X_train and X_test together, then parse ----
    X = stack_Xs(X_train, None, X_test)
    # ---- tag unlabeled columns as nan, highR_nan, cat or num ----
    for column in X.columns:
        if column not in column2feature_groups:
            feature_group = self.parse_feature_groups(X[column])
            column2feature_groups[column] = feature_group
    feature_groups = [column2feature_groups[column] for column in X.columns]
    # reindex so train rows come first, followed by test rows
    n_train = X_train.shape[0] if X_train is not None else 0
    if X_test is not None:
        n_test = X_test.shape[0]
        X_test.index = range(n_train, n_train + n_test)
    if X_train is not None:
        X_train.index = range(n_train)
    y_train = to_array(y_train)
    y_test = to_array(y_test)
    return X_train, y_train, X_test, y_test, feature_groups, column2feature_groups
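# Minimal sketches (hypothetical, for illustration only) of two helpers used
# above, assuming the semantics implied by their call sites; the real
# implementations may differ:
def _example_pop_if_exists(df, col):
    # remove `col` from the frame in place and return it as a Series;
    # return None when the frame or the column is absent
    if df is not None and col in df.columns:
        return df.pop(col)
    return None

def _example_stack_Xs(X_train, X_valid, X_test):
    # vertically concatenate whichever frames are present, so that all rows
    # can be parsed in a single pass
    import pandas as pd
    parts = [X for X in (X_train, X_valid, X_test) if X is not None]
    return pd.concat(parts, axis=0)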
def before_pred_X(self, X):
    return to_array(X)
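# Minimal sketch (hypothetical) of the `to_array` helper used throughout these
# snippets, assuming it normalizes pandas objects to numpy arrays and passes
# None through unchanged (consistent with the None checks at its call sites):
def _example_to_array(X):
    import numpy as np
    import pandas as pd
    if X is None:
        return None
    if isinstance(X, (pd.DataFrame, pd.Series)):
        return X.values
    return np.asarray(X)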
def __init__(self,
             resource_manager=None,
             X_train: Union[pd.DataFrame, DataFrameContainer, np.ndarray, None, str] = None,
             y_train: Union[pd.Series, np.ndarray, None] = None,
             X_test: Union[pd.DataFrame, DataFrameContainer, np.ndarray, None, str] = None,
             y_test: Union[pd.Series, np.ndarray, None] = None,
             dataset_metadata: Dict[str, Any] = frozendict(),
             column_descriptions: Dict[str, Union[List[str], str]] = frozendict(),
             highR_nan_threshold: float = 0.5,
             highC_cat_threshold: int = 4,
             consider_ordinal_as_cat=False,
             upload_type="fs"):
    '''
    Parameters
    ----------
    X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_train: :class:`numpy.ndarray`
    X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_test: :class:`numpy.ndarray`
    dataset_metadata: dict
    column_descriptions: dict
        ``column_descriptions`` is a dict whose keys are ``feature_group`` names and whose
        values are a column name or a list of column names.

        These are some frequently-used built-in ``feature_group`` names:

        * ``id`` - the id column of this table.
        * ``ignore`` - columns that contain irrelevant information.
        * ``target`` - the column your model will learn to predict.
        * ``nan`` - Not a Number, a column containing missing values.
        * ``num`` - numerical features, such as [1, 2, 3].
        * ``cat`` - categorical features, such as ["a", "b", "c"].
        * ``num_nan`` - numerical features containing missing values, such as [1, 2, NaN].
        * ``cat_nan`` - categorical features containing missing values, such as ["a", "b", NaN].
        * ``highR_nan`` - high NaN ratio, explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
        * ``lowR_nan`` - low NaN ratio, explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
        * ``highC_cat`` - high-cardinality categorical, explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
        * ``lowR_cat`` - low-cardinality categorical, explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    highR_nan_threshold: float
        High NaN-ratio threshold; you can find examples and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    '''
    self.upload_type = upload_type
    from autoflow.resource_manager.base import ResourceManager
    self.logger = get_logger(self)
    if resource_manager is None:
        self.logger.warning(
            "In DataManager __init__, resource_manager is None, "
            "create a default local resource_manager.")
        resource_manager = ResourceManager()
    self.resource_manager: ResourceManager = resource_manager
    self.highC_cat_threshold = highC_cat_threshold
    self.consider_ordinal_as_cat = consider_ordinal_as_cat
    dataset_metadata = dict(dataset_metadata)
    self.highR_nan_threshold = highR_nan_threshold
    self.dataset_metadata = dataset_metadata
    self.column_descriptions = dict(column_descriptions)
    # -- load data into containers ---------------------------------
    self.X_test, self.input_test_hash = self.parse_data_container(
        "TestSet", X_test, y_test)
    # the train set is parsed last, so its column_descriptions take precedence
    self.X_train, self.input_train_hash = self.parse_data_container(
        "TrainSet", X_train, y_train)
    # -- migrate column descriptions --------------------------------
    # if X is a dataset_id, the remote data_container's column_descriptions
    # will be assigned to final_column_descriptions
    if self.final_column_descriptions is not None:
        self.column_descriptions = deepcopy(self.final_column_descriptions)
    # -- parse column descriptions ----------------------------------
    self.parse_column_descriptions()
    # note: feature_groups no longer matches columns one-to-one here,
    # because auxiliary feature groups have been removed
    # -- check target ------------------------------------------------
    assert "target" in self.column_descriptions
    self.target_col_name = self.column_descriptions["target"]
    # todo: handle prediction on the test set
    # -- final column descriptions -----------------------------------
    # neither user-defined column descriptions nor remotely downloaded ones
    # should contain nan entries.
    # update `column2essential_feature_groups` into `final_column_descriptions`
    if self.final_column_descriptions is None:
        final_column_descriptions = defaultdict(list)
        final_column_descriptions.update(self.column_descriptions)
        # first, normalize non-unique feature groups to lists
        for feat_grp, cols in final_column_descriptions.items():
            if feat_grp not in UNIQUE_FEATURE_GROUPS:
                if isinstance(cols, str):
                    final_column_descriptions[feat_grp] = [cols]
        # then merge in the parsed feature groups
        for column, essential_feature_group in self.column2feature_groups.items():
            if column not in final_column_descriptions[essential_feature_group]:
                final_column_descriptions[essential_feature_group].append(column)
        self.final_column_descriptions = final_column_descriptions
    self.final_column_descriptions = dict(self.final_column_descriptions)
    # -- set column descriptions, upload dataset ---------------------
    if self.X_train is not None:
        self.X_train.set_column_descriptions(self.final_column_descriptions)
        self.X_train.upload(self.upload_type)
        self.logger.info(f"TrainSet's DataSet ID = {self.X_train.dataset_id}")
    if self.X_test is not None:
        self.X_test.set_column_descriptions(self.final_column_descriptions)
        self.X_test.upload(self.upload_type)
        self.logger.info(f"TestSet's DataSet ID = {self.X_test.dataset_id}")
    # -- origin hash --------------------------------------------------
    self.train_set_id = self.X_train.get_hash() if self.X_train is not None else ""
    self.test_set_id = self.X_test.get_hash() if self.X_test is not None else ""
    if self.input_train_hash:
        assert self.input_train_hash == self.train_set_id
    if self.input_test_hash:
        assert self.input_test_hash == self.test_set_id
    # -- pop auxiliary columns ----------------------------------------
    y_train, y_test = self.pop_auxiliary_feature_groups()
    # -- verify that X_train and X_test have the same columns --
    if self.X_test is not None and self.X_train is not None:
        assert self.X_train.shape[1] == self.X_test.shape[1]
        assert np.all(self.X_train.columns == self.X_test.columns)
    # -- set feature_groups --
    if self.X_train is not None:
        self.X_train.set_feature_groups(self.feature_groups)
    if self.X_test is not None:
        self.X_test.set_feature_groups(self.feature_groups)
    # -- set remaining attributes --
    y_train = to_array(y_train)
    y_test = to_array(y_test)
    # encode labels
    assert y_train is not None, f"{self.target_col_name} does not exist!"
    self.label_encoder = None
    if is_target_need_label_encode(y_train):
        self.label_encoder = LabelEncoder()
        y_train = self.label_encoder.fit_transform(y_train)
        y_test = self.encode_label(y_test)
    if y_train is not None:
        y_train = NdArrayContainer("TrainLabel", dataset_instance=y_train,
                                   resource_manager=self.resource_manager)
        y_train.upload()
    if y_test is not None:
        y_test = NdArrayContainer("TestLabel", dataset_instance=y_test,
                                  resource_manager=self.resource_manager)
        y_test.upload()
    self.ml_task: MLTask = get_ml_task_from_y(y_train.data)
    self.y_train = y_train
    self.y_test = y_test
    self.train_label_id = self.y_train.get_hash() if self.y_train is not None else ""
    self.test_label_id = self.y_test.get_hash() if self.y_test is not None else ""
    if self.X_train is not None:
        self.columns = self.X_train.columns
    else:
        self.columns = self.X_test.columns
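# Usage sketch for DataManager, illustrating the `column_descriptions` API
# documented in __init__ above; the file paths and column names are
# hypothetical:
def _example_build_data_manager():
    import pandas as pd
    return DataManager(
        X_train=pd.read_csv("train.csv"),   # hypothetical training table
        X_test=pd.read_csv("test.csv"),     # hypothetical test table
        column_descriptions={
            "id": "PassengerId",            # unique row identifier
            "target": "Survived",           # the column the model will predict
            "ignore": ["Name", "Ticket"],   # irrelevant columns, dropped
            "cat": ["Sex", "Embarked"],     # explicitly categorical features
        },
        highR_nan_threshold=0.5,
    )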