def fit(self, X, y=None):
    """Choose the working columns and store their skew information.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    result of ``compute_df_skew`` on those columns is kept in
    ``self.skew_info``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.skew_info = compute_df_skew(X[self.cols_lst])

    # `status` is the flag this method has always set and is kept as is.
    # NOTE(review): the other fit methods in this file signal completion
    # through `fit_status`; it is set here too until the canonical name is
    # confirmed.
    self.status = True
    self.fit_status = True
    return self
def fit(self, X, y):
    """Choose the working columns and store their WOE information.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    result of ``f_df_woe`` on those columns and ``y`` is kept in
    ``self.woe_info``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Target passed straight through to ``f_df_woe``.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        columns = X.columns
        self.cols_lst = [col for col in self.cols_lst if col in columns]

    self.woe_info = f_df_woe(X[self.cols_lst], y)
    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Count unique values per column and flag columns to filter out.

    If ``self.cols_lst` is empty, the columns are picked by
    ``select_var_by_type`` with ``var_type='str'``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    mapping returned by ``compute_df_nunique_cnt`` is stored in
    ``self.nunique_cnt``, and the entries whose count equals
    ``self.filter_thres`` are copied into ``self.var_filter``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """.replace("``self.cols_lst`", "``self.cols_lst``") and None
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type='str')
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.nunique_cnt = compute_df_nunique_cnt(X[self.cols_lst])
    # Only an exact match with the threshold marks a column for filtering.
    self.var_filter = {
        name: count
        for name, count in self.nunique_cnt.items()
        if count == self.filter_thres
    }
    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Work out, per column, the value used to fill missing entries.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``.
    ``self.null_rate`` holds the result of ``compute_df_null_rate`` and
    ``self.fill_info`` the result of the callable registered under
    ``self.fill_type`` in ``self.fill_value_fun``. Any column whose null
    rate is at or above ``self.fill_thres`` has its fill value replaced
    by the literal string ``'null'``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.null_rate = compute_df_null_rate(X[self.cols_lst])
    # The fill callable receives the full frame plus the column list.
    self.fill_info = self.fill_value_fun[self.fill_type](X, self.cols_lst)

    for col in self.cols_lst:
        if self.null_rate[col] >= self.fill_thres:
            self.fill_info[col] = 'null'

    # NOTE(review): this flag was not set here before, although the other
    # fit methods in this file all set it; confirm nothing relies on it
    # staying unset.
    self.fit_status = True
    return self
def fit(self, X, y):
    """Compute IV information per column and flag low-IV columns.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    mapping returned by ``f_df_iv`` is stored in ``self.iv_info``, and the
    entries strictly below ``self.iv_thres`` are copied into
    ``self.var_filter``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Target passed straight through to ``f_df_iv``.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.iv_info = f_df_iv(X[self.cols_lst], y)
    self.var_filter = {
        name: iv
        for name, iv in self.iv_info.items()
        if iv < self.iv_thres
    }
    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Fit one ``LabelEncoder`` per working column.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    fitted encoders are stored in ``self.LabelEncoders`` keyed by column.

    Args:
        X: Frame-like input exposing ``.columns`` and label indexing
            (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.LabelEncoders = {col: LabelEncoder() for col in self.cols_lst}
    for col, encoder in self.LabelEncoders.items():
        encoder.fit(X[col])

    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Compute per-column importance and flag unimportant columns.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    mapping returned by ``computer_df_importance_xgb_regressor`` is stored
    in ``self.var_importance``, and the entries at or below
    ``self.filter_thres`` are copied into ``self.var_filter``.

    Args:
        X: Frame-like input exposing ``.columns`` and label indexing
            (presumably a pandas DataFrame).
        y: Target values. When omitted, ``X[self.y]`` is used instead.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    if y is None:
        y = X[self.y]

    self.var_importance = computer_df_importance_xgb_regressor(
        X[self.cols_lst], y)
    self.var_filter = {
        name: score
        for name, score in self.var_importance.items()
        if score <= self.filter_thres
    }
    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Compute grouping information for the working columns.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    result of ``f_df_str_chisq_group`` is stored in ``self.bin_info``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Forwarded unchanged to ``f_df_str_chisq_group``, including
            when it is ``None``. NOTE(review): the helper presumably
            needs real labels; confirm whether ``None`` is supported.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.bin_info = f_df_str_chisq_group(
        X=X[self.cols_lst],
        y=y,
        p_value=self.p_value,
        pct=self.pct,
        max_groups=self.max_groups,
        num_least=self.num_least,
    )
    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Fit one ``MinMaxScaler`` per working column.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. Each
    scaler is built with ``feature_range=self.feature_range`` and stored
    in ``self.MinMaxScalers`` keyed by column.

    Args:
        X: Frame-like input exposing ``.columns`` and label indexing
            (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.MinMaxScalers = {
        col: MinMaxScaler(feature_range=self.feature_range)
        for col in self.cols_lst
    }
    for col, scaler in self.MinMaxScalers.items():
        # Each column is handed over as a single-column 2-D array.
        scaler.fit(X[col].values.reshape(-1, 1))

    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Compute per-column correlation info and flag weak columns.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. The
    mapping returned by ``compute_df_corr`` is stored in
    ``self.corr_info``, and the entries whose absolute value is strictly
    below ``self.filter_thres`` are copied into ``self.var_filter``.

    Args:
        X: Frame-like input exposing ``.columns`` and label indexing
            (presumably a pandas DataFrame).
        y: Target values. When omitted, ``X[self.y]`` is used instead.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    if y is None:
        y = X[self.y]

    self.corr_info = compute_df_corr(X[self.cols_lst], y)
    self.var_filter = {
        name: corr
        for name, corr in self.corr_info.items()
        if abs(corr) < self.filter_thres
    }
    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Fit one ``OneHotEncoder`` per working column.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. Each
    encoder is built with ``handle_unknown='ignore'`` and stored in
    ``self.OneHotEncoders`` keyed by column.

    Args:
        X: Frame-like input exposing ``.columns`` and label indexing
            (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.OneHotEncoders = {
        col: OneHotEncoder(handle_unknown='ignore')
        for col in self.cols_lst
    }
    for col, encoder in self.OneHotEncoders.items():
        # Each column is handed over as a single-column 2-D array.
        encoder.fit(X[col].values.reshape(-1, 1))

    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Fit one ``StandardScaler`` per working column.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` using ``self.require_var_type``; otherwise the
    configured list is narrowed to the labels that exist in ``X``. Each
    scaler is built with ``copy=False`` plus ``self.with_mean`` and
    ``self.with_std``, and stored in ``self.StandardScalers`` keyed by
    column.

    Args:
        X: Frame-like input exposing ``.columns`` and label indexing
            (presumably a pandas DataFrame).
        y: Not used by this method.

    Returns:
        self, so calls can be chained.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=self.require_var_type)
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    self.StandardScalers = {
        col: StandardScaler(copy=False,
                            with_mean=self.with_mean,
                            with_std=self.with_std)
        for col in self.cols_lst
    }
    for col, scaler in self.StandardScalers.items():
        # Each column is handed over as a single-column 2-D array.
        scaler.fit(X[col].values.reshape(-1, 1))

    self.fit_status = True
    return self
def fit(self, X, y=None):
    """Compute cut information for the working columns.

    If ``self.cols_lst`` is empty, the columns are picked by
    ``select_var_by_type`` with ``var_type=['int', 'float']``; otherwise
    the configured list is narrowed to the labels that exist in ``X``.
    The callable registered under ``self.bin_type`` in ``self.bin_fun``
    is then invoked and its result stored in ``self.cut_info``:

    * ``'dtree'`` receives ``y`` plus the tree settings (``criterion``,
      ``min_samples_leaf``, ``max_leaf_nodes``, ``min_impurity_decrease``).
    * ``'chisq'`` receives ``y`` plus ``p_value``, ``pct``, ``max_groups``
      and ``num_least``.
    * ``'quantile'`` and ``'equal_width'`` receive ``self.q``.

    Args:
        X: Frame-like input exposing ``.columns`` and list-of-label
            indexing (presumably a pandas DataFrame).
        y: Forwarded unchanged to the ``'dtree'`` and ``'chisq'``
            callables; not used for the other bin types.

    Returns:
        self, so calls can be chained.

    Raises:
        ValueError: If ``self.bin_type`` is not one of the four supported
            values. The column list has already been updated by then.
    """
    if len(self.cols_lst) == 0:
        self.cols_lst = list(select_var_by_type(
            X, uid=self.uid, y=self.y, var_type=['int', 'float']))
    else:
        # Build the lookup once instead of materialising the column list
        # again for every candidate.
        present = set(X.columns)
        self.cols_lst = [col for col in self.cols_lst if col in present]

    bin_type = self.bin_type
    if bin_type == 'dtree':
        self.cut_info = self.bin_fun[bin_type](
            X[self.cols_lst],
            y=y,
            criterion=self.criterion,
            min_samples_leaf=self.min_samples_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease)
    elif bin_type == 'chisq':
        self.cut_info = self.bin_fun[bin_type](
            X[self.cols_lst],
            y=y,
            p_value=self.p_value,
            pct=self.pct,
            max_groups=self.max_groups,
            num_least=self.num_least)
    elif bin_type in ('quantile', 'equal_width'):
        self.cut_info = self.bin_fun[bin_type](X[self.cols_lst], self.q)
    else:
        raise ValueError('bin_type参数只支持dtree、chisq、quantile和equal_width值')

    self.fit_status = True
    return self