예제 #1
0
    def fit(self, X, y=None):
        """Resolve the working columns and record their skew information.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names that are actually present in ``X.columns``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.skew_info = compute_df_skew(X[self.cols_lst])
        self.status = True
        return self
예제 #2
0
    def fit(self, X, y):
        """Resolve the working columns and compute their WOE information.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names that are present in ``X.columns``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Target values, forwarded unchanged to ``f_df_woe``.

        Returns:
            self, so the call can be chained (``fit(X, y).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            columns = X.columns
            self.cols_lst = [col for col in self.cols_lst if col in columns]

        self.woe_info = f_df_woe(X[self.cols_lst], y)

        self.fit_status = True
        return self
예제 #3
0
    def fit(self, X, y=None):
        """Count distinct values per column and mark columns to filter.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` with ``var_type='str'``. Otherwise the
        configured list is narrowed to the names present in ``X.columns``.

        ``self.var_filter`` keeps the entries of ``self.nunique_cnt`` whose
        count equals ``self.filter_thres``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type='str')
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.nunique_cnt = compute_df_nunique_cnt(X[self.cols_lst])
        # NOTE(review): the comparison is strict equality, not <=; presumably
        # filter_thres is meant to be an exact distinct-value count -- confirm.
        self.var_filter = {
            name: count
            for name, count in self.nunique_cnt.items()
            if count == self.filter_thres
        }
        self.fit_status = True
        return self
예제 #4
0
    def fit(self, X, y=None):
        """Compute per-column null rates and the fill value for each column.

        If ``self.cols_lst` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        The fill values come from ``self.fill_value_fun[self.fill_type]``.
        Any column whose null rate is at least ``self.fill_thres`` has its
        fill value overridden with the literal string ``'null'``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.null_rate = compute_df_null_rate(X[self.cols_lst])
        self.fill_info = self.fill_value_fun[self.fill_type](X, self.cols_lst)
        for col in self.cols_lst:
            if self.null_rate[col] >= self.fill_thres:
                self.fill_info[col] = 'null'

        # Record that fitting completed.
        self.fit_status = True
        return self
예제 #5
0
    def fit(self, X, y):
        """Compute per-column IV and mark the columns below the threshold.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        ``self.var_filter`` keeps the entries of ``self.iv_info`` whose value
        is strictly less than ``self.iv_thres``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Target values, forwarded unchanged to ``f_df_iv``.

        Returns:
            self, so the call can be chained (``fit(X, y).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.to_list()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.iv_info = f_df_iv(X[self.cols_lst], y)
        self.var_filter = {
            name: iv
            for name, iv in self.iv_info.items() if iv < self.iv_thres
        }
        self.fit_status = True
        return self
예제 #6
0
    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per working column.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        The fitted encoders are stored in ``self.LabelEncoders``, keyed by
        column name.

        Args:
            X: Input table exposing ``.columns`` and column indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.LabelEncoders = {col: LabelEncoder() for col in self.cols_lst}

        for col, encoder in self.LabelEncoders.items():
            encoder.fit(X[col])

        self.fit_status = True
        return self
예제 #7
0
    def fit(self, X, y=None):
        """Compute variable importance and mark the low-importance columns.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        ``self.var_filter`` keeps the entries of ``self.var_importance``
        whose value is less than or equal to ``self.filter_thres``.

        Args:
            X: Input table exposing ``.columns`` and column indexing
                (presumably a pandas DataFrame).
            y: Target values. When ``None``, the column ``X[self.y]`` is used.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        if y is None:
            # Fall back to the target column stored in the input table.
            y = X[self.y]
        self.var_importance = computer_df_importance_xgb_regressor(
            X[self.cols_lst], y)
        self.var_filter = {
            name: score
            for name, score in self.var_importance.items()
            if score <= self.filter_thres
        }
        self.fit_status = True
        return self
예제 #8
0
    def fit(self, X, y=None):
        """Compute grouping information for the working columns.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        The result of ``f_df_str_chisq_group`` is stored in
        ``self.bin_info``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Forwarded unchanged to ``f_df_str_chisq_group``.
                NOTE(review): there is no fallback when ``y`` is ``None``;
                confirm the helper accepts that.

        Returns:
            self, so the call can be chained (``fit(X, y).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.bin_info = f_df_str_chisq_group(X=X[self.cols_lst],
                                             y=y,
                                             p_value=self.p_value,
                                             pct=self.pct,
                                             max_groups=self.max_groups,
                                             num_least=self.num_least)

        self.fit_status = True
        return self
예제 #9
0
    def fit(self, X, y=None):
        """Fit one ``MinMaxScaler`` per working column.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        Each scaler is created with ``feature_range=self.feature_range`` and
        stored in ``self.MinMaxScalers``, keyed by column name.

        Args:
            X: Input table exposing ``.columns`` and column indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.MinMaxScalers = {
            col: MinMaxScaler(feature_range=self.feature_range)
            for col in self.cols_lst
        }

        for col, scaler in self.MinMaxScalers.items():
            # Each column is passed to the scaler as a single-column 2-D array.
            scaler.fit(X[col].values.reshape(-1, 1))

        self.fit_status = True
        return self
예제 #10
0
    def fit(self, X, y=None):
        """Compute per-column correlation with the target and mark weak ones.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        ``self.var_filter`` keeps the entries of ``self.corr_info`` whose
        absolute value is strictly less than ``self.filter_thres``.

        Args:
            X: Input table exposing ``.columns`` and column indexing
                (presumably a pandas DataFrame).
            y: Target values. When ``None``, the column ``X[self.y]`` is used.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        if y is None:
            # Fall back to the target column stored in the input table.
            y = X[self.y]

        self.corr_info = compute_df_corr(X[self.cols_lst], y)
        self.var_filter = {
            name: corr
            for name, corr in self.corr_info.items()
            if abs(corr) < self.filter_thres
        }
        self.fit_status = True
        return self
예제 #11
0
    def fit(self, X, y=None):
        """Fit one ``OneHotEncoder`` per working column.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        Each encoder is created with ``handle_unknown='ignore'`` and stored
        in ``self.OneHotEncoders``, keyed by column name.

        Args:
            X: Input table exposing ``.columns`` and column indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.OneHotEncoders = {
            col: OneHotEncoder(handle_unknown='ignore')
            for col in self.cols_lst
        }

        for col, encoder in self.OneHotEncoders.items():
            # Each column is passed to the encoder as a single-column 2-D
            # array.
            encoder.fit(X[col].values.reshape(-1, 1))
        self.fit_status = True
        return self
예제 #12
0
    def fit(self, X, y=None):
        """Fit one ``StandardScaler`` per working column.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` from ``self.uid``, ``self.y`` and
        ``self.require_var_type``. Otherwise the configured list is narrowed
        to the names present in ``X.columns``.

        Each scaler is created with ``copy=False`` and the instance's
        ``with_mean`` / ``with_std`` settings, and stored in
        ``self.StandardScalers``, keyed by column name.

        Args:
            X: Input table exposing ``.columns`` and column indexing
                (presumably a pandas DataFrame).
            y: Not used by this method.

        Returns:
            self, so the call can be chained (``fit(X).transform(X)``).
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = select_var_by_type(X,
                                               uid=self.uid,
                                               y=self.y,
                                               var_type=self.require_var_type)
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        self.StandardScalers = {
            col: StandardScaler(copy=False,
                                with_mean=self.with_mean,
                                with_std=self.with_std)
            for col in self.cols_lst
        }

        for col, scaler in self.StandardScalers.items():
            # Each column is passed to the scaler as a single-column 2-D array.
            scaler.fit(X[col].values.reshape(-1, 1))

        self.fit_status = True
        return self
예제 #13
0
    def fit(self, X, y=None):
        """Compute cut information for the working columns.

        If ``self.cols_lst`` is empty, the columns are chosen by
        ``select_var_by_type`` with ``var_type=['int', 'float']``. Otherwise
        the configured list is narrowed to the names present in
        ``X.columns``.

        The binning routine is looked up in ``self.bin_fun`` by
        ``self.bin_type`` and its result is stored in ``self.cut_info``:

        * ``'dtree'`` receives ``y`` plus ``criterion``,
          ``min_samples_leaf``, ``max_leaf_nodes`` and
          ``min_impurity_decrease`` from the instance.
        * ``'chisq'`` receives ``y`` plus ``p_value``, ``pct``,
          ``max_groups`` and ``num_least`` from the instance.
        * ``'quantile'`` and ``'equal_width'`` receive ``self.q``.

        Args:
            X: Input table exposing ``.columns`` and list-of-names indexing
                (presumably a pandas DataFrame).
            y: Forwarded unchanged to the ``'dtree'`` and ``'chisq'``
                routines; not used by the other bin types.
                NOTE(review): there is no fallback when ``y`` is ``None``;
                confirm those routines accept that.

        Returns:
            self, so the call can be chained (``fit(X, y).transform(X)``).

        Raises:
            ValueError: If ``self.bin_type`` is not one of the supported
                values. ``self.cols_lst`` has already been updated by then.
        """
        if len(self.cols_lst) == 0:
            self.cols_lst = list(
                select_var_by_type(X,
                                   uid=self.uid,
                                   y=self.y,
                                   var_type=['int', 'float']))
        else:
            # Build the list of column names once rather than once per
            # candidate column.
            available = X.columns.tolist()
            self.cols_lst = [col for col in self.cols_lst if col in available]

        if self.bin_type == 'dtree':
            self.cut_info = self.bin_fun[self.bin_type](
                X[self.cols_lst],
                y=y,
                criterion=self.criterion,
                min_samples_leaf=self.min_samples_leaf,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease)

        elif self.bin_type == 'chisq':
            self.cut_info = self.bin_fun[self.bin_type](
                X[self.cols_lst],
                y=y,
                p_value=self.p_value,
                pct=self.pct,
                max_groups=self.max_groups,
                num_least=self.num_least)

        elif self.bin_type in ['quantile', 'equal_width']:
            self.cut_info = self.bin_fun[self.bin_type](X[self.cols_lst],
                                                        self.q)

        else:
            raise ValueError('bin_type参数只支持dtree、chisq、quantile和equal_width值')
        self.fit_status = True
        return self