示例#1
0
 def transform(self, X):
     """Add one mapped column to ``X`` for every entry in ``self.exec_cols``.

     The first element of each ``self.exec_cols`` entry names the source
     column. Its values are looked up in the mapping stored in
     ``self.mean_map_dict`` under the matching name from ``self.new_cols``,
     and the downcast result is written onto ``X`` under that new name.
     """
     log(f"start TP transform")
     for position, source_cols in enumerate(self.exec_cols):
         source = source_cols[0]
         target = self.new_cols[position]
         mapped = X[source].map(self.mean_map_dict[target])
         X[target] = downcast(mapped, accuracy_loss=False)
示例#2
0
        def func(df):
            """Average the second column of ``df`` within groups of the first.

            The second column is converted to ``float32`` and written back
            onto ``df`` before grouping. Returns the column names of ``df``
            as a tuple together with the downcast per-group means.
            """
            key_col = df.columns[0]
            value_col = df.columns[1]

            # The cast is stored on ``df`` itself, not on a temporary copy.
            df[value_col] = df[value_col].astype('float32')

            grouped = df.groupby(key_col, sort=False)
            group_means = grouped[value_col].mean()
            return tuple(df.columns), downcast(group_means)
示例#3
0
 def mean_label(ss: pd.Series, y):
     """Relate how often each value of ``ss`` occurs to its label-1 occurrences.

     ``ss`` and ``y`` are placed side by side, then two frequency tables are
     built: one over all rows and one over the rows whose label equals 1.
     Returns the name of ``ss`` and the downcast ``'rate'`` column.
     """
     col = ss.name
     paired = pd.concat([ss, y], axis=1)
     paired.columns = [col, 'label']

     overall = paired[col].value_counts()
     positive_rows = paired['label'] == 1
     positives = paired.loc[positive_rows, col].value_counts()

     counts = pd.concat([overall, positives], axis=1)
     counts.columns = ['cnt', 'pos_cnt']

     # NOTE(review): this is cnt / pos_cnt, i.e. the reciprocal of the share
     # of label-1 rows, despite the function name -- confirm this is intended.
     ratio = counts['cnt'] / counts['pos_cnt']
     counts['rate'] = ratio * 1.0
     return col, downcast(counts['rate'], accuracy_loss=False)
示例#4
0
    def transform(self, X: pd.DataFrame):
        """Add one mapped column to ``X`` for each column pair in ``self.exec_cols``.

        Every pair is combined with ``gen_combine_cats`` through a
        ``Parallel`` call. Each combined series is then mapped through the
        entry of ``self.cnt_map_dict`` for the matching name in
        ``self.new_cols`` and written onto ``X``.
        """
        log(f"start TP transform")

        def combine_pair(pair_frame):
            # Return the column names too, so results can be matched back to
            # their position in ``self.exec_cols``.
            combined = gen_combine_cats(pair_frame, pair_frame.columns)
            return tuple(pair_frame.columns), combined

        runner = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')
        jobs = (delayed(combine_pair)(X[[left, right]])
                for left, right in self.exec_cols)
        for pair, combined in runner(jobs):
            # The lookup relies on ``self.exec_cols`` entries comparing equal
            # to the tuple of column names returned by the helper.
            position = self.exec_cols.index(pair)
            target = self.new_cols[position]
            mapped = combined.map(self.cnt_map_dict[target])
            X[target] = downcast(mapped, accuracy_loss=False)
示例#5
0
 def func(df):
     """Count occurrences of each combined category built from ``df``.

     Returns the column names of ``df`` as a tuple together with the
     downcast value counts of ``gen_combine_cats(df, df.columns)``.
     """
     frequencies = gen_combine_cats(df, df.columns).value_counts()
     return tuple(df.columns), downcast(frequencies)
示例#6
0
 def values_cnt(ss: pd.Series):
     """Return the name of ``ss`` paired with its downcast value frequencies."""
     frequencies = downcast(ss.value_counts())
     return ss.name, frequencies
示例#7
0
 def time_atr(ss: pd.Series, atr):
     """Read the ``.dt`` accessor attribute named ``atr`` from ``ss``.

     Returns ``atr`` unchanged together with the downcast attribute values.
     """
     component = getattr(ss.dt, atr)
     return atr, downcast(component, accuracy_loss=False)
示例#8
0
 def func(df):
     """Subtract the second column of ``df`` from the first.

     Returns the column names of ``df`` as a tuple together with the
     downcast difference.
     """
     names = list(df.columns)
     minuend = df[names[0]]
     subtrahend = df[names[1]]
     return tuple(df.columns), downcast(minuend - subtrahend,
                                        accuracy_loss=False)
示例#9
0
 def values_cnt(ss: pd.Series):
     """Number each element of ``ss`` within the group of rows sharing its value.

     Despite the name, this yields a running per-value index
     (``groupby(...).cumcount()``), not total frequencies. Returns the name
     of ``ss`` together with the downcast result.
     """
     same_value_groups = ss.groupby(ss)
     running_index = same_value_groups.cumcount()
     return ss.name, downcast(running_index)