Пример #1
0
def c_cats_combine(df, col2max):
    """Fold several code columns of ``df`` into a single combined column.

    Starting from the first column cast to ``float64``, every following
    column is merged in as ``acc * col2max[col] + df[col]``.

    Args:
        df: DataFrame whose columns are combined in column order.
        col2max: Mapping from column name to the multiplier applied to the
            accumulator before that column is added (presumably the column's
            cardinality -- verify against the caller).

    Returns:
        The combined Series after ``downcast(..., accuracy_loss=False)``.
    """
    columns = df.columns
    ss = df[columns[0]].astype('float64')
    for col in columns[1:]:
        ss *= col2max[col]
        ss += df[col]
    # Keep downcast's return value: every other caller in this code base uses
    # the returned object, and discarding it left the result as float64.
    ss = downcast(ss, accuracy_loss=False)
    return ss
Пример #2
0
    def explore_params(self, X, y, categories):
        """Run column and alpha selection on the most recent rows.

        Records the categorical/numerical column split on ``self``, builds the
        feature matrix through ``cat_fit_transform`` / ``num_fit_transform``,
        keeps the last 50000 rows, splits them 80/20 without shuffling and
        hands the split to ``select_cols`` and ``select_alpha``.

        Args:
            X: DataFrame of input columns.
            y: Target Series aligned row-for-row with ``X``.
            categories: Iterable of column names treated as categorical.

        Returns:
            The value returned by ``self.select_alpha``.

        Raises:
            ValueError: If neither transformer produced any features.
        """
        self.cat_cols = tuple(categories)
        self.num_cols = [col for col in X.columns if col not in categories]

        log(f'train num col: {self.num_cols}')
        log(f'train cat col:{self.cat_cols}')

        cat_feats = self.cat_fit_transform(X, mode='fit_trans')
        num_feats = self.num_fit_transform(X, mode='fit_trans')

        # len() is used instead of truthiness because the transformers may
        # return arrays, whose truth value is ambiguous.
        if len(cat_feats) > 0 and len(num_feats) > 0:
            feats = np.concatenate([cat_feats, num_feats], axis=1)
        elif len(cat_feats) > 0:
            feats = cat_feats
        elif len(num_feats) > 0:
            feats = num_feats
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on `feats`.
            raise ValueError(
                'no categorical or numerical features were produced')
        log(f'before downcast {feats.dtype}')
        feats = downcast(feats)
        log(f'aft downcast {feats.dtype}')

        # Only the trailing 50000 rows take part in the parameter search.
        feats = feats[-50000:]
        y = y.iloc[-50000:]

        log(f'train features shape : {feats.shape}')

        # shuffle=False keeps row order, so the evaluation part is the tail.
        X_train, X_eval, y_train, y_eval = train_test_split(feats,
                                                            y,
                                                            test_size=0.2,
                                                            shuffle=False,
                                                            random_state=0)

        self.select_cols(X_train, X_eval, y_train, y_eval)
        final_rmse = self.select_alpha(X_train, X_eval, y_train, y_eval)

        return final_rmse
Пример #3
0
def groupby_mean(df):
    """Replace each key in the first column by its group mean of the second.

    Args:
        df: DataFrame whose first column is the grouping key and whose
            second column holds the values to average.

    Returns:
        A Series aligned with ``df`` holding, for every row, the mean of the
        second column over all rows sharing that row's key, passed through
        ``downcast``.
    """
    key_col = df.columns[0]
    value_col = df.columns[1]
    group_means = df.groupby(key_col, sort=False)[value_col].mean()
    return downcast(df[key_col].map(group_means))
Пример #4
0
            def n_diff(df, time_col, cat_col, num_col):
                """Build lagged difference features of ``num_col`` over time.

                Rows are ordered by ``time_col`` (within ``cat_col`` when one
                is given). Three columns are produced: the previous first
                difference, the change of that difference, and the previous
                relative difference. The result is returned in the caller's
                original row order with the caller's original index.

                Note: ``df`` is modified in place -- its index is reset and
                its rows are left sorted.

                Args:
                    df: DataFrame holding the columns named below.
                    time_col: Column to order rows by.
                    cat_col: Optional grouping column (``None`` for no
                        grouping); it must support ``diff``.
                    num_col: Numeric column to difference.

                Returns:
                    DataFrame with the three feature columns.
                """
                original_index = df.index
                grouped = cat_col is not None

                # After the reset the index records the original row position,
                # which is what lets sort_index() below undo the sort.
                df.reset_index(drop=True, inplace=True)
                sort_keys = [cat_col, time_col] if grouped else [time_col]
                df.sort_values(sort_keys, inplace=True)

                values = df[num_col]
                first_diff = values.diff()
                # 1e-3 keeps the division finite when the previous value is 0.
                ratio = first_diff / (values.shift(1) + 1e-3)

                # Lag everything by one row so a row only sees earlier values.
                first_diff = first_diff.shift(1)
                ratio = ratio.shift(1)
                second_diff = first_diff.diff()

                first_diff = downcast(first_diff)
                second_diff = downcast(second_diff)
                ratio = downcast(ratio)

                if grouped:
                    # Blank out rows whose look-back window crosses a group
                    # boundary.
                    # NOTE(review): the lagged values reach back 2 (and 3)
                    # rows while the masks check 1 (and 2) rows -- confirm
                    # whether this is intended.
                    crosses_one = df[cat_col].diff() != 0
                    crosses_two = df[cat_col].diff(2) != 0
                    first_diff[crosses_one] = np.nan
                    ratio[crosses_one] = np.nan
                    second_diff[crosses_two] = np.nan

                result = pd.concat([first_diff, second_diff, ratio], axis=1)

                # Back to the caller's row order, then restore its labels.
                result.sort_index(inplace=True)
                result.index = original_index

                return result
Пример #5
0
    def transform(self, ss):
        """Encode ``ss`` as shifted category codes, with NaN for unknowns.

        Values are coded by their position in ``self.cats`` and offset by
        ``CONSTANT.CAT_SHIFT``. Values not present in ``self.cats`` (which
        pandas codes as -1) become NaN.

        Args:
            ss: Sequence of raw category values.

        Returns:
            Float array of codes after ``downcast(..., accuracy_loss=False)``.
        """
        raw_codes = pd.Categorical(ss, categories=self.cats).codes
        # Cast to float BEFORE adding the shift: pandas stores codes in the
        # narrowest integer dtype that fits (e.g. int8), so adding the shift
        # first could silently wrap around.
        codes = raw_codes.astype('float') + CONSTANT.CAT_SHIFT
        # pandas marks unseen values with code -1, i.e. CAT_SHIFT - 1 here.
        codes[codes == (CONSTANT.CAT_SHIFT - 1)] = np.nan

        codes = downcast(codes, accuracy_loss=False)
        return codes
Пример #6
0
    def fit(self, table):
        """Store per-group label means for the hour-by-key cross feature.

        Adds a ``key_cross`` column to ``table.train_X`` (hour column times
        key column), then records in ``self.res`` a dict mapping each
        ``key_cross`` value to the mean of the label column.

        Args:
            table: Object exposing ``key_col``, ``label`` and ``train_X``.
        """
        hour_col = 'c_TimeDate:A1:hour'
        self.cols1 = [hour_col]
        key_col = table.key_col

        target_cols = [table.label]
        if not self.cols1 or not target_cols:
            return
        frame = table.train_X

        # Note: this writes a new column into table.train_X itself.
        frame['key_cross'] = frame[hour_col] * frame[key_col]
        self.cols1 = ['key_cross']

        for group_col in self.cols1:
            for target_col in target_cols:
                feat_name = FeatNamer.gen_feat_name(
                    self.__class__.__name__,
                    f'({group_col})({target_col})',
                    None,
                    CONSTANT.NUMERICAL_TYPE)

                group_means = frame.groupby([group_col],
                                            sort=False)[target_col].mean()
                self.res[feat_name] = downcast(group_means).to_dict()
Пример #7
0
    def fit(self, X, y, categories):
        """Fit a Ridge model, retrying on fewer rows if fitting fails.

        The first ``self.train_shape`` rows of ``X`` are subsampled to 80%
        (fixed seed) and the remaining rows are appended unchanged. Features
        come from ``cat_fit_transform`` / ``num_fit_transform``. If fitting
        raises, the fit is retried on the trailing rows and, as a last resort,
        with the ``lsqr`` solver. The fallback that worked is remembered in
        ``self.do_sample`` / ``self.do_lsqr`` for later calls.

        Args:
            X: DataFrame of input columns.
            y: Target Series sharing ``X``'s index.
            categories: Iterable of column names treated as categorical.

        Returns:
            ``self``.

        Raises:
            ValueError: If neither transformer produced any features.
        """
        if self.good_cols is not None:
            X = X[self.good_cols]
            self.cat_cols = tuple(
                [col for col in categories if col in self.good_cols])
        else:
            self.cat_cols = tuple(categories)
        self.num_cols = tuple(
            [col for col in X.columns if col not in categories])

        # The row count seen on the first call is kept for later calls.
        if self.train_shape is None:
            self.train_shape = X.shape[0]

        X_sample = X.iloc[:self.train_shape].sample(frac=0.8,
                                                    random_state=2019)
        y_sample = y.loc[X_sample.index]

        X_test = X.iloc[self.train_shape:]
        # NOTE(review): this slice is label-based while the X slice above is
        # positional; they agree for a default RangeIndex -- confirm callers
        # always pass one.
        y_test = y.loc[self.train_shape:]

        X = pd.concat([X_sample, X_test], axis=0)
        y = pd.concat([y_sample, y_test], axis=0)

        del X_sample, y_sample, X_test, y_test
        gc.collect()

        cat_feats = self.cat_fit_transform(X, mode='fit_trans')
        num_feats = self.num_fit_transform(X, mode='fit_trans')

        # len() is used instead of truthiness because the transformers may
        # return arrays, whose truth value is ambiguous.
        if len(cat_feats) > 0 and len(num_feats) > 0:
            feats = np.concatenate([cat_feats, num_feats], axis=1)
        elif len(cat_feats) > 0:
            feats = cat_feats
        elif len(num_feats) > 0:
            feats = num_feats
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on `feats`.
            raise ValueError(
                'no categorical or numerical features were produced')

        feats = downcast(feats)

        self.model = Ridge(solver='svd', max_iter=300, alpha=self.best_alpha)

        # `except Exception` rather than a bare `except`, so Ctrl-C and
        # SystemExit are not swallowed; MemoryError and solver errors are
        # still caught.
        try:
            if not self.do_sample:
                self.model.fit(feats, y)
            else:
                m = min(int(feats.shape[0] / 2), 500000)
                if self.do_lsqr:
                    self.model = Ridge(solver='lsqr',
                                       max_iter=500,
                                       alpha=self.best_alpha)
                # Otherwise the svd model created above is used as is.
                self.model.fit(feats[-m:], y.iloc[-m:])
        except Exception:
            # Fallback 1: same model, trailing half (capped at 500000 rows).
            m = min(int(feats.shape[0] / 2), 500000)
            try:
                self.model.fit(feats[-m:], y.iloc[-m:])
                self.do_sample = True
            except Exception:
                # Fallback 2: same rows, lsqr solver.
                self.model = Ridge(solver='lsqr',
                                   max_iter=500,
                                   alpha=self.best_alpha)
                self.model.fit(feats[-m:], y.iloc[-m:])
                self.do_sample = True
                self.do_lsqr = True

        return self
Пример #8
0
 def transform(self, ss):
     """Return ``ss`` passed through the project ``downcast`` helper."""
     converted = downcast(ss)
     return converted
Пример #9
0
def c_values_cnt(ss):
    """Replace every value of ``ss`` by the number of times it occurs.

    Args:
        ss: Series of values to count.

    Returns:
        A Series aligned with ``ss`` holding each value's occurrence count,
        passed through ``downcast``. Values missing from
        ``ss.value_counts()`` map to NaN.
    """
    frequency = ss.value_counts()
    return downcast(ss.map(frequency))
Пример #10
0
def n_plus_n(ss_1, ss_2):
    """Return ``ss_1 + ss_2`` passed through ``downcast``."""
    return downcast(ss_1 + ss_2)
Пример #11
0
def n_minus_n(ss_1, ss_2):
    """Return ``ss_1 - ss_2`` passed through ``downcast``."""
    return downcast(ss_1 - ss_2)
Пример #12
0
def n_multiply_n(ss_1, ss_2):
    """Return ``ss_1 * ss_2`` passed through ``downcast``."""
    return downcast(ss_1 * ss_2)
Пример #13
0
def n_div_n(ss_1, ss_2):
    """Return ``ss_1 / ss_2`` passed through ``downcast``.

    No zero-divisor handling is done here; the result is whatever the
    operands' ``/`` operator yields.
    """
    return downcast(ss_1 / ss_2)
Пример #14
0
    def fit(self, X, y, categories):
        """Fit a Lasso model, retrying on fewer rows if fitting fails.

        The first ``self.train_shape`` rows of ``X`` are subsampled to 80%
        (fixed seed) and the remaining rows are appended unchanged. Features
        come from ``cat_fit_transform`` / ``num_fit_transform``. If fitting
        raises, the fit is retried on the trailing half of the rows and then
        on the trailing fifth (capped at 500000). The row count that worked is
        remembered in ``self.size`` and ``self.do_sample`` for later calls.

        Args:
            X: DataFrame of input columns.
            y: Target Series sharing ``X``'s index.
            categories: Iterable of column names treated as categorical.

        Returns:
            ``self``.

        Raises:
            ValueError: If neither transformer produced any features.
        """
        log(f'debug{self.good_cols}')
        if self.good_cols is not None:
            X = X[self.good_cols]
            self.cat_cols = tuple(
                [col for col in categories if col in self.good_cols])
        else:
            self.cat_cols = tuple(categories)
        self.num_cols = tuple(
            [col for col in X.columns if col not in categories])

        # The row count seen on the first call is kept for later calls.
        if self.train_shape is None:
            self.train_shape = X.shape[0]

        X_sample = X.iloc[:self.train_shape].sample(frac=0.8,
                                                    random_state=2020)
        y_sample = y.loc[X_sample.index]

        X_test = X.iloc[self.train_shape:]
        # NOTE(review): this slice is label-based while the X slice above is
        # positional; they agree for a default RangeIndex -- confirm callers
        # always pass one.
        y_test = y.loc[self.train_shape:]

        X = pd.concat([X_sample, X_test], axis=0)
        y = pd.concat([y_sample, y_test], axis=0)

        del X_sample, y_sample, X_test, y_test
        gc.collect()

        cat_feats = self.cat_fit_transform(X, mode='fit_trans')
        num_feats = self.num_fit_transform(X, mode='fit_trans')

        # len() is used instead of truthiness because the transformers may
        # return arrays, whose truth value is ambiguous.
        if len(cat_feats) > 0 and len(num_feats) > 0:
            feats = np.concatenate([cat_feats, num_feats], axis=1)
        elif len(cat_feats) > 0:
            feats = cat_feats
        elif len(num_feats) > 0:
            feats = num_feats
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on `feats`.
            raise ValueError(
                'no categorical or numerical features were produced')
        feats = downcast(feats)

        self.model = Lasso(alpha=self.best_alpha, max_iter=500)

        # `except Exception` rather than a bare `except`, so Ctrl-C and
        # SystemExit are not swallowed; MemoryError and solver errors are
        # still caught.
        try:
            if not self.do_sample:
                self.model.fit(feats, y)
            else:
                self.model.fit(feats[-self.size:], y.iloc[-self.size:])
        except Exception:
            try:
                # Fallback 1: trailing half of the rows.
                m = int(feats.shape[0] / 2)
                self.model.fit(feats[-m:], y.iloc[-m:])
                self.do_sample = True
                self.size = m
            except Exception:
                # Fallback 2: trailing fifth, capped at 500000 rows.
                m = min(int(feats.shape[0] / 5), 500000)
                self.model.fit(feats[-m:], y.iloc[-m:])
                self.size = m
                self.do_sample = True
        return self
Пример #15
0
def time_atr(ss: pd.Series, atr):
    """Extract one datetime component of ``ss`` by attribute name.

    Args:
        ss: Series that supports the ``.dt`` accessor.
        atr: Name of the ``.dt`` attribute to read (e.g. ``'hour'``).

    Returns:
        The attribute's values after ``downcast(..., accuracy_loss=False)``.
    """
    component = getattr(ss.dt, atr)
    return downcast(component, accuracy_loss=False)