Exemplo n.º 1
0
    def test_aggregate(self):
        """Check the column layout and per-group values of ``aggregate``.

        The fixture is grouped by ``a`` and ``b``; ``x`` and ``y`` are
        aggregated with ``mean`` and ``sum``. Only the ``x`` aggregates of
        the first two output rows are compared numerically.
        """
        group_keys = ['a', 'b']
        target_cols = ['x', 'y']
        agg_funcs = ['mean', 'sum']
        frame = pd.read_csv('./tests/data/dummy.csv')

        # Expected layout: for each target, one column per aggregation.
        expected_columns = [
            'mean_x_groupby_a_b',
            'sum_x_groupby_a_b',
            'mean_y_groupby_a_b',
            'sum_y_groupby_a_b',
        ]

        # Sums of ``x`` expected in output rows 0 and 1 (the first row is
        # expected to aggregate three values, the second a single one).
        x_total_row0 = 0.4 + 0.1 + 0.2
        x_total_row1 = 0.3
        expected_values = {
            (0, 'mean_x_groupby_a_b'): x_total_row0 / 3,
            (1, 'mean_x_groupby_a_b'): x_total_row1,
            (0, 'sum_x_groupby_a_b'): x_total_row0,
            (1, 'sum_x_groupby_a_b'): x_total_row1,
        }

        transform = BasicGroupByTransform(group_keys, target_cols, agg_funcs)
        result = transform.aggregate(frame, group_keys, target_cols,
                                     agg_funcs)

        assert result.columns.tolist() == group_keys + expected_columns
        for (row, column), expected in expected_values.items():
            assert result.loc[row, column] == approx(expected)
Exemplo n.º 2
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Build rank-within-release-year features from ``others['main']``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain the ``'main'`` frame.

        Returns:
            Frame with two columns: ``year_rank_rate`` (rank of ``id``
            inside its ``Year_of_Release`` group divided by the last column
            of the group-by output, presumably the per-year ``id`` count)
            and ``year_rank_plus`` (that rate plus ``Year_of_Release``).
        """
        year_key = 'Year_of_Release'
        frame = others['main'].copy()

        per_year = BasicGroupByTransform(keys=[year_key],
                                         targets=['id'],
                                         aggs=['count'])(frame)
        count_column = per_year.columns[-1]
        frame = pd.merge(frame, per_year, how='left', on=year_key)

        frame['year_rank'] = frame.groupby(year_key)['id'].rank()

        # Rows that received no aggregate from the merge get no rank either.
        without_count = frame.loc[:, count_column].isna()
        frame.loc[without_count, 'year_rank'] = np.nan

        frame['year_rank_rate'] = frame['year_rank'] / frame[count_column]
        frame['year_rank_plus'] = frame['year_rank_rate'] + frame[year_key]
        return frame.loc[:, ['year_rank_rate', 'year_rank_plus']]
Exemplo n.º 3
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Attach per-release-year mean/max/min aggregates to the main rows.

        The aggregates are computed over the concatenation of the ``'main'``
        and ``'another'`` frames after ``User_Score`` and ``Critic_Score``
        have been rescaled (divided by 10 and 100 respectively).

        Args:
            base: Not used by this method.
            others: Mapping that must contain ``'main'`` and ``'another'``.

        Returns:
            One row per row of ``others['main']`` holding every column of
            the group-by output except its first one (the key column).
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        if self.train:
            df_whole = pd.concat([df_main, df_another])
        else:
            df_whole = pd.concat([df_another, df_main])

        # 'tbd' is a textual placeholder that cannot be cast to float.
        is_tbd = df_whole['User_Score'] == 'tbd'
        df_whole.loc[is_tbd, 'User_Score'] = np.nan

        # Assign whole columns instead of ``df.loc[:, col] = ...``: on
        # pandas >= 2.0 the ``.loc`` form writes into the existing
        # object-dtype column in place, so it would stay ``object``
        # instead of becoming a numeric float32 column.
        df_whole['User_Score'] = (
            df_whole['User_Score'].astype(np.float32) / 10.0)
        df_whole['Critic_Score'] = df_whole['Critic_Score'] / 100.0

        transform = BasicGroupByTransform(keys=['Year_of_Release'],
                                          targets=TARGET_COLUMNS,
                                          aggs=['mean', 'max', 'min'])
        year_scores = transform(df_whole)
        df_main = pd.merge(df_main,
                           year_scores,
                           how='left',
                           on='Year_of_Release')
        # Column 0 of the aggregate is the key; everything after is a feature.
        return df_main.loc[:, year_scores.columns[1:]]
Exemplo n.º 4
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Compute, per grouping column, the share of each summed column.

        For every column in ``COLUMNS`` the ``SALES_COLUMNS`` are summed per
        group on the training frame (``'main'`` when ``self.train`` is set,
        otherwise ``'another'``). Each summed column except the last is then
        divided by the last summed column and stored as ``<name>_rate``.
        Presumably the last entry is the global total - confirm against
        ``SALES_COLUMNS``.

        Args:
            base: Not used by this method.
            others: Mapping with ``'main'`` (and ``'another'`` when not
                training).

        Returns:
            The ``*_rate`` columns joined onto the rows of ``'main'``.
        """
        df_main = others['main'].copy()
        df_train = others['main' if self.train else 'another'].copy()

        rate_columns = []
        for group_column in COLUMNS:
            sums = BasicGroupByTransform([group_column], SALES_COLUMNS,
                                         ['sum'])(df_train)
            total_column = sums.columns[-1]
            # Skip the key (first) and the total (last) column.
            part_columns = list(sums.columns[1:-1])
            for part in part_columns:
                sums[f'{part}_rate'] = sums[part] / sums[total_column]
            df_main = df_main.merge(sums, how='left', on=group_column)
            rate_columns += [f'{part}_rate' for part in part_columns]
        return df_main.loc[:, rate_columns]
Exemplo n.º 5
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Join ``target`` statistics grouped jointly by ``COLUMNS``.

        The mean, median, sum and std of ``target`` are computed on the
        training frame (``'main'`` when ``self.train`` is set, otherwise
        ``'another'``) and left-joined onto ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping with ``'main'`` (and ``'another'`` when not
                training).

        Returns:
            The last four columns of the group-by output, one row per row
            of ``'main'``.
        """
        stats = ['mean', 'median', 'sum', 'std']
        df_main = others['main'].copy()
        df_train = others['main' if self.train else 'another'].copy()

        encoded = BasicGroupByTransform(COLUMNS, ['target'], stats)(df_train)
        merged = pd.merge(df_main, encoded, how='left', on=COLUMNS)
        # One output column per statistic, located at the end of ``encoded``.
        return merged.loc[:, encoded.columns[-len(stats):]]
Exemplo n.º 6
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Join the per-group mean of ``target`` for each column in COLUMNS.

        Group means are computed on the training frame (``'main'`` when
        ``self.train`` is set, otherwise ``'another'``), one grouping column
        at a time, and left-joined onto ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping with ``'main'`` (and ``'another'`` when not
                training).

        Returns:
            One column per entry of ``COLUMNS`` (the last column of each
            group-by output), one row per row of ``'main'``.
        """
        df_main = others['main'].copy()
        df_train = others['main' if self.train else 'another'].copy()

        feature_names = []
        for group_column in COLUMNS:
            encoded = BasicGroupByTransform([group_column], ['target'],
                                            ['mean'])(df_train)
            feature_names.append(encoded.columns[-1])
            df_main = df_main.merge(encoded, how='left', on=group_column)
        return df_main.loc[:, feature_names]
Exemplo n.º 7
0
    def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args, **kwargs
    ) -> pd.DataFrame:
        """Join per-group means of ``SALES_COLUMNS`` for each grouping column.

        Means are computed on the training frame (``'main'`` when
        ``self.train`` is set, otherwise ``'another'``) and left-joined onto
        ``'main'`` once per entry of ``COLUMNS``.

        Args:
            base: Not used by this method.
            others: Mapping with ``'main'`` (and ``'another'`` when not
                training).

        Returns:
            Every non-key column of each group-by output, one row per row
            of ``'main'``.
        """
        df_main = others['main'].copy()
        df_train = others['main' if self.train else 'another'].copy()

        feature_names = []
        for group_column in COLUMNS:
            group_means = BasicGroupByTransform(
                keys=[group_column], targets=SALES_COLUMNS, aggs=['mean']
            )(df_train)
            df_main = df_main.merge(group_means, how='left', on=group_column)
            # The first column is the grouping key; the rest are features.
            feature_names += list(group_means.columns[1:])
        return df_main.loc[:, feature_names]
Exemplo n.º 8
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Join ``id`` counts per value of each column in ``COLUMNS``.

        Counts are taken over the concatenation of ``'main'`` and
        ``'another'`` (training data first) and left-joined onto ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain ``'main'`` and ``'another'``.

        Returns:
            One count column per entry of ``COLUMNS`` (the last column of
            each group-by output), one row per row of ``'main'``.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        ordered = ([df_main, df_another]
                   if self.train else [df_another, df_main])
        df_whole = pd.concat(ordered)

        feature_names = []
        for group_column in COLUMNS:
            counts = BasicGroupByTransform([group_column], ['id'],
                                           ['count'])(df_whole)
            feature_names.append(counts.columns[-1])
            df_main = df_main.merge(counts, how='left', on=group_column)
        return df_main.loc[:, feature_names]
Exemplo n.º 9
0
    def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args, **kwargs
    ) -> pd.DataFrame:
        """Flag rows whose publisher has at least 500 / 100 counted ids.

        The per-publisher ``id`` count is taken over the concatenation of
        ``'main'`` and ``'another'`` and left-joined onto ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain ``'main'`` and ``'another'``.

        Returns:
            Two boolean columns, ``count_over_500_Publisher`` and
            ``count_over_100_Publisher``, one row per row of ``'main'``.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        ordered = (
            [df_main, df_another] if self.train else [df_another, df_main]
        )
        df_whole = pd.concat(ordered)

        pub_count = BasicGroupByTransform(
            keys=['Publisher'], targets=['id'], aggs=['count']
        )(df_whole)
        count_column = pub_count.columns[-1]
        df_main = df_main.merge(pub_count, how='left', on='Publisher')

        # A missing count (NaN) compares as False against both thresholds.
        publisher_counts = df_main.loc[:, count_column]
        df_main['count_over_500_Publisher'] = publisher_counts >= 500
        df_main['count_over_100_Publisher'] = publisher_counts >= 100

        flags = ['count_over_500_Publisher', 'count_over_100_Publisher']
        return df_main.loc[:, flags]
Exemplo n.º 10
0
    def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args, **kwargs
    ) -> pd.DataFrame:
        """Join, per publisher, the distinct-value count of each COLUMNS entry.

        ``nunique`` is computed over the concatenation of ``'main'`` and
        ``'another'`` grouped by ``Publisher`` and left-joined onto
        ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain ``'main'`` and ``'another'``.

        Returns:
            One column per entry of ``COLUMNS`` (the last column of each
            group-by output), one row per row of ``'main'``.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        ordered = (
            [df_main, df_another] if self.train else [df_another, df_main]
        )
        df_whole = pd.concat(ordered)

        feature_names = []
        for target_column in COLUMNS:
            distinct = BasicGroupByTransform(
                keys=['Publisher'], targets=[target_column], aggs=['nunique']
            )(df_whole)
            df_main = df_main.merge(distinct, how='left', on='Publisher')
            feature_names.append(distinct.columns[-1])

        return df_main.loc[:, feature_names]
Exemplo n.º 11
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Fill missing ``COLUMNS`` values with platform/year group means.

        ``User_Score`` and ``Critic_Score`` are rescaled (divided by 10 and
        100) on both the main frame and the concatenation of ``'main'`` and
        ``'another'``. The per ``(Platform, Year_of_Release)`` means from the
        concatenated frame then replace the missing values in ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain ``'main'`` and ``'another'``.

        Returns:
            The ``COLUMNS`` of ``'main'`` after imputation.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        if self.train:
            df_whole = pd.concat([df_main, df_another])
        else:
            df_whole = pd.concat([df_another, df_main])

        for frame in (df_main, df_whole):
            # 'tbd' is a textual placeholder that cannot be cast to float.
            frame.loc[frame['User_Score'] == 'tbd', 'User_Score'] = np.nan
            # Assign whole columns instead of ``frame.loc[:, col] = ...``:
            # on pandas >= 2.0 the ``.loc`` form writes into the existing
            # object-dtype column in place, so the returned feature would
            # keep ``object`` dtype instead of float32.
            frame['User_Score'] = (
                frame['User_Score'].astype(np.float32) / 10.0)
            frame['Critic_Score'] = frame['Critic_Score'] / 100.0

        transform = BasicGroupByTransform(keys=['Platform', 'Year_of_Release'],
                                          targets=COLUMNS,
                                          aggs=['mean'])
        platform_scores = transform(df_whole)
        df_main = pd.merge(df_main,
                           platform_scores,
                           how='left',
                           on=['Platform', 'Year_of_Release'])

        for c in COLUMNS:
            # Only rows where the original value is missing are overwritten.
            c_na = df_main[c].isna()
            df_main.loc[c_na, c] = df_main.loc[
                c_na, f'mean_{c}_groupby_Platform_Year_of_Release']
        return df_main.loc[:, COLUMNS]
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Join the mean ``target`` of each row's k-means cluster.

        The cluster label is read from the
        ``kmeans_cluster_by_Publisher_pivotby_all`` column of ``base``,
        attached to a copy of ``others['main']`` and renamed to
        ``kmeans_cluster``; ``target`` is then averaged per cluster.

        The previous version also built a publisher pivot table from
        ``others['another']`` on every call, but its result was never read.
        That dead computation has been removed.

        Args:
            base: Frame holding the precomputed cluster label column; it is
                aligned with ``others['main']`` by index on assignment.
            others: Mapping that must contain the ``'main'`` frame.

        Returns:
            Single-column frame (the last column of the group-by output),
            one row per row of ``'main'``.
        """
        df_main = others['main'].copy()

        column_name = 'kmeans_cluster_by_Publisher_pivotby_all'
        # Copy just the needed column rather than the whole ``base`` frame.
        df_main[column_name] = base.loc[:, column_name].copy()
        df_main = change_column_name(df_main, column_name, 'kmeans_cluster')

        # NOTE(review): the mean is computed on ``df_main`` itself, so
        # ``target`` must be present even when ``self.train`` is False -
        # confirm against the callers.
        transform = BasicGroupByTransform(keys=['kmeans_cluster'],
                                          targets=['target'],
                                          aggs=['mean'])
        cluster_target = transform(df_main)
        df_main = pd.merge(df_main,
                           cluster_target,
                           how='left',
                           on='kmeans_cluster')
        return df_main.loc[:, [cluster_target.columns[-1]]]
Exemplo n.º 13
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Add each publisher's earliest release year and the years since it.

        The minimum ``Year_of_Release`` per ``Publisher`` is computed over
        the concatenation of ``'main'`` and ``'another'`` and left-joined
        onto ``'main'``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain ``'main'`` and ``'another'``.

        Returns:
            Two columns: the per-publisher minimum (last column of the
            group-by output) and ``publisher_year_from_first``, i.e.
            ``Year_of_Release`` minus that minimum.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        ordered = ([df_main, df_another]
                   if self.train else [df_another, df_main])
        df_whole = pd.concat(ordered)

        first_year = BasicGroupByTransform(keys=['Publisher'],
                                           targets=['Year_of_Release'],
                                           aggs=['min'])(df_whole)
        min_column = first_year.columns[-1]
        df_main = df_main.merge(first_year, how='left', on='Publisher')

        years_since_first = df_main['Year_of_Release'] - df_main[min_column]
        df_main['publisher_year_from_first'] = years_since_first

        return df_main.loc[:, [min_column, 'publisher_year_from_first']]