Пример #1
0
    def __init__(self, sales):
        """Prepare flat, row-per-day arrays (targets, lag features, metadata).

        The behaviour depends on the module-level ``PROC_CSV_EXIST`` flag:

        * Falsy -- one-off feature generation. ``../input/sales_processed_f9.csv``
          is read, an ``x_28_month_median`` column is added, the result is
          written to ``../input/sales_processed_f10.csv`` and the process is
          terminated with ``sys.exit()``. Nothing below the branch runs in
          this mode; the lag columns and masks derived from ``sales`` are
          computed but not persisted.
        * Truthy -- the ``sales`` argument is discarded and replaced by the
          contents of ``../input/sales_processed_f10.csv``; lag features,
          split masks and the per-column arrays are then stored on ``self``.

        Args:
            sales: DataFrame. Only used in generation mode, where it must
                provide the ``demand``, ``scale1``, ``id``, ``nb`` and
                ``start`` columns. It is modified in place (``x`` and
                ``x_<lag>`` columns are added).

        Raises:
            SystemExit: always, in generation mode, once the CSV is written.
        """
        # Local import: sys.exit() replaces the interactive-only exit() helper.
        import sys

        print("start process_data...")

        def add_lags_and_masks(frame):
            """Add one ``x_<lag>`` column per entry of LAGS, drop the rows with
            ``nb <= start`` and store the train/validation/test masks on self.

            Returns the filtered frame; the masks are aligned with its rows.
            """
            self.FEATS = []
            for lag in tqdm(LAGS):
                frame[f"x_{lag}"] = frame.groupby("id")["x"].shift(lag)
                self.FEATS.append(f"x_{lag}")

            print(frame.shape)
            frame = frame.loc[frame.nb > frame.start]
            print(frame.shape)

            nb = frame['nb'].values
            max_lag = max(LAGS)

            # NOTE (from the original author): this is a "fake" validation.
            # The validation window (1913 < nb <= 1941) lies inside the
            # training upper bound (nb <= 1941), so validation rows are also
            # training rows and validation scores are optimistic.
            self.tr_mask = np.logical_and(nb > START + max_lag, nb <= 1941)
            self.val_mask = np.logical_and(nb > 1913, nb <= 1941)
            self.te_mask = np.logical_and(nb > 1941, nb <= 1969)
            return frame

        if not PROC_CSV_EXIST:
            sales["x"] = sales["demand"] / sales["scale1"]
            sales = add_lags_and_masks(sales)

            print('processing csv file...')

            # The input CSV must already contain the 'id', 'year', 'month'
            # and 'x_28' columns read by preprocess_sales_3.
            sales_f11 = pd.read_csv("../input/sales_processed_f9.csv",
                                    index_col=0)

            def preprocess_sales_3(df):
                """Add 'x_28_month_median': the median of 'x_28' per id within
                each (year, month) cell. Mutates and returns ``df``."""
                print('In process 3')
                months_unq = df['month'].unique().tolist()
                years_unq = df['year'].unique().tolist()

                for i in years_unq:
                    for y in months_unq:
                        # Build the cell mask once; it selects both the rows
                        # to aggregate and the rows to write back to.
                        in_cell = (df['year'] == i) & (df['month'] == y)
                        df.loc[in_cell, 'x_28_month_median'] = (
                            df.loc[in_cell]
                            .groupby(['id'])['x_28']
                            .transform('median')
                            .astype("float32"))
                return df

            sales_f11 = preprocess_sales_3(sales_f11)

            sales_f11.to_csv("../input/sales_processed_f10.csv", index=True)
            print('csv file processed.')
            # Generation mode stops here on purpose: rerun with
            # PROC_CSV_EXIST set to build the arrays from the new CSV.
            sys.exit()
        else:
            print('Reading csv file...')
            sales = pd.read_csv("../input/sales_processed_f10.csv",
                                index_col=0)
            # Drop the lag columns stored in the CSV; they are rebuilt below
            # for whatever LAGS currently contains.
            sales = sales.loc[:, ~sales.columns.isin(
                ['x_28', 'x_30', 'x_35', 'x_42', 'x_49', 'x_56', 'x_63'])]
            sales = LoadData.reduce_mem_usage(sales)
            gc.collect()
            print('Csv file opened.')

            sales = add_lags_and_masks(sales)

        # From here on ``sales`` is the filtered frame loaded from the CSV.
        self.scale = sales['scale1'].values
        self.ids = sales['id'].values
        self.ys = sales['x'].values

        self.feats_list = self.FEATS

        self.Z = sales[self.feats_list].values
        print(self.scale.shape, self.ids.shape, self.ys.shape, self.Z.shape)

        self.sv = self.scale[self.val_mask]
        self.se = self.scale[self.te_mask]
        # Keep only the ids of the test rows. The reshape assumes every id
        # contributes exactly 28 consecutive test rows -- TODO confirm
        # against the row ordering of the CSV.
        self.ids = self.ids[self.te_mask]
        self.ids = self.ids.reshape((-1, 28))

        # Calendar / event columns, kept 2-D (n_rows, 1) via the
        # double-bracket selection.
        self.ca = sales[['snap_CA']].values
        self.tx = sales[['snap_TX']].values
        self.wi = sales[['snap_WI']].values
        self.wday = sales[['wday']].values
        self.month = sales[['month']].values
        self.year = sales[['year']].values
        self.event = sales[['event_name']].values
        self.nday = sales[['nday']].values

        # Item hierarchy columns, same (n_rows, 1) layout.
        self.item = sales[['item_id']].values
        self.dept = sales[['dept_id']].values
        self.cat = sales[['cat_id']].values
        self.store = sales[['store_id']].values
        self.state = sales[['state_id']].values

        # Precomputed x_28 statistics read from the CSV.
        self.x_28_month_mean = sales[['x_28_month_mean']].values
        self.x_28_month_median = sales[['x_28_month_median']].values
        self.x_28_month_max = sales[['x_28_month_max']].values
        self.x_28_month_min = sales[['x_28_month_min']].values
        self.x_28_month_max_to_min_diff = sales[['x_28_month_max_to_min_diff'
                                                 ]].values
        self.x_28_wk_mean = sales[['x_28_wk_mean']].values
        self.x_28_wk_median = sales[['x_28_wk_median']].values
        self.x_28_wk_max = sales[['x_28_wk_max']].values
        self.x_28_wk_min = sales[['x_28_wk_min']].values
        self.x_28_wk_max_to_min_diff = sales[['x_28_wk_max_to_min_diff'
                                              ]].values
Пример #2
0
    def __init__(self, sales):
        """Prepare per-item arrays of shape (NITEMS, n_days, ...) from sales.

        The behaviour depends on the module-level ``PROC_CSV_EXIST`` flag:

        * Falsy -- features are generated from the ``sales`` argument:
          ``x = demand / scale1``, one ``x_<lag>`` column per entry of LAGS,
          and the ``x_28_*`` month / weekday statistics. The full frame is
          written to ``../input/sales_processed_f9.csv`` and the
          ``LIST_OF_FEATURE`` columns to
          ``../input/sales_processed_only_f9.csv``.
        * Truthy -- the ``sales`` argument is discarded and replaced by the
          contents of ``../input/sales_processed_f9.csv``, with the lag
          columns rebuilt for the current LAGS.

        In both modes the frame is then reshaped into per-item arrays stored
        on ``self`` (``scale``, ``ids``, ``ys``, ``Z``, ``sv``, ``se``, ``C``).

        Args:
            sales: DataFrame. In generation mode it must provide ``demand``,
                ``scale1``, ``id``, ``year``, ``month`` and ``wday`` columns
                and LAGS must contain 28, since the statistics are computed
                from ``x_28``. It is modified in place.
        """
        print("start process_data...")

        def add_lag_features(frame):
            """Add one per-id shifted ``x_<lag>`` column per entry of LAGS and
            record the column names in ``self.FEATS``."""
            self.FEATS = []
            for lag in tqdm(LAGS):
                frame[f"x_{lag}"] = frame.groupby("id")["x"].shift(lag)
                self.FEATS.append(f"x_{lag}")

        if not PROC_CSV_EXIST:
            sales["x"] = sales["demand"] / sales["scale1"]
            add_lag_features(sales)

            def preprocess_sales_2(sales):
                """Add per-id statistics of ``x_28`` to ``sales``.

                ``x_28_month_{mean,max,min}`` are computed within each
                (year, month) cell and ``x_28_wk_{mean,median,max,min}``
                within each (year, month, wday) cell, plus the two
                ``*_max_to_min_diff`` columns. Mutates and returns ``sales``.
                """
                if sales.empty:
                    # No cells to fill; nothing is added, as before.
                    return sales

                months_unq = sales['month'].unique().tolist()
                years_unq = sales['year'].unique().tolist()
                weeks_unq = sales['wday'].unique().tolist()

                def fill(rows, prefix, stats):
                    """Write ``<prefix><stat>`` for the rows selected by the
                    boolean mask ``rows``, slicing and grouping only once."""
                    grouped = sales.loc[rows].groupby(['id'])['x_28']
                    for stat in stats:
                        sales.loc[rows, prefix + stat] = \
                            grouped.transform(stat).astype("float32")

                # Pass 1: month-level statistics. Run as a separate pass so
                # the derived diff column is computed once and still precedes
                # the weekday columns, keeping the original column order.
                for i in years_unq:
                    for y in months_unq:
                        in_month = (sales['year'] == i) & (sales['month'] == y)
                        fill(in_month, 'x_28_month_', ('mean', 'max', 'min'))
                sales['x_28_month_max_to_min_diff'] = (
                    sales['x_28_month_max'] -
                    sales['x_28_month_min']).astype("float32")

                # Pass 2: weekday-within-month statistics.
                for i in years_unq:
                    for y in months_unq:
                        in_month = (sales['year'] == i) & (sales['month'] == y)
                        for z in weeks_unq:
                            in_wday = in_month & (sales['wday'] == z)
                            fill(in_wday, 'x_28_wk_',
                                 ('mean', 'median', 'max', 'min'))
                sales['x_28_wk_max_to_min_diff'] = (
                    sales['x_28_wk_max'] -
                    sales['x_28_wk_min']).astype("float32")
                return sales

            sales = preprocess_sales_2(sales)
            sales.to_csv("../input/sales_processed_f9.csv", index=True)
            sales[LIST_OF_FEATURE].to_csv(
                "../input/sales_processed_only_f9.csv", index=True)
        else:
            sales = pd.read_csv("../input/sales_processed_f9.csv", index_col=0)
            # Drop the lag columns stored in the CSV; they are rebuilt below
            # for whatever LAGS currently contains.
            sales = sales.loc[:, ~sales.columns.isin(
                ['x_28', 'x_30', 'x_35', 'x_42', 'x_49', 'x_56', 'x_63'])]
            sales = LoadData.reduce_mem_usage(sales)
            add_lag_features(sales)

        # The reshapes below assume the frame holds NITEMS equally long,
        # contiguous per-item series -- TODO confirm against the producer of
        # ``sales``.
        self.scale = sales['scale1'].values.reshape((NITEMS, -1))
        self.ids = sales['id'].values.reshape((NITEMS, -1))
        self.ys = sales[['x', 'sales1']].values.reshape((NITEMS, -1, 2))

        feats_list = self.FEATS
        self.Z = sales[feats_list].values.reshape(
            (NITEMS, -1, len(feats_list)))
        print(self.scale.shape, self.ids.shape, self.ys.shape, self.Z.shape)

        # Scales for the last two 28-column windows before index LEN:
        # [LEN-56, LEN-28) for validation and [LEN-28, LEN) for evaluation.
        self.sv = self.scale[:, LEN - 56:LEN - 28]
        self.se = self.scale[:, LEN - 28:LEN]

        self.C = sales[CATCOLS].values.reshape((NITEMS, -1, len(CATCOLS)))