Exemplo n.º 1
0
 def __init__(self):
     """Instantiate every collaborator with no arguments and keep it on the instance."""
     # Attributes ending in "_s" hold the matching *Setting instances; the
     # remaining attributes hold the other classes constructed here.
     self.preproc_s = PreprocessSetting()
     self.smc_s = SalesModelCreationSetting()
     self.preproc = Preprocess()
     self.postproc = Postprocess()
     self.mmt = MergeMasterTable()
     self.mmt_s = MergeMasterTableSetting()
     self.chart_cli = ChartClient()
     self.gu = GroupingUnit()
     self.util = Util()
Exemplo n.º 2
0
 def __init__(self):
     """Instantiate every collaborator with no arguments and keep it on the instance."""
     # Attributes ending in "_s" hold the matching *Setting instances; the
     # remaining attributes hold the other classes constructed here.
     self.chart_cli = ChartClient()
     self.preproc_s = PreprocessSetting()
     self.sca_s = StoreCurrAnalysisSetting()
     self.preproc = Preprocess()
     self.sc = SrcConversion()
     self.gu = GroupingUnit()
     self.util = Util()
     self.mmt = MergeMasterTable()
     self.mmt_s = MergeMasterTableSetting()
 def __init__(self):
     """Instantiate every collaborator with no arguments and keep it on the instance."""
     # Attributes ending in "_s" hold the matching *Setting instances; the
     # remaining attributes hold the other classes constructed here.
     self.preproc_s = PreprocessSetting()
     self.mra_s = MultipleRegressionAnalysisSetting()
     self.preproc = Preprocess()
     self.postproc = Postprocess()
     self.mmt = MergeMasterTable()
     self.mmt_s = MergeMasterTableSetting()
     self.chart_cli = ChartClient()
     self.gu = GroupingUnit()
     self.util = Util()
Exemplo n.º 4
0
class SalesModelCreation:
    clf = linear_model.LinearRegression()

    def __init__(self):
        """Instantiate every collaborator with no arguments and keep it on the instance."""
        # Attributes ending in "_s" hold the matching *Setting instances; the
        # remaining attributes hold the other classes constructed here.
        self.preproc_s = PreprocessSetting()
        self.smc_s = SalesModelCreationSetting()
        self.preproc = Preprocess()
        self.postproc = Postprocess()
        self.mmt = MergeMasterTable()
        self.mmt_s = MergeMasterTableSetting()
        self.chart_cli = ChartClient()
        self.gu = GroupingUnit()
        self.util = Util()

    def execute(self):
        """Run the pipeline: preprocess the source data, then build the model.

        The preprocessed frame is stored on ``self.df_preproc`` so that
        ``_create_prediction_model`` can read it. Re-loading a previously
        exported CSV and the ``_postprocess`` step are currently disabled.
        """
        preprocessed = self._preprocess()
        # _preprocess() yields (frame, csv_file_name); only the frame is used.
        self.df_preproc, _csv_file_name = preprocessed
        self._create_prediction_model()

    def _preprocess(self):
        """Build the bill-level frame that the prediction model is fitted on.

        Returns:
            tuple: ``(df_grouped_by_bill, preproc_csv_file_name)``. The file
            name is always ``None`` because the CSV export is disabled here.
        """
        settings = self.preproc_s
        helper = self.preproc

        frame = helper.common_proc(settings)
        # The pivot is taken from the raw frame, before any column is added.
        item_pivot = helper.tanspose_cols_and_rows(
            frame,
            self.gu.DAY_BILL,
            settings.TGT_TRANPOSE_C_AND_R_COL,
            settings.TRANPOSE_C_AND_R_COUNT_COL,
        )

        # NOTE(review): '滞在時間' is rounded here and again further down,
        # after calc_entering_and_exiting_time() -- confirm both are needed.
        helper.dt_min_round(frame, '滞在時間', 20)
        frame['客構成'] = helper.create_cstm_strctr(frame)
        frame = self.mmt.merge_store(frame, self.mmt_s.F_PATH_STORE)
        frame = self.mmt.merge_weather(
            frame,
            self.mmt_s.DIR_WEATHER,
            settings.TGT_PERIOD_FLOOR,
            settings.TGT_PERIOD_TOP,
        )

        frame = helper.calc_entering_and_exiting_time(frame)
        # Opening / closing times are read from the first row of the frame.
        opening = frame.loc[0, '営業開始時間']
        closing = frame.loc[0, '営業締め時間']
        frame = helper.create_stay_presense(frame, opening, closing)

        for column, unit in (('注文時間', 10), ('滞在時間', 20)):
            helper.dt_min_round(frame, column, unit)

        # One row per bill (column-wise maximum), joined with the item pivot.
        per_bill = frame.groupby(self.gu.BILL).max().reset_index()
        per_bill = pd.merge(per_bill, item_pivot)

        return per_bill, None

    def _get_preproc_data(self, csv_file_name):
        """Read a cp932-encoded CSV from ``PROCESSED_DATA_DIR`` into a DataFrame.

        Args:
            csv_file_name: File name appended directly to the directory
                string (no separator is inserted).
        """
        csv_path = self.preproc_s.PROCESSED_DATA_DIR + csv_file_name
        return pd.read_csv(csv_path, encoding='cp932')

    def _create_prediction_model(self):
        """Prepare features from ``self.df_preproc``, then train and score a model.

        Drops date/ID/time columns in place, fills missing values, splits the
        data 80/20 with a fixed seed, and then hands data to ``Prediction``.

        NOTE(review): the second half of this method reads ``maxlen``,
        ``n_in``, ``n_out``, ``x_train``, ``t_train``, ``x_test`` and
        ``t_test``, none of which are assigned in this method (the split
        above produces ``X_train``, ``X_test``, ``y_train``, ``y_test``).
        Unless they are module-level globals this raises ``NameError`` --
        confirm the intended names before relying on this code.
        """
        # Remove columns that must not be used as features (mutates df_preproc).
        self.df_preproc.drop(columns=[
            'H.集計対象営業年月日', 'H.伝票番号', 'H.伝票発行日', 'H.伝票処理日', '滞在時間', 'D.価格',
            '注文時間', 'D.オーダー日時'
        ],
                             inplace=True)
        self.preproc.replace_missing_value(self.df_preproc)
        # self.df_preproc['滞在時間'] = (self.df_preproc['滞在時間'] / np.timedelta64(1, 'M')).astype(int)
        # 'H.伝票金額' is passed as the objective column; the rest become predictors.
        X, y = self.util.create_prd_and_obj_df_or_values(
            self.df_preproc, 'H.伝票金額', 'values', does_replace_dummy=True)
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=.2, random_state=0)
        # NOTE(review): the return value is ignored and _standardization() as
        # written only fits a local scaler, so X_train is printed unscaled.
        self._standardization(X_train)
        print(X_train)
        # self._create_model(X_train, X_test, y_train, y_test)
        n_hidden = 80  # output dimension
        epochs = 100  # number of epochs
        batch_size = 10  # mini-batch size

        # Model definition
        prediction = Prediction(maxlen, n_hidden, n_in, n_out)
        # Training
        model = prediction.train(x_train, t_train, batch_size, epochs)
        # Test
        score = model.evaluate(x_test,
                               t_test,
                               batch_size=batch_size,
                               verbose=1)
        print("score:", score)

        # Tally the accuracy and the semi-accuracy (rise/fall)
        preds = model.predict(x_test)
        correct = 0
        semi_correct = 0
        for i in range(len(preds)):
            # Compare the arg-max class index of the prediction and the target.
            pred = np.argmax(preds[i, :])
            tar = np.argmax(t_test[i, :])
            if pred == tar:
                correct += 1
            else:
                # Index sums of 1 or 5 count as "semi-correct" -- presumably
                # class pairs sharing a direction (rise/fall); TODO confirm.
                if pred + tar == 1 or pred + tar == 5:
                    semi_correct += 1

        # NOTE(review): both ratios divide by len(preds); empty predictions
        # would raise ZeroDivisionError.
        print("正答率:", 1.0 * correct / len(preds))
        print("準正答率(騰落):", 1.0 * (correct + semi_correct) / len(preds))

    def _standardization(self, X_train):
        """Standardize ``X_train`` and return the scaled data.

        Previously the scaler was fitted and then discarded, so the call had
        no observable effect. The input is still left unmodified; callers that
        ignore the return value behave exactly as before.

        Args:
            X_train: Training feature matrix accepted by ``StandardScaler``.

        Returns:
            The standardized copy of ``X_train`` produced by
            ``StandardScaler.fit_transform``.
        """
        scaler = StandardScaler()
        return scaler.fit_transform(X_train)
Exemplo n.º 5
0
class StoreCurrAnalysis:
    def __init__(self):
        """Instantiate every collaborator with no arguments and keep it on the instance."""
        # Attributes ending in "_s" hold the matching *Setting instances; the
        # remaining attributes hold the other classes constructed here.
        self.chart_cli = ChartClient()
        self.preproc_s = PreprocessSetting()
        self.sca_s = StoreCurrAnalysisSetting()
        self.preproc = Preprocess()
        self.sc = SrcConversion()
        self.gu = GroupingUnit()
        self.util = Util()
        self.mmt = MergeMasterTable()
        self.mmt_s = MergeMasterTableSetting()

    # def execute(self,tgt_store):
    def execute(self):
        tgt_store = [
            '大和乃山賊', '定楽屋', 'うおにく', 'かこい屋', 'くつろぎ屋', 'ご馳走屋名駅店', 'ご馳走屋金山店',
            '九州乃山賊小倉総本店', '和古屋', '楽屋', '鳥Bouno!', 'ぐるめ屋'
        ]
        # tgt_store = ['大和乃山賊', ]
        for s in tgt_store:
            self.sca_s.TGT_STORE = self.preproc_s.TGT_STORE = s
            self.sca_s.OUTPUT_DIR = './data/OUTPUT/' + self.sca_s.TGT_STORE + '/'
            self.preproc_s.DATA_FILES_TO_FETCH = [
                '売上データ詳細_' + self.preproc_s.TGT_STORE + '_20180401-0630.csv',
            ]
            self.preproc_s.PROCESSED_DATA_DIR = './data/Input/processed_data/' + self.preproc_s.TGT_STORE + '/'

            self.df_preproc, preproc_csv_file_name = self._preprocess()

            # preproc_csv_file_name = ''
            # self.df_preproc = self.preproc.fetch_csv_and_create_src_df(self.preproc_s.PROCESSED_DATA_DIR
            #                                                            , [preproc_csv_file_name])
            # df_grouped_src = self.df_preproc.groupby(self.cols).mean().reset_index()
            # df_daily = df_grouped_src[self.cols]
            # self.util.df_to_csv(df_daily, self.sca_s.OUTPUT_DIR, '大和乃山賊_サンプル.csv')

            self._output_store_curr_info(del_old_file=True)

            print(s + " is finish")

    def _preprocess(self):
        """Build the per-store source frame and export it as a processed CSV.

        The weather merge is disabled for this analysis.

        Returns:
            tuple: ``(df_src, preproc_csv_file_name)`` where the file name is
            whatever ``create_proc_data_csv`` returns.
        """
        settings = self.preproc_s
        helper = self.preproc

        frame = helper.common_proc(settings)
        frame = self.mmt.merge_store(frame, self.mmt_s.F_PATH_STORE)
        frame = helper.calc_entering_and_exiting_time(frame)

        # Opening / closing times are read from the first row of the frame.
        opening = frame.loc[0, '営業開始時間']
        closing = frame.loc[0, '営業締め時間']
        frame = helper.create_stay_presense(frame, opening, closing)

        # dt_min_round's return value is not used, matching the existing calls.
        for column, unit in (('注文時間', 10), ('滞在時間', 20)):
            helper.dt_min_round(frame, column, unit)

        csv_name = helper.create_proc_data_csv(
            frame,
            settings.PROCESSED_DATA_DIR,
            settings.TGT_STORE,
            settings.TGT_PERIOD_FLOOR,
            settings.TGT_PERIOD_TOP,
            memo=settings.FILE_MEMO,
        )
        return frame, csv_name

    def _output_store_curr_info(self, del_old_file=False):
        self.df_grouped_by_bill = self._create_df_grouped_by_bill()
        self.df_set_date_index = self._create_df_set_date_index()

        # self.output_dict = dict()
        # self._monthly_sales()
        # self._daily_cstm_info()
        # self._abc_analysis()
        # self._sheet_occupancy()
        #
        # if del_old_file and os.path.isfile(self.sca_s.OUTPUT_DIR + self.sca_s.OUTPUT_F_EXCEL):
        #     os.remove(self.sca_s.OUTPUT_DIR + self.sca_s.OUTPUT_F_EXCEL)
        # with pd.ExcelWriter(self.sca_s.OUTPUT_DIR + self.sca_s.OUTPUT_F_EXCEL) as writer:
        #     self.util.check_existing_and_create_excel_file(self.sca_s.OUTPUT_DIR + self.sca_s.OUTPUT_F_EXCEL)
        #     writer.book = load_workbook(self.sca_s.OUTPUT_DIR + self.sca_s.OUTPUT_F_EXCEL)
        #     [v_df.to_excel(writer, sheet_name=k, merge_cells=False) for k, v_df in self.output_dict.items()]

        self._plot_moving_avg()

    def _create_df_grouped_by_bill(self):
        return self.df_preproc.groupby(self.gu.BILL).max().reset_index()

    def _create_df_set_date_index(self):
        return self.df_grouped_by_bill.set_index(
            pd.DatetimeIndex(self.df_grouped_by_bill['H.集計対象営業年月日']))

    def _plot_moving_avg(self):
        df_daily = self.df_set_date_index.groupby(self.df_set_date_index.index). \
            agg(self.sca_s.GROUPING_WAY_DAILY)

        df_daily = self.util.moving_average(df_daily, 'H.伝票金額', 7)
        df_daily = self.util.moving_average(df_daily, 'H.客数(合計)', 7)
        self.chart_cli.plot_axis_is_index(df_daily,
                                          needsSave=True,
                                          file_path=self.sca_s.OUTPUT_DIR +
                                          '移動平均_' + self.sca_s.TGT_STORE +
                                          '.png')

    def _sheet_occupancy(self):
        time_cols = []
        curr_time = self.df_grouped_by_bill.loc[0, '営業開始時間']
        end_time = self.df_grouped_by_bill.loc[0, '営業締め時間']
        sheet_num = int(self.df_grouped_by_bill.loc[0, '席数'])
        while curr_time < end_time:
            if curr_time % 100 == 0:
                curr_time_plus30 = curr_time + 30
            else:
                curr_time_plus30 = curr_time + 70
            time_cols.append(str(curr_time) + '-' + str(curr_time_plus30))
            curr_time = curr_time_plus30

        df_timely_sheet_occupancy = self.df_set_date_index.groupby([
            self.df_set_date_index.index.year.rename('year'),
            self.df_set_date_index.index.month.rename('month'),
            self.df_set_date_index.index.day.rename('day')
        ])[time_cols].sum() / sheet_num

        self.output_dict.update({'座席占有率': df_timely_sheet_occupancy})

    def _monthly_sales(self):
        df_monthly_sales = self.df_set_date_index.groupby([
            self.df_set_date_index.index.year.rename('year'),
            self.df_set_date_index.index.month.rename('month')
        ])['H.伝票金額'].sum()

        self.output_dict.update({'月間売上': df_monthly_sales})

    def _daily_cstm_info(self):
        df_daily_cstm = self.df_set_date_index.groupby(self.df_set_date_index.index). \
            agg(self.sca_s.GROUPING_WAY_DAILY_CSTM)
        self.output_dict.update({'日別客情報': df_daily_cstm})

        # self.chart_cli.create_pie_chart(
        #     df=self.preproc.grouping(self.df_preproc, self.sca_s.GROUPING_KEY_ITEM_CATEGORY2,
        #                              self.sca_s.GROUPING_WAY, self.sca_s.PIE_CHART_SET[0]),
        #     amount_col=self.sca_s.PIE_CHART_SET[1])

        # 時系列カラムをインデックスに指定する必要がある
        # self.chart_cli.time_series_graph(self.df_preproc,
        #                                  amount_cols_li=self.df_preproc[self.sca_s.TIME_SERIES_GRAPH_MONTHLY])
        # self.chart_cli.time_series_graph(self.df_preproc,
        #                                  amount_cols_li=self.df_preproc[self.sca_s.TIME_SERIES_GRAPH_DAYLY])
        #

        # self.chart_cli.plotfig()
        # self.chart_cli.savefig(self.sca_s.OUTPUT_DIR + self.sca_s.FIG_FILE_NAME)
        # self.chart_cli.closefig()

    def _abc_analysis(self):
        [
            self.sales_and_ratio_by_key(k, True)
            for k in self.sca_s.ABC_BILL_LEVEL_KEY
        ]
        [
            self.sales_and_ratio_by_key(k, False)
            for k in self.sca_s.ABC_NO_BILL_LEVEL_KEY
        ]

    def sales_and_ratio_by_key(self, key_li, bill_level_summary=True):
        df_grouped = self.df_preproc.groupby(key_li)
        df_sales_by_key = df_grouped.agg({'D.価格': np.sum, "H.客数(合計)": np.mean})
        df_sales_by_key['売上比率'] = df_sales_by_key['D.価格'] / int(
            df_sales_by_key['D.価格'].sum())

        if bill_level_summary:
            df_tmp = self.df_preproc.groupby(
                self.gu.BILL + key_li).mean().reset_index().set_index(
                    key_li, drop=True)
            s_count_by_key = df_tmp.groupby(key_li).size()
        else:
            s_count_by_key = df_grouped.size()
        s_count_by_key.name = 'count_' + '_'.join(key_li)
        s_ratio_by_key = pd.Series(s_count_by_key / s_count_by_key.sum(),
                                   name='ratio_' + '_'.join(key_li))

        df_merged = pd.concat(
            [df_sales_by_key, s_count_by_key, s_ratio_by_key], axis=1)
        df_merged = self.preproc.sort_df(df_merged,
                                         ['count_' + '_'.join(key_li)],
                                         [False])
        df_merged["平均支払額"] = df_merged['D.価格'] / df_merged['count_' +
                                                           '_'.join(key_li)]

        if key_li in self.sca_s.CALC_PRICE_PER_CSTM:
            df_merged["客単価"] = df_merged["平均支払額"] / df_merged["H.客数(合計)"]
        else:
            df_merged.drop(columns="H.客数(合計)", inplace=True)

        # self.chart_cli.create_pie_chart(df=df_merged, amount_col='D.価格')
        # self.chart_cli.savefig(self.sca_s.OUTPUT_DIR, 'ABC分析_売上構成比.png')

        self.output_dict.update({'ABC分析_' + '_'.join(key_li): df_merged})
Exemplo n.º 6
0
class CausalAnalysis:
    def __init__(self):
        """Instantiate every collaborator with no arguments and keep it on the instance."""
        # Attributes ending in "_s" hold the matching *Setting instances; the
        # remaining attributes hold the other classes constructed here.
        self.chart_cli = ChartClient()
        self.preproc_s = PreprocessSetting()
        self.ca_s = CausalAnalysisSetting()
        self.preproc = Preprocess()
        self.sc = SrcConversion()
        self.gu = GroupingUnit()
        self.util = Util()
        self.mmt = MergeMasterTable()
        self.mmt_s = MergeMasterTableSetting()

    # def execute(self,tgt_store):
    def execute(self):
        tgt_store = [
            '大和乃山賊', '定楽屋', 'うおにく', 'かこい屋', 'くつろぎ屋', 'ご馳走屋名駅店', 'ご馳走屋金山店',
            '九州乃山賊小倉総本店', '和古屋', '楽屋', '鳥Bouno!', 'ぐるめ屋'
        ]
        # tgt_store = ['sample', ]
        # tgt_store = ['大和乃山賊']
        for s in tgt_store:
            self.ca_s.TGT_STORE = self.preproc_s.TGT_STORE = s
            self.ca_s.OUTPUT_DIR = './data/OUTPUT/' + self.ca_s.TGT_STORE + '/'
            self.preproc_s.DATA_FILES_TO_FETCH = [
                '売上データ詳細_' + self.preproc_s.TGT_STORE + '_20180401-0630.csv',
            ]
            self.preproc_s.PROCESSED_DATA_DIR = './data/Input/processed_data/' + self.preproc_s.TGT_STORE + '/'

            self.df_preproc, preproc_csv_file_name = self._preprocess()

            # preproc_csv_file_name = ''
            # self.df_preproc = self.preproc.fetch_csv_and_create_src_df(self.preproc_s.PROCESSED_DATA_DIR
            #                                                            , [preproc_csv_file_name])
            # df_grouped_src = self.df_preproc.groupby(self.cols).mean().reset_index()
            # df_daily = df_grouped_src[self.cols]
            # self.util.df_to_csv(df_daily, self.ca_s.OUTPUT_DIR, '大和乃山賊_サンプル.csv')

            # self._leveling_by_day_sales_up()
            df_leveled = self._leveling_sales(self.df_preproc)
            self.t_test(df_leveled, self.ca_s.T_TEST_TGT_COL,
                        self.ca_s.T_TEST_DIFF_COL,
                        self.ca_s.T_TEST_DIFF_CONDITION, True)

            print(s + " is finish")

    def _preprocess(self):
        """Build the per-store source frame and export it as a processed CSV.

        Store, weather and calendar master data are merged in that order
        before entering/exiting times are calculated. Stay-presence creation
        and time rounding are disabled for this analysis.

        Returns:
            tuple: ``(df_src, preproc_csv_file_name)`` where the file name is
            whatever ``create_proc_data_csv`` returns.
        """
        settings = self.preproc_s
        merge_cfg = self.mmt_s

        frame = self.preproc.common_proc(settings)
        frame = self.mmt.merge_store(frame, merge_cfg.F_PATH_STORE)
        frame = self.mmt.merge_weather(
            frame,
            merge_cfg.DIR_WEATHER,
            settings.TGT_PERIOD_FLOOR,
            settings.TGT_PERIOD_TOP,
        )
        frame = self.mmt.merge_calender(frame, merge_cfg.F_PATH_CALENDER)
        frame = self.preproc.calc_entering_and_exiting_time(frame)

        csv_name = self.preproc.create_proc_data_csv(
            frame,
            settings.PROCESSED_DATA_DIR,
            settings.TGT_STORE,
            settings.TGT_PERIOD_FLOOR,
            settings.TGT_PERIOD_TOP,
            memo=settings.FILE_MEMO,
        )
        return frame, csv_name

    def t_test(self,
               df,
               index_col,
               diff_tgt_col,
               diff_condition,
               does_output_csv=False):
        df_t_test_rslt = pd.DataFrame(columns=[
            'item', 'src_count', 'src_avg', 'tgt_count', 'tgt_avg', 't', 'p'
        ])
        df.set_index(index_col, inplace=True)
        for c in self.ca_s.CALC_TGT_COLS:
            df_src = df[df[diff_tgt_col] != diff_condition][c + '_平準化']
            df_tgt = df[df[diff_tgt_col] == diff_condition][c + '_平準化']

            for item in df.index.unique().tolist():
                # welch's t-test
                df_src_by_item = df_src[df_src.index == item]
                df_tgt_by_item = df_tgt[df_tgt.index == item]
                t, p = stats.ttest_ind(df_src_by_item,
                                       df_tgt_by_item,
                                       equal_var=False)
                df_t_test_rslt = df_t_test_rslt.append(
                    pd.Series([
                        item,
                        df_src_by_item.count(),
                        df_src_by_item.mean(),
                        df_tgt_by_item.count(),
                        df_tgt_by_item.mean(), t, p
                    ],
                              index=df_t_test_rslt.columns),
                    ignore_index=True).sort_values('p')
            if does_output_csv:
                self.util.df_to_csv(df_t_test_rslt, self.ca_s.OUTPUT_DIR,
                                    c + '_t検定.csv')

    def _leveling_sales(self, df_src):
        df_calc_src, calc_tgt_dict = self._calc_tgt_sales(
            self.ca_s.SUB_GROUP_COLS, self.ca_s.MAIN_GROUP_COLS,
            self.ca_s.CALC_TGT_COLS, self.ca_s.DIFF_TGT_COL,
            self.ca_s.DIFF_CONDITION)
        df_leveling_ratio = self._calc_sales_diff(df_calc_src,
                                                  self.ca_s.CALC_TGT_COLS,
                                                  does_output_csv=True)
        df_merged_ratio = pd.merge(df_src,
                                   df_leveling_ratio,
                                   on=self.ca_s.MAIN_GROUP_COLS)
        for c in calc_tgt_dict.keys():
            df_merged_ratio[c + '_平準化'] = df_merged_ratio.apply(
                lambda x: x[c] // x[c + '_増加率']
                if x[self.ca_s.DIFF_TGT_COL
                     ] == self.ca_s.DIFF_CONDITION else x[c],
                axis=1)
        return df_merged_ratio

    def _calc_tgt_sales(self, sub_group_cols: list, main_group_cols: list,
                        calc_tgt_cols: list, diff_tgt_col: str,
                        diff_condition):
        calc_tgt_dict = dict()
        for c in calc_tgt_cols:
            calc_tgt_dict.update({c: [c + '_sum', c + '_count']})
        df_grouped = self.df_preproc[sub_group_cols + [diff_tgt_col] +
                                     calc_tgt_cols]
        drop_cols = [c for c in sub_group_cols if c not in main_group_cols
                     ] + [diff_tgt_col]

        df_normal = df_grouped[
            df_grouped[diff_tgt_col] != diff_condition].groupby(
                sub_group_cols + [diff_tgt_col]).sum().reset_index().drop(
                    drop_cols, axis=1)
        df_special = df_grouped[
            df_grouped[diff_tgt_col] == diff_condition].groupby(
                sub_group_cols + [diff_tgt_col]).sum().reset_index().drop(
                    drop_cols, axis=1)
        dfs = [df_normal, df_special]
        for idx, df in enumerate(dfs):
            df = df.groupby(main_group_cols).agg(['sum',
                                                  'count']).reset_index()
            df.columns = [
                '_'.join(c) if c[1] != '' else c[0] for c in df.columns
            ]
            for k, v_list in calc_tgt_dict.items():
                df[k + '_売上/日数'] = df[v_list[0]] / df[v_list[1]]
            dfs[idx] = df
        df_calc_src = pd.merge(
            dfs[0],
            dfs[1],
            on=main_group_cols,
            how='outer',
            suffixes=('_normal', '_special')).set_index(main_group_cols)
        return df_calc_src, calc_tgt_dict

    def _calc_sales_diff(self,
                         df_calc_src,
                         calc_tgt_cols,
                         does_output_csv=False):
        return_cols = []
        for c in calc_tgt_cols:
            # nan -> 1
            df_calc_src[c +
                        '_増加率'] = (df_calc_src[c + '_売上/日数_special'] /
                                   df_calc_src[c + '_売上/日数_normal']).replace(
                                       np.nan, 1)
            return_cols.append(c + '_増加率')
        if does_output_csv:
            index_names = '_'.join(df_calc_src.index.names)
            [
                self.util.df_to_csv(df_calc_src[c], self.ca_s.OUTPUT_DIR,
                                    index_names + '_' + c + '.csv', True)
                for c in return_cols
            ]
        return df_calc_src[return_cols].reset_index()