Exemplo n.º 1
0
def train_model(X_train, X_test, y_train, y_test, sample_weight,
                additional_trees):
    """Fit a CatBoost RMSE regressor on weighted training data.

    Args:
        X_train: Training features; must contain the categorical columns
            'year', 'month', 'store_type' and 'activity_desc'.
        X_test: Validation features with the same columns.
        y_train: Training target.
        y_test: Validation target.
        sample_weight: Per-row weights attached to the training pool.
        additional_trees: Forwarded to ``fit`` as ``early_stopping_rounds``.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    categorical_columns = ['year', 'month', 'store_type', 'activity_desc']

    # Only the training pool is weighted; the validation pool is not.
    fit_pool = catpool(
        data=X_train,
        label=y_train,
        cat_features=categorical_columns,
        weight=sample_weight,
    )
    holdout_pool = catpool(
        data=X_test,
        label=y_test,
        cat_features=categorical_columns,
    )

    # The learning rate is left to CatBoost's default. A Tweedie loss
    # ('Tweedie:variance_power=1.9') was tried previously as an alternative
    # to RMSE.
    regressor_settings = {
        'iterations': 1000,
        'loss_function': 'RMSE',
        'one_hot_max_size': 50,
        'eval_metric': 'RMSE',
        'verbose': 100,
    }
    regressor = CatBoostRegressor(**regressor_settings)

    # use_best_model is switched off explicitly while early stopping is on.
    regressor.fit(
        X=fit_pool,
        eval_set=holdout_pool,
        use_best_model=False,
        early_stopping_rounds=additional_trees,
    )
    return regressor
Exemplo n.º 2
0
def train_model(X_train, X_test, y_train, y_test, sample_weight):
    """Fit a CatBoost RMSE regressor with a fixed 0.1 learning rate.

    Args:
        X_train: Training features; must contain the categorical columns
            'year', 'month', 'store_type' and 'activity_desc'.
        X_test: Validation features with the same columns.
        y_train: Training target.
        y_test: Validation target.
        sample_weight: Per-row weights attached to the training pool.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    categorical_columns = ['year', 'month', 'store_type', 'activity_desc']

    # Weights apply to the training pool only.
    fit_pool = catpool(
        data=X_train,
        label=y_train,
        cat_features=categorical_columns,
        weight=sample_weight,
    )
    holdout_pool = catpool(
        data=X_test,
        label=y_test,
        cat_features=categorical_columns,
    )

    # A Tweedie loss ('Tweedie:variance_power=1.9') was tried previously as
    # an alternative to RMSE.
    regressor_settings = {
        'iterations': 1000,
        'learning_rate': 0.1,
        'loss_function': 'RMSE',
        'one_hot_max_size': 50,
        'eval_metric': 'RMSE',
        'verbose': 200,
    }
    regressor = CatBoostRegressor(**regressor_settings)

    # Early stopping patience is 50 rounds; use_best_model is switched off
    # explicitly.
    regressor.fit(
        X=fit_pool,
        eval_set=holdout_pool,
        use_best_model=False,
        early_stopping_rounds=50,
    )
    return regressor
Exemplo n.º 3
0
def train_model(X_train, X_test, y_train, y_test):
    """Fit an unweighted CatBoost RMSE regressor with early stopping.

    Args:
        X_train: Training features.
        X_test: Validation features.
        y_train: Training target.
        y_test: Validation target.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # No categorical features or sample weights are declared in this variant.
    fit_pool = catpool(data=X_train, label=y_train)
    holdout_pool = catpool(data=X_test, label=y_test)

    # The learning rate is left to CatBoost's default; training is silent.
    regressor_settings = {
        'iterations': 500,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'verbose': False,
    }
    regressor = CatBoostRegressor(**regressor_settings)

    regressor.fit(X=fit_pool, eval_set=holdout_pool, early_stopping_rounds=20)
    return regressor
Exemplo n.º 4
0
def retrain_model(X, y, trees, learning_rate, sample_weight):
    """Fit a CatBoost RMSE regressor on all data with fixed hyper-parameters.

    No validation set is used: the tree count and learning rate are supplied
    by the caller (per the original note, they depend on a prior model).

    Args:
        X: Features; must contain the categorical columns 'year', 'month',
            'store_type' and 'activity_desc'.
        y: Target.
        trees: Number of boosting iterations.
        learning_rate: Learning rate passed to the regressor.
        sample_weight: Per-row weights attached to the training pool.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    categorical_columns = ['year', 'month', 'store_type', 'activity_desc']
    full_pool = catpool(
        data=X,
        label=y,
        cat_features=categorical_columns,
        weight=sample_weight,
    )

    # A Tweedie loss ('Tweedie:variance_power=1.5') was tried previously as
    # an alternative to RMSE.
    regressor_settings = {
        'iterations': trees,
        'learning_rate': learning_rate,
        'loss_function': 'RMSE',
        'one_hot_max_size': 50,
        'eval_metric': 'RMSE',
        'verbose': 200,
    }
    regressor = CatBoostRegressor(**regressor_settings)

    regressor.fit(X=full_pool)
    return regressor
Exemplo n.º 5
0
def _print_offer_details(offer_codes, item):
    """Print the ``prom_info`` rows for each offer code of ``item``.

    For every code this prints a markdown table of the rows matching both the
    offer and the item, followed by the ``item_desc`` of all rows in the offer.
    """
    detail_columns = ['item_desc', 'offer_start_date', 'offer_end_date', 'promotion_merch',
                      'buyer_note', 'retail_price', 'prom_price', 'unit_price']
    for offer in offer_codes:
        in_offer = prom_info.offer_code == int(offer)
        tmp = prom_info[in_offer & (prom_info.item == int(item))][detail_columns]
        print(tmp.to_markdown(showindex=False))
        print(prom_info[in_offer].item_desc)


def _add_num_items_in_offer(df, num_items_dict):
    """Return ``df`` left-merged with the per-offer item counts.

    Missing offer codes become 0 and offers without a count get 100.
    """
    # Assign the result back instead of `df.col.fillna(..., inplace=True)`:
    # that form is chained assignment, which pandas warns about and which does
    # not modify `df` when Copy-on-Write is enabled.
    df['offer_code'] = df['offer_code'].fillna('0').astype(int)
    df = df.merge(num_items_dict, how='left', on='offer_code')
    df['num_items_in_offer'] = df['num_items_in_offer'].fillna(100)
    return df


def _polynomial_date_weights(start, end):
    """Map each day in [start, end] to a weight rising from 0 to 1 (4th power)."""
    days = pd.date_range(start, end)
    return dict(zip(days, np.linspace(0, 1, len(days)) ** 4))


def main(item, threshold):
    """Train, validate and (if validation passes) retrain and predict for one item.

    All stdout/stderr output produced while this function runs is redirected
    to ``{log_path}{item}.log``. The function relies on module-level
    configuration and helpers (``log_path``, ``data_path``, ``item_list``,
    ``prom_info``, ``lunar``, the ``train_val_*``/``test_*``/``v1_*``/``v2_*``
    date bounds, ``load_data``, ``clean_data``, ``train_model``, ...).

    Args:
        item: Item code. Used to build the data and log file names, and cast
            with ``int(item)`` when matching ``prom_info.item``.
        threshold: Minimum share of ``item_wh`` rows whose ``ratio`` lies in
            [0.65, 1.2] for the item to be retrained, predicted and exported.

    Returns:
        None. Results are written through ``export_results``; any exception
        raised inside the pipeline is caught and its traceback is printed to
        the log file.
    """
    # REDIRECT STDOUT AND STDERR TO LOG FILE
    with open(f"{log_path}{item}.log", "w") as train_log, \
            contextlib.redirect_stdout(train_log), \
            contextlib.redirect_stderr(train_log):
        try:
            print(f"ITEM {item} STARTING AT {now()}, PROCESS: {item_list.index(item)+1} OUT OF {len(item_list)}")
            print(f"Loading data from {data_path}")
            raw_train_val_df, raw_test_df = load_data(
                f"{data_path}{item}.csv.gz",
                train_start=train_val_start, val_end=train_val_end,
                test_start=test_start, test_end=test_end)

            # PRINT TIME RANGE
            print(f"train_val_df: {raw_train_val_df.day_dt.min()} to {raw_train_val_df.day_dt.max()}")
            print(f"test_df: {raw_test_df.day_dt.min()} to {raw_test_df.day_dt.max()}")

            # IF EMPTY, STOP PROCESSING THIS ITEM
            # Training data is everything strictly before the first validation day.
            if len(raw_train_val_df[raw_train_val_df.day_dt < v1_start]) == 0:
                print("EMTPY TRAIN AND VAL DATA >>>")
                return None
            if len(raw_test_df) == 0:
                print("EMPTY TEST DATA >>>")
                return None

            # PRINT OFFER DESC FROM TRAIN, VAL AND TEST
            # Purely informational: a failure here must not abort the training run.
            try:
                print("\nOFFERS IN TRAIN AND VAL DATA:")
                _print_offer_details(raw_train_val_df.offer_code.dropna().unique(), item)

                print("\nOFFERS IN TEST DATA:")
                _print_offer_details(raw_test_df.offer_code.dropna().unique(), item)

            except Exception as e:
                print(f"Error when printing offer desc: {e}")

            # MAKE NEW COPIES OF DATAFRAMES
            train_val_df, test_df = raw_train_val_df.copy(), raw_test_df.copy()
            del raw_train_val_df, raw_test_df

            # ADD NUM ITEMS TO TRAIN AND TEST
            num_items_dict = (prom_info.groupby(['offer_code']).item.count()
                              .to_frame().reset_index()
                              .rename({'item': 'num_items_in_offer'}, axis=1))
            train_val_df = _add_num_items_in_offer(train_val_df, num_items_dict)
            test_df = _add_num_items_in_offer(test_df, num_items_dict)

            # ADD LUNAR NEW YEAR FEATURE
            train_val_df = train_val_df.merge(lunar, how='left', on='day_dt')
            test_df = test_df.merge(lunar, how='left', on='day_dt')

            # ADD PROMOTION SCHEDULE FEATURE
            prom_schedule = get_prom_schedule(item=item, prom_info=prom_info)  # schedule for the current item
            train_val_df = train_val_df.merge(prom_schedule, how='left', on='day_dt')
            test_df = test_df.merge(prom_schedule, how='left', on='day_dt')
            # Days without a schedule entry get -1 (assigned back, not chained inplace).
            train_val_df['day_of_prom'] = train_val_df['day_of_prom'].fillna(-1)
            test_df['day_of_prom'] = test_df['day_of_prom'].fillna(-1)

            # DEFINE FEATURES TO SELECT
            index_features = ['day_dt', 'store_id', 'item_code', 'loc_wh']

            X_features = ['year', 'month', 'day_of_prom', 'weekday', 'day_type', 'is_5th', 'store_type', 'activity_desc',
                          'prom0', 'prom1', 'prom2', 'prom3', 'prom4', 'prom5', 'prom6', 'prom7', 'is_vip', 'free_gift',
                          'required_num', 'unit_price', 'p_rate', 'dis_spring', 'num_items_in_offer', 'loc_selling_area', 'pds_grace',
                          ]

            y_feature = ['ttl_quantity']

            print(f"\nSELECTED FEATURES: {X_features}")

            print("Cleaning train data...")
            train_val_df = clean_data(train_val_df)
            train_val_df = impute_data(train_val_df)
            train_val_df = select_features(index_features+X_features+y_feature, train_val_df)
            train_val_df = label_encoder(train_val_df, X_features)

            print("Cleaning test data...")
            test_df = clean_data(test_df)
            test_df = impute_data(test_df)
            test_df = select_features(index_features+X_features+y_feature, test_df)
            test_df = label_encoder(test_df, X_features)

            # SORT train_val_df FOR WEIGHT ASSIGNMENT
            train_val_df = train_val_df.sort_values('day_dt')

            # SPLIT TRAIN DATA INTO TRAIN AND VALIDATION
            train_start, train_end, val_start, val_end = train_val_start, v1_start, v1_start, v2_end
            dates = train_val_df.day_dt
            # train_end and val_start are the same day (v1_start). `between` is
            # inclusive on both ends, so using it for both sets would put that
            # day in training AND validation; training stops strictly before it.
            train_index = train_val_df[(dates >= train_start) & (dates < train_end)].index
            val_index = train_val_df[dates.between(val_start, val_end)].index

            # ASSIGN SAMPLE WEIGHT BY DAY_DT BEFORE VALIDATION SET
            print(f"ASSIGNING SAMPLE WEIGHT FOR TRAIN DATA FROM {train_start} TO {train_end}")
            weight_dict = _polynomial_date_weights(train_start, train_end)
            sample_weight_train = [weight_dict[day] for day in train_val_df.loc[train_index].day_dt]

            # ASSIGN SAMPLE WEIGHT BY DAY_DT BEFORE TEST SET
            print(f"ASSIGNING SAMPLE WEIGHT FOR TRAIN AND VAL FROM {train_start} TO {val_end}")
            weight_dict = _polynomial_date_weights(train_start, val_end)
            sample_weight_train_val = [weight_dict[day] for day in train_val_df.day_dt]

            # Define X and y
            # .copy() so the 'year' rewrite below targets an independent frame
            # rather than a slice of train_val_df.
            X = train_val_df[X_features].copy()
            # 2021 is relabelled as 2020 -- presumably so the categorical 'year'
            # only takes values the model has seen; TODO confirm.
            X['year'] = X['year'].apply(lambda x: 2020 if int(x) == 2021 else x)
            y = train_val_df[y_feature[0]]
            X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[val_index], y.loc[train_index], y.loc[val_index]
            print("Training model...")
            model = train_model(X_train, X_test, y_train, y_test, sample_weight=sample_weight_train)
            print("FEATURE IMPORTANCE - PREDICTION VALUES CHANGE:")
            print(pd.Series(index=X_train.columns,
                            data=model.get_feature_importance(type='PredictionValuesChange')).sort_values(ascending=False))
            print("FEATURE IMPORTANCE - LOSS FUNCTION CHANGE:")
            print(pd.Series(index=X_train.columns,
                            data=model.get_feature_importance(
                                type='LossFunctionChange',
                                data=catpool(X_train, y_train, cat_features=['year', 'month', 'store_type', 'activity_desc']))
                            ).sort_values(ascending=False))

            # Tree budget reused for retraining. NOTE(review): the +50 looks
            # like it mirrors train_model's early-stopping patience -- confirm.
            n_trees = model.best_iteration_ + 50
            print(f"Evaluating model with {n_trees} trees and {round(model.learning_rate_,4)} learning rate...")
            y_pred_val = model.predict(X_test)
            y_pred_train = model.predict(X_train)

            # GET EVALUATE RESULTS
            # Copies, because a prediction column is added to each slice.
            train_evaluate = train_val_df.loc[train_index].copy()
            val_evaluate = train_val_df.loc[val_index].copy()
            train_evaluate['y_pred'] = y_pred_train
            val_evaluate['y_pred'] = y_pred_val

            # VALIDATION RESULTS
            item_wh = agg_results(df=val_evaluate, v1_start=v1_start, v1_end=v1_end, v2_start=v2_start, v2_end=v2_end)
            # Share of rows whose ratio falls inside [0.65, 1.2].
            val_passed = np.mean([0.65 <= r <= 1.2 for r in item_wh.ratio])
            print(f"ALL VALIDATION {val_start, val_end} RESULTS________________________________________________________________________")
            show_results(item_wh)
            print(item_wh)

            if val_passed >= threshold:  # enough rows within the accepted ratio band

                print("ITEM PASSED!!!")
                if test_df.duplicated().any():  # CHECK DUPLICATES IN TEST DATA
                    print("DUPLICATED DATA IN TEST DATA...REMOVED")
                test_df = test_df.drop_duplicates()

                # ADD VALIDATION DATASET INTO TRAINING
                print(f"TRAINING NEW MODEL WITH ADDED VALIDATION DATA USING {n_trees} TREES AND {round(model.learning_rate_,4)} LEARNING RATE...")
                retrained_model = retrain_model(X=X, y=y, trees=n_trees, learning_rate=model.learning_rate_, sample_weight=sample_weight_train_val)

                # PREDICT ON TEST DATASET
                print("PREDICTING ON TEST DATA...")
                test_X = test_df[X_features].copy()
                test_X['year'] = test_X.year.apply(lambda x: 2020 if int(x) == 2021 else x)
                test_df['y_pred'] = [round(x, 4) for x in retrained_model.predict(test_X)]

                print("PREDICTION RESULTS:")
                pred_results = test_df.groupby('day_dt').agg({'ttl_quantity': 'sum', 'y_pred': 'sum', 'unit_price': 'mean'})
                pred_results['ratio'] = pred_results.ttl_quantity / pred_results.y_pred
                print(pred_results)

                print("EXPORTING RESULT DATA, META DATA, IMG, LOG...")
                export_results(item_code=item,
                               train=train_evaluate,
                               val1=val_evaluate[val_evaluate.day_dt.between(v1_start, v1_end)],
                               val2=val_evaluate[val_evaluate.day_dt.between(v2_start, v2_end)],
                               test=test_df,
                               export_result_meta_data=export_result_meta_data,
                               export_img=export_img)

                print(f"EXPORTED RESULTS TO {img_path} AT {now()}")

            print(f"ITEM TRAINING FINISHED AT {now()}")

        except Exception:
            # Top-level boundary for one item: record the failure in the log.
            # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate and can stop the whole run.
            traceback.print_exc()