import contextlib
import traceback

import numpy as np
import pandas as pd

# Assumed import: the code below calls `catpool` as its pool constructor,
# which presumably aliases catboost.Pool.
from catboost import CatBoostRegressor, Pool as catpool

# import matplotlib.pyplot as plt  # only needed for the commented-out weight plots

# Project helpers (load_data, clean_data, impute_data, select_features, label_encoder,
# get_prom_schedule, agg_results, show_results, export_results, now) and module-level
# config (log_path, data_path, img_path, item_list, prom_info, lunar, date boundaries,
# export flags) are defined elsewhere in the project.


# Variant with a configurable early-stopping budget (`additional_trees`);
# shadowed by the later definitions of the same name.
def train_model(X_train, X_test, y_train, y_test, sample_weight, additional_trees):
    # DEFINE TRAIN POOL AND VAL POOL
    cat_features = ['year', 'month', 'store_type', 'activity_desc']
    train_pool = catpool(data=X_train, label=y_train, cat_features=cat_features, weight=sample_weight)
    val_pool = catpool(data=X_test, label=y_test, cat_features=cat_features)

    # DEFINE CATBOOST MODEL
    model = CatBoostRegressor(
        iterations=1000,
        # learning_rate=0.1,
        loss_function='RMSE',
        one_hot_max_size=50,
        # loss_function='Tweedie:variance_power=1.9',
        eval_metric='RMSE',
        verbose=100)

    # TRAIN MODEL
    model.fit(X=train_pool, eval_set=val_pool, use_best_model=False,
              early_stopping_rounds=additional_trees)
    return model
# Minimal variant: no categorical features, no sample weights (kept for reference;
# shadowed by the definition below).
def train_model(X_train, X_test, y_train, y_test):
    train_pool = catpool(data=X_train, label=y_train)
    val_pool = catpool(data=X_test, label=y_test)

    # CatBoost model
    model = CatBoostRegressor(
        iterations=500,
        # learning_rate=0.1,
        loss_function='RMSE',
        eval_metric='RMSE',
        verbose=False
    )
    model.fit(
        X=train_pool,
        eval_set=val_pool,
        early_stopping_rounds=20
    )
    return model
# Active definition of train_model: main() calls this signature (with sample_weight),
# so it is kept as the last definition of the name.
def train_model(X_train, X_test, y_train, y_test, sample_weight):
    cat_features = ['year', 'month', 'store_type', 'activity_desc']
    train_pool = catpool(data=X_train, label=y_train, cat_features=cat_features, weight=sample_weight)
    val_pool = catpool(data=X_test, label=y_test, cat_features=cat_features)

    # CatBoost model
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        loss_function='RMSE',
        one_hot_max_size=50,
        # loss_function='Tweedie:variance_power=1.9',
        eval_metric='RMSE',
        verbose=200)
    model.fit(X=train_pool, eval_set=val_pool, use_best_model=False, early_stopping_rounds=50)
    return model
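
# --- Illustrative sketch, not part of the pipeline ---
# The `sample_weight` argument consumed above is built in main() below as a
# quartic ramp over the training dates, so recent days dominate the loss.
# A self-contained version of that weighting; the helper name and the dates
# are made up for illustration.
def _demo_recency_weights():
    date_range = pd.date_range("2020-01-01", "2020-12-31")
    weight_dict = dict(zip(date_range, np.linspace(0, 1, len(date_range)) ** 4))
    day_dt = pd.Series(pd.date_range("2020-06-01", periods=5))
    # mid-year days get tiny weights; the full ramp reaches 1.0 on the final day
    return [weight_dict[d] for d in day_dt]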
def retrain_model(X, y, trees, learning_rate, sample_weight):
    # DEFINE TRAIN POOL
    cat_features = ['year', 'month', 'store_type', 'activity_desc']
    train_pool = catpool(data=X, label=y, cat_features=cat_features, weight=sample_weight)

    # DEFINE CATBOOST MODEL  #* PARAMETERS DEPEND ON PRIOR MODEL
    model = CatBoostRegressor(
        iterations=trees,
        learning_rate=learning_rate,
        loss_function='RMSE',
        one_hot_max_size=50,
        # loss_function='Tweedie:variance_power=1.5',
        eval_metric='RMSE',
        verbose=200
    )

    # FIT CATBOOST MODEL
    model.fit(X=train_pool)
    return model
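
# --- Hedged usage sketch (synthetic data, hypothetical names) ---
# How the two functions above are chained in main(): fit with early stopping
# against a validation pool, then refit on all data with best_iteration_ + 50
# trees at the learning rate CatBoost actually used. Everything below
# (_rng, the demo frame, column values) is made up for illustration.
def _demo_train_then_retrain():
    _rng = np.random.default_rng(0)
    n = 300
    X_demo = pd.DataFrame({
        'year': _rng.choice([2019, 2020], n),
        'month': _rng.integers(1, 13, n),
        'store_type': _rng.choice(['A', 'B'], n),
        'activity_desc': _rng.choice(['none', 'promo'], n),
        'unit_price': _rng.uniform(1.0, 10.0, n),
    })
    y_demo = 3 * X_demo['unit_price'] + _rng.normal(0, 1, n)
    w_train = np.linspace(0, 1, 200) ** 4   # quartic recency ramp, as in main()
    w_all = np.linspace(0, 1, n) ** 4

    model = train_model(X_demo[:200], X_demo[200:], y_demo[:200], y_demo[200:],
                        sample_weight=w_train)
    final = retrain_model(X=X_demo, y=y_demo,
                          trees=model.best_iteration_ + 50,
                          learning_rate=model.learning_rate_,
                          sample_weight=w_all)
    return final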
def main(item, threshold):
    # REDIRECT STDOUT AND STDERR TO LOG FILE
    with open(f"{log_path}{item}.log", "w") as train_log, \
            contextlib.redirect_stdout(train_log), \
            contextlib.redirect_stderr(train_log):
        try:
            print(f"ITEM {item} STARTING AT {now()}, PROCESS: {item_list.index(item)+1} OUT OF {len(item_list)}")
            print(f"Loading data from {data_path}")
            raw_train_val_df, raw_test_df = load_data(f"{data_path}{item}.csv.gz",
                                                      train_start=train_val_start, val_end=train_val_end,
                                                      test_start=test_start, test_end=test_end)

            # PRINT TIME RANGE
            print(f"train_val_df: {raw_train_val_df.day_dt.min()} to {raw_train_val_df.day_dt.max()}")
            print(f"test_df: {raw_test_df.day_dt.min()} to {raw_test_df.day_dt.max()}")

            # IF EMPTY, CONTINUE WITH NEXT ITEM
            if len(raw_train_val_df[raw_train_val_df.day_dt < v1_start]) == 0:
                print("EMPTY TRAIN AND VAL DATA >>>")
                return None
            if len(raw_test_df) == 0:
                print("EMPTY TEST DATA >>>")
                return None

            # PRINT OFFER DESC FROM TRAIN, VAL AND TEST
            try:
                offer_cols = ['item_desc', 'offer_start_date', 'offer_end_date', 'promotion_merch',
                              'buyer_note', 'retail_price', 'prom_price', 'unit_price']
                print("\nOFFERS IN TRAIN AND VAL DATA:")
                for offer in raw_train_val_df.offer_code.dropna().unique():
                    tmp = prom_info[(prom_info.offer_code == int(offer)) & (prom_info.item == int(item))][offer_cols]
                    print(tmp.to_markdown(showindex=False))
                    print(prom_info[prom_info.offer_code == int(offer)].item_desc)
                print("\nOFFERS IN TEST DATA:")
                for offer in raw_test_df.offer_code.dropna().unique():
                    tmp = prom_info[(prom_info.offer_code == int(offer)) & (prom_info.item == int(item))][offer_cols]
                    print(tmp.to_markdown(showindex=False))
                    print(prom_info[prom_info.offer_code == int(offer)].item_desc)
            except Exception as e:
                print(f"Error when printing offer desc: {e}")

            # MAKE NEW COPIES OF DATAFRAMES
            train_val_df, test_df = raw_train_val_df.copy(), raw_test_df.copy()
            del raw_train_val_df, raw_test_df

            # ADD NUM ITEMS TO TRAIN AND TEST
            num_items_dict = (prom_info.groupby(['offer_code']).item.count()
                              .to_frame().reset_index()
                              .rename({'item': 'num_items_in_offer'}, axis=1))
            train_val_df.offer_code.fillna('0', inplace=True)
            train_val_df.offer_code = train_val_df.offer_code.astype(int)
            train_val_df = train_val_df.merge(num_items_dict, how='left', on='offer_code')
            train_val_df.num_items_in_offer.fillna(100, inplace=True)
            test_df.offer_code.fillna('0', inplace=True)
            test_df.offer_code = test_df.offer_code.astype(int)
            test_df = test_df.merge(num_items_dict, how='left', on='offer_code')
            test_df.num_items_in_offer.fillna(100, inplace=True)

            # ADD LUNAR NEW YEAR FEATURE
            train_val_df = train_val_df.merge(lunar, how='left', on='day_dt')
            test_df = test_df.merge(lunar, how='left', on='day_dt')

            # ADD PROMOTION SCHEDULE FEATURE
            prom_schedule = get_prom_schedule(item=item, prom_info=prom_info)  #* GET PROMOTION SCHEDULE FOR CURRENT ITEM
            train_val_df = train_val_df.merge(prom_schedule, how='left', on='day_dt')
            test_df = test_df.merge(prom_schedule, how='left', on='day_dt')
            train_val_df.day_of_prom.fillna(-1, inplace=True)
            test_df.day_of_prom.fillna(-1, inplace=True)

            # DEFINE FEATURES TO SELECT
            index_features = ['day_dt', 'store_id', 'item_code', 'loc_wh']
            X_features = ['year', 'month', 'day_of_prom', 'weekday', 'day_type', 'is_5th', 'store_type',
                          'activity_desc', 'prom0', 'prom1', 'prom2', 'prom3', 'prom4', 'prom5',
                          'prom6', 'prom7', 'is_vip', 'free_gift', 'required_num', 'unit_price',
                          'p_rate', 'dis_spring', 'num_items_in_offer', 'loc_selling_area', 'pds_grace']
            y_feature = ['ttl_quantity']
print(f"\nSELECTED FEATURES: {X_features}") print(f"Cleaning train data...") train_val_df = clean_data(train_val_df) train_val_df = impute_data(train_val_df) train_val_df = select_features(index_features+X_features+y_feature, train_val_df) train_val_df = label_encoder(train_val_df,X_features) print(f"Cleaning test data...") test_df = clean_data(test_df) test_df = impute_data(test_df) test_df = select_features(index_features+X_features+y_feature, test_df) test_df = label_encoder(test_df,X_features) #* SORT train_val_df FOR WEIGHT ASSIGNMENT train_val_df = train_val_df.sort_values('day_dt') # SPLIT TRAIN DATA INTO TRAIN AND VALIDATION train_start, train_end, val_start, val_end = train_val_start, v1_start, v1_start, v2_end train_index = train_val_df[train_val_df.day_dt.between(train_start, train_end)].index val_index = train_val_df[train_val_df.day_dt.between(val_start, val_end)].index #* ASSIGN SAMPLE WEIGHT BY DAY_DT BEFORE VALIDATION SET print(f"ASSIGNING SAMPLE WEIGHT FOR TRAIN DATA FROM {train_start} TO {train_end}") date_range = pd.date_range(train_start, train_end) weight_dict = dict(zip(date_range, np.linspace(0,1,len(date_range)) ** 4)) sample_weight_train = [weight_dict[day_dt] for day_dt in train_val_df.loc[train_index].day_dt] # plt.figure(figsize=(12,8)) # plt.plot(train_val_df.loc[train_index].day_dt, sample_weight_train) # plt.savefig("./test/sample_weight_train.png") #* ASSIGN SAMPLE WEIGHT BY DAY_DT BEFORE TEST SET print(f"ASSIGNING SAMPLE WEIGHT FOR TRAIN AND VAL FROM {train_start} TO {val_end}") date_range = pd.date_range(train_start, val_end) weight_dict = dict(zip(date_range, np.linspace(0,1,len(date_range)) ** 4)) sample_weight_train_val = [weight_dict[day_dt] for day_dt in train_val_df.day_dt] # plt.figure(figsize=(12,8)) # plt.plot(train_val_df.day_dt, sample_weight_train_val) # plt.savefig("./test/sample_weight_train_val.png") # Define X and y X = train_val_df[X_features] X['year'] = X['year'].apply(lambda x: 2020 if int(x) == 2021 else x) y = train_val_df[y_feature[0]] X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[val_index], y.loc[train_index], y.loc[val_index] print(f"Training model...") model = train_model(X_train, X_test, y_train, y_test, sample_weight=sample_weight_train) # print(model.get_feature_importance(type='ShapValues', data=catpool(X_train, y_train, cat_features=['year','month','store_type','activity_desc']))) print("FEATURE IMPORTANCE - PREDICTION VALUES CHANGE:") print(pd.Series(index=X_train.columns, data=model.get_feature_importance(type='PredictionValuesChange')).sort_values(ascending=False)) print("FEATURE IMPORTANCE - LOSS FUNCTION CHANGE:") print(pd.Series(index=X_train.columns, data=model.get_feature_importance(type='LossFunctionChange', data=catpool(X_train, y_train, cat_features=['year','month','store_type','activity_desc']))).sort_values(ascending=False)) print(f"Evaluating model with {model.best_iteration_+50} trees and {round(model.learning_rate_,4)} learning rate...") y_pred_val = model.predict(X_test) y_pred_train = model.predict(X_train) # GET EVALUATE RESULTS train_evaluate, val_evaluate = train_val_df.loc[train_index], train_val_df.loc[val_index] train_evaluate['y_pred'], val_evaluate['y_pred'] = y_pred_train, y_pred_val # VALIDATION RESULTS item_wh = agg_results(df=val_evaluate, v1_start=v1_start, v1_end=v1_end, v2_start=v2_start, v2_end=v2_end) val_passed = np.mean([1 if r>=0.65 and r<=1.2 else 0 for r in item_wh.ratio]) print(f"ALL VALIDATION {val_start, val_end} 
            show_results(item_wh)
            print(item_wh)

            if val_passed >= threshold:  #* IF 75% OF SELL-THRU RATIOS ARE BETWEEN 65% AND 120%
                print("ITEM PASSED!!!")

                if test_df.duplicated().any():  # CHECK DUPLICATES IN TEST DATA
                    print("DUPLICATED DATA IN TEST DATA...REMOVED")
                    test_df = test_df.drop_duplicates()

                # ADD VALIDATION DATASET INTO TRAINING
                print(f"TRAINING NEW MODEL WITH ADDED VALIDATION DATA USING {model.best_iteration_+50} TREES "
                      f"AND {round(model.learning_rate_,4)} LEARNING RATE...")
                retrained_model = retrain_model(X=X, y=y, trees=model.best_iteration_+50,
                                                learning_rate=model.learning_rate_,
                                                sample_weight=sample_weight_train_val)

                # PREDICT ON TEST DATASET
                print("PREDICTING ON TEST DATA...")
                test_X = test_df[X_features].copy()  # copy to avoid SettingWithCopyWarning
                test_X['year'] = test_X.year.apply(lambda x: 2020 if int(x) == 2021 else x)
                test_df['y_pred'] = [round(x, 4) for x in retrained_model.predict(test_X)]

                print("PREDICTION RESULTS:")
                pred_results = test_df.groupby('day_dt').agg({'ttl_quantity': 'sum', 'y_pred': 'sum',
                                                              'unit_price': 'mean'})
                pred_results['ratio'] = pred_results.ttl_quantity / pred_results.y_pred
                print(pred_results)

                print("EXPORTING RESULT DATA, META DATA, IMG, LOG...")
                export_results(item_code=item, train=train_evaluate,
                               val1=val_evaluate[val_evaluate.day_dt.between(v1_start, v1_end)],
                               val2=val_evaluate[val_evaluate.day_dt.between(v2_start, v2_end)],
                               test=test_df,
                               export_result_meta_data=export_result_meta_data, export_img=export_img)
                print(f"EXPORTED RESULTS TO {img_path} AT {now()}")

            print(f"ITEM TRAINING FINISHED AT {now()}")
        except Exception:
            traceback.print_exc()
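
# --- Hypothetical driver (not in the original source) ---
# main() expects the module-level globals noted at the top of this file
# (log_path, data_path, item_list, prom_info, lunar, date boundaries, export
# flags) to be defined before it runs. A minimal sequential driver might look
# like this; the 0.75 threshold mirrors the "75% sell thru" comment above.
if __name__ == "__main__":
    for item in item_list:
        main(item, threshold=0.75)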