def dow_analyze(test_df, model, col, period_from, period_to):
    """Score the model's RMSE separately for each day of the week.

    Filters ``test_df`` to visits in [period_from, period_to], then for each
    day-of-week value 0-6 predicts visitors and computes the RMSE against the
    actuals, printing diagnostics along the way.

    Args:
        test_df: DataFrame with at least 'visit_date' plus the columns in ``col``.
        model: fitted estimator exposing ``predict(X)``.
        col: column subset used for validation; must include 'visitors' and 'dow'.
        period_from, period_to: inclusive visit_date bounds.

    Returns:
        list of 7 per-day RMSE scores (index = dow 0-6).
        (Fix: the original built this list but never returned it.)
    """
    in_period = (test_df['visit_date'] >= period_from) & (test_df["visit_date"] <= period_to)
    validate_data = test_df[in_period]
    validate_data = validate_data[col]
    score_list = []
    for dow_num in range(7):
        print("-" * 40)
        print("dow= ", dow_num)
        dowed_data = validate_data[validate_data["dow"] == dow_num]
        print(dowed_data["visitors"].describe())
        # Features are everything except the target column.
        x_valid = dowed_data.drop('visitors', axis=1)
        y_valid = model.predict(x_valid)
        print(pd.DataFrame(y_valid).describe())
        validation_score = custom_metrics.rmse(dowed_data["visitors"], y_valid)
        score_list.append(validation_score)
    print(score_list)
    return score_list
def split_train_skt(df, col, period_list, model):
    """Train/score the model over several (train, test) date splits.

    Each entry of ``period_list`` is (train_from, train_to, test_from, test_to).
    The model is refit on each train window, scored (RMSE) on the matching
    test window, and a snapshot of the fitted model is kept per split.

    Args:
        df: full input DataFrame (sliced by ``cut_input``).
        col: feature columns used for fitting/prediction.
        period_list: iterable of 4-tuples of date bounds.
        model: estimator with ``fit``/``predict``.

    Returns:
        list with one fitted model per split, in period order.
    """
    import copy  # stdlib; local import keeps this fix self-contained

    models = []
    score = 0.0
    for period in period_list:
        train = cut_input(df, period[0], period[1])
        test = cut_input(df, period[2], period[3])
        model.fit(train[col], train['visitors'])
        y_pred = model.predict(test[col])
        this_score = custom_metrics.rmse(test["visitors"], y_pred)
        print(this_score)
        score = score + this_score
        # BUG FIX: the original appended the same object each iteration, so
        # every list entry aliased the model in its *final* refit state.
        # Snapshot the fitted model so each split's model is preserved.
        models.append(copy.deepcopy(model))
    score = score / len(period_list)
    print("Average score:")
    print(score)
    return models
def validate(test_df, model, col, period_from, period_to, verbose=False, save_model=False, save_name=None):
    """Validate the model's RMSE over one date window.

    Args:
        test_df: DataFrame with 'visit_date', 'visitors' and the columns in ``col``.
        model: fitted estimator exposing ``predict(X)``.
        col: column subset to use; must include 'visitors' (dropped for features).
        period_from, period_to: inclusive visit_date bounds.
        verbose: print actual/predicted distribution summaries.
        save_model: when True, persist predictions via ``save_prediction``.
        save_name: name passed through to ``save_prediction``.

    Returns:
        the RMSE validation score.
        (Fix: the original printed the score but never returned it.)
    """
    in_period = (test_df['visit_date'] >= period_from) & (test_df["visit_date"] <= period_to)
    validate_data = test_df[in_period]
    if verbose:
        print("-" * 40)
        print(period_from, " to ", period_to)
        print(validate_data["visitors"].describe())
    # Features are the requested columns minus the target.
    x_valid = validate_data[col].drop('visitors', axis=1)
    y_valid = model.predict(x_valid)
    if verbose:
        print(pd.DataFrame(y_valid).describe())
    validation_score = custom_metrics.rmse(validate_data["visitors"], y_valid)
    print(validation_score)
    if save_model:
        save_prediction(validate_data, y_valid, save_name)
    return validation_score
def store_analyze(test_df, model, col, period_from, period_to, n_stores=812):
    """Compute a per-store RMSE score over one date window.

    Args:
        test_df: DataFrame with 'visit_date', 'visitors', 'air_store_num'
            and the columns in ``col``.
        model: fitted estimator exposing ``predict(X)``.
        col: column subset to use; must include 'visitors' and 'air_store_num'.
        period_from, period_to: inclusive visit_date bounds.
        n_stores: number of store ids to scan (0..n_stores-1). Generalized
            from the hard-coded dataset size 812; the default preserves the
            original behavior.

    Returns:
        DataFrame with columns ["air_store_num", "score"]; stores with no
        rows in the window are skipped (and reported via print).
    """
    in_period = (test_df['visit_date'] >= period_from) & (test_df["visit_date"] <= period_to)
    # .copy(): we add a "predicted" column below; writing into a filtered
    # slice would trigger pandas' SettingWithCopyWarning.
    validate_data = test_df[in_period][col].copy()
    x_valid = validate_data.drop("visitors", axis=1)
    y_valid = model.predict(x_valid)
    validate_data["predicted"] = y_valid
    ret_list = []
    for air_store_num in range(n_stores):
        single_store = validate_data[validate_data["air_store_num"] == air_store_num]
        if single_store.empty:
            # Store has no visits in this window; note it and move on.
            print("whoops:", air_store_num)
            continue
        validate_score = custom_metrics.rmse(single_store["visitors"], single_store["predicted"])
        ret_list.append([air_store_num, validate_score])
    ret_df = pd.DataFrame(ret_list, columns=["air_store_num", "score"])
    return ret_df
def validate(df, actual_col_name, validate_col_name, period_list=None, verbose=False):
    """Average the RMSE of a prediction column against an actual column
    over several date windows.

    Args:
        df: DataFrame containing 'visit_date' plus both named columns.
        actual_col_name: column holding ground-truth values.
        validate_col_name: column holding predictions to score.
        period_list: iterable of (period_from, period_to) date pairs;
            defaults to ``pocket_periods.get_six_week_period_list()``.
        verbose: print each window's individual score.

    Returns:
        the mean RMSE across all windows.
        (Fix: the original computed and printed the average but never
        returned it.)
    """
    if period_list is None:
        period_list = pocket_periods.get_six_week_period_list()
    score_list = []
    for period_from, period_to in period_list:
        in_period = (df['visit_date'] >= period_from) & (df["visit_date"] <= period_to)
        validate_data = df[in_period]
        validation_score = custom_metrics.rmse(
            validate_data[actual_col_name], validate_data[validate_col_name])
        score_list.append(validation_score)
        if verbose:
            print("score from ", period_from, " to ", period_to, " is:")
            print(validation_score)
    print("average score for", validate_col_name, " is:")
    average_score = sum(score_list) / float(len(score_list))
    print(average_score)
    print("-" * 40)
    return average_score
# NOTE(review): script fragment — starts mid-file. `prefix_w2`,
# `good_filenames`, `w2_normal`, `get_filenames` and `get_joined_model`
# are defined above this chunk (not visible here).
# Load each week-2 prediction variant from its CSV set.
w2_good = get_joined_model(good_filenames)
short_filenames = get_filenames(prefix_w2, "_short.csv")
w2_short = get_joined_model(short_filenames)
out_filenames = get_filenames(prefix_w2, "_out.csv")
w2_out = get_joined_model(out_filenames)
nores_filenames = get_filenames(prefix_w2, "_no_res.csv")
w2_nores = get_joined_model(nores_filenames)
print("Loaded data")
# print(w2_normal["prediction"].count())
# print(w2_good["prediction"].count())
# print(w2_short["prediction"].count())
# print(w2_out["prediction"].count())
# print(w2_nores["prediction"].count())
print("-" * 40)
# RMSE of each variant's prediction against its own log-actuals.
normal_score = custom_metrics.rmse(w2_normal["actual_log"], w2_normal["prediction"])
short_score = custom_metrics.rmse(w2_short["actual_log"], w2_short["prediction"])
out_score = custom_metrics.rmse(w2_out["actual_log"], w2_out["prediction"])
nores_score = custom_metrics.rmse(w2_nores["actual_log"], w2_nores["prediction"])
print("normal=", normal_score)
print("short=", short_score)
print("out=", out_score)
print("nores=", nores_score)
print("-" * 40)
# NOTE(review): this merge call is cut off mid-statement at the chunk
# boundary — its remaining arguments continue beyond this view.
df = pd.merge(w2_normal, w2_short, how="left", on=["air_store_id", "visit_date"],
# NOTE(review): script fragment — starts mid-file. `all_model`, `w2_model`
# and `w2_good` are loaded above this chunk (not visible here).
w2_short = pd.read_csv('../output/p1_w2_0416_0422_short_train.csv')
print("Loaded data")
print(all_model["actual"].count())
print(w2_model["actual"].count())
print(w2_good["actual"].count())
print(w2_short["actual"].count())
print("-" * 40)
# Inner-join the all-data model with the week-2 model on store/date so the
# two prediction columns line up row-for-row.
df = pd.merge(all_model, w2_model, on=["air_store_id", "visit_date"], suffixes=["_all", "_w2"])
# print(df.describe())
all_score = custom_metrics.rmse(df["actual_log_all"], df["prediction_all"])
w2_score = custom_metrics.rmse(df["actual_log_all"], df["prediction_w2"])
print("all=", all_score)
print("w2=", w2_score)
# Fixed-weight blend: 10% all-data model, 90% week-2 model.
df["averaged"] = df["prediction_all"] * 0.1 + df["prediction_w2"] * 0.9
avg_score = custom_metrics.rmse(df["actual_log_all"], df["averaged"])
print("all+w2=", avg_score)
# Left-join the "good" variant; rows it lacks get NaN in "prediction".
df = pd.merge(df, w2_good, on=["air_store_id", "visit_date"], how="left", suffixes=["", "_good"])
# Fall back to the week-2 prediction wherever the "good" variant has no row.
df["with_good"] = np.where(df["prediction"].isnull(), df["prediction_w2"], df["prediction"])