def ensemble(train_x=None, train_y=None, test_x=None, ids=None):
    """Average CV predictions over five CatBoost runs with varied seed/depth.

    Loads the dataset via data_frame.main() when train_x is None.
    Returns (mean train prediction, mean test prediction) in log1p space.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    train_folds, test_folds = [], []
    for run in range(5):
        # Vary both the random seed and tree depth per run for diversity.
        params["random_seed"] = run
        params["depth"] = 5 + run
        tr, te = predict_cv(params, train_x, train_y, test_x, seed=run + 100)
        train_folds.append(tr)
        test_folds.append(te)
        learn_lgb.output_metrics(train_y, tr)
    avg_train = np.mean(train_folds, axis=0)
    avg_test = np.mean(test_folds, axis=0)
    # Report metrics both in log space and after undoing the log1p transform.
    learn_lgb.output_metrics(train_y, avg_train)
    learn_lgb.output_metrics(np.expm1(train_y), np.expm1(avg_train))
    return avg_train, avg_test
def em(complement=True):
    """Run the CatBoost ensemble on null-importance-selected features.

    Forces two known-useful columns into the feature set, augments the
    training data with pseudo labels, then writes test/train predictions
    (back-transformed from log1p) to CSV files tagged with ``complement``.
    """
    train_x, train_y, test_x, ids = data_frame.main(complement=complement)
    features = feature_selection.null_importance(
        train_x, train_y, test_x, ids, create=False)
    # Always keep these two columns even if the selector dropped them.
    for forced in ("categoryId_TE_mean", "ratings_disabled"):
        if forced not in features:
            features.append(forced)
    train_x = train_x[features]
    test_x = test_x[features]
    train_x, train_y, test_x = pseudo.get_pseudo_data_set(
        train_x, train_y, test_x, threshold=0.3)
    pred_train, pred_test = ensemble(train_x, train_y, test_x, ids)
    sub = pd.DataFrame()
    sub["id"] = ids
    sub['y'] = np.expm1(pred_test)
    sub.to_csv(f'./data/output/test_cat_complement_{complement}.csv', index=False)
    sub = pd.DataFrame()
    sub['y'] = np.expm1(pred_train)
    sub.to_csv(f'./data/output/train_cat_complement_{complement}.csv', index=False)
def ensemble_diff_category_TE(train_x=None, train_y=None, test_x=None, ids=None):
    """Ensemble trained on the residual against the categoryId TE mean.

    The target is shifted by ``categoryId_TE_mean`` before training and the
    shift is added back to the averaged predictions before scoring.
    Returns (train prediction, test prediction) on the original target scale.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    # Learn the difference from the per-category target-encoding mean.
    train_y = train_y - train_x.categoryId_TE_mean
    train_folds, test_folds = [], []
    for run in range(5):
        params["random_state"] = run
        tr, te = main(train_x, train_y, test_x, ids, run)
        train_folds.append(tr)
        test_folds.append(te)
    avg_train = np.mean(train_folds, axis=0)
    avg_test = np.mean(test_folds, axis=0)
    # Undo the residual transform on both target and predictions.
    train_y = train_y + train_x.categoryId_TE_mean
    avg_train = avg_train + train_x.categoryId_TE_mean
    avg_test = avg_test + test_x.categoryId_TE_mean
    output_metrics(train_y, avg_train)
    output_metrics(np.expm1(train_y), np.expm1(avg_train))
    return avg_train, avg_test
def ensemble_div_period(train_x=None, train_y=None, test_x=None, ids=None):
    """Ensemble trained on the target divided by log1p(period).

    The scaling is undone after averaging, metrics are printed, and the
    test prediction is written to ./data/output/lgb.csv.
    Returns (train prediction, test prediction) on the original target scale.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    scale_train = train_x.period.apply(np.log1p)
    train_y = train_y / scale_train
    train_folds, test_folds = [], []
    for run in range(5):
        params["random_state"] = run
        # NOTE(review): num_leaves shrinks by one on every run, so the
        # change accumulates across runs (and across calls) — presumably
        # intentional for model diversity; confirm.
        params["num_leaves"] -= 1
        tr, te = main(train_x, train_y, test_x, ids, run)
        train_folds.append(tr)
        test_folds.append(te)
    avg_train = np.mean(train_folds, axis=0)
    avg_test = np.mean(test_folds, axis=0)
    # Undo the period scaling on target and predictions.
    train_y = train_y * train_x.period.apply(np.log1p)
    avg_train = avg_train * train_x.period.apply(np.log1p)
    avg_test = avg_test * test_x.period.apply(np.log1p)
    output_metrics(train_y, avg_train)
    output_metrics(np.expm1(train_y), np.expm1(avg_train))
    sub = pd.DataFrame()
    sub["id"] = ids
    sub['y'] = np.expm1(avg_test)
    sub.to_csv('./data/output/lgb.csv', index=False)
    return avg_train, avg_test
def ensemble(train_x=None, train_y=None, test_x=None, ids=None):
    """Average CV predictions over five LightGBM runs with varied seed/leaves.

    Loads the dataset via data_frame.main() when train_x is None.
    Returns (mean train prediction, mean test prediction) in log1p space.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    train_folds, test_folds = [], []
    for run in range(5):
        params["random_state"] = run
        # num_leaves grows by one per run (cumulative) for model diversity.
        params["num_leaves"] += 1
        tr, te = main(train_x, train_y, test_x, ids, run)
        train_folds.append(tr)
        test_folds.append(te)
    avg_train = np.mean(train_folds, axis=0)
    avg_test = np.mean(test_folds, axis=0)
    # Report metrics both in log space and after undoing the log1p transform.
    output_metrics(train_y, avg_train)
    output_metrics(np.expm1(train_y), np.expm1(avg_train))
    return avg_train, avg_test
def em(complement=True):
    """Blend three LightGBM ensemble variants and write submission files.

    Selects features, forces two known-useful columns, augments with pseudo
    labels, then averages the plain ensemble, the category-TE-residual
    ensemble and the period-normalized ensemble. Writes test/train
    predictions (back-transformed from log1p) to CSVs tagged with
    ``complement``.

    Returns (blended train pred, blended test pred,
             list of per-variant train preds, list of per-variant test preds).
    """
    train_x, train_y, test_x, ids = data_frame.main(complement=complement)
    features = feature_selection.main(train_x, train_y, test_x, ids)
    # Always keep these two columns even if the selector dropped them.
    for forced in ("categoryId_TE_mean", "ratings_disabled"):
        if forced not in features:
            features.append(forced)
    train_x = train_x[features]
    test_x = test_x[features]
    train_x, train_y, test_x = pseudo.get_pseudo_data_set(
        train_x, train_y, test_x, threshold=0.3)
    preds_train, preds_test = [], []
    # Blend: plain, category-TE residual, period-normalized — in that order.
    for strategy in (ensemble, ensemble_diff_category_TE, ensemble_div_period):
        tr, te = strategy(train_x, train_y, test_x, ids)
        preds_train.append(tr)
        preds_test.append(te)
    pred_train = np.mean(preds_train, axis=0)
    pred_test = np.mean(preds_test, axis=0)
    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))
    sub = pd.DataFrame()
    sub["id"] = ids
    sub['y'] = np.expm1(pred_test)
    sub.to_csv(f'./data/output/test_lgb_complement_{complement}.csv', index=False)
    sub = pd.DataFrame()
    sub['y'] = np.expm1(pred_train)
    sub.to_csv(f'./data/output/train_lgb_complement_{complement}.csv', index=False)
    return pred_train, pred_test, preds_train, preds_test
def select_k_best(train_x=None, train_y=None, test_x=None, ids=None, seed=22, k=300):
    """Reduce both splits to the k best features by univariate F-regression.

    Parameters
    ----------
    train_x, train_y, test_x, ids : optional
        Dataset pieces; loaded via data_frame.main() when train_x is None.
    seed : int
        Unused; kept for signature compatibility with sibling selectors.
    k : int
        Number of features to keep.

    Returns
    -------
    tuple
        (train_x, test_x) transformed to the k selected columns
        (column labels are lost — SelectKBest returns arrays).
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    # Bug fix: k was hard-coded to 5, silently ignoring the `k` parameter.
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(train_x, train_y)
    # (removed unused `mask = selector.get_support()` local)
    return selector.transform(train_x), selector.transform(test_x)
def ensemble():
    """Average five seeded CV runs and write ./data/output/ridge.csv.

    Metrics are printed in log space and on the expm1-restored scale;
    the test prediction is back-transformed before writing.
    """
    train_x, train_y, test_x, ids = data_frame.main()
    train_folds, test_folds = [], []
    for run in range(5):
        tr, te = main(train_x, train_y, test_x, ids, run)
        train_folds.append(tr)
        test_folds.append(te)
    avg_train = np.mean(train_folds, axis=0)
    avg_test = np.mean(test_folds, axis=0)
    learn_lgb.output_metrics(train_y, avg_train)
    learn_lgb.output_metrics(np.expm1(train_y), np.expm1(avg_train))
    sub = pd.DataFrame()
    sub["id"] = ids
    sub['y'] = np.expm1(avg_test)
    sub.to_csv('./data/output/ridge.csv', index=False)
def em2():
    """Average em(complement=True) and em(complement=False) predictions.

    Writes the blended test/train predictions (back-transformed from
    log1p) to ./data/output/test_lgb.csv and ./data/output/train_lgb.csv.
    """
    train_x, train_y, test_x, ids = data_frame.main(complement=True)
    pred_train_a, pred_test_a, _, _ = em(complement=True)
    pred_train_b, pred_test_b, _, _ = em(complement=False)
    # Blend the two complement settings.
    pred_train = np.mean([pred_train_a, pred_train_b], axis=0)
    pred_test = np.mean([pred_test_a, pred_test_b], axis=0)
    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))
    sub = pd.DataFrame()
    sub["id"] = ids
    sub['y'] = np.expm1(pred_test)
    sub.to_csv('./data/output/test_lgb.csv', index=False)
    sub = pd.DataFrame()
    sub['y'] = np.expm1(pred_train)
    sub.to_csv('./data/output/train_lgb.csv', index=False)
def main(train_x=None, train_y=None, test_x=None, ids=None, seed=22):
    """Select features via SelectFromModel over an LGBM regressor.

    test_x, ids and seed are unused here; they are accepted for signature
    compatibility with sibling entry points. Returns the list of selected
    column names (threshold: 1.25 * median importance).
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    estimator = LGBMRegressor(n_estimators=500,
                              learning_rate=0.1,
                              num_leaves=86,
                              subsample_freq=1,
                              subsample=0.9,
                              feature_fraction=0.2,
                              bagging_seed=11,
                              metrics="rmse",
                              reg_alpha=0.1,
                              reg_lambda=0.1,
                              random_state=0)
    selector = SelectFromModel(estimator, threshold='1.25*median')
    selector.fit(train_x, train_y)
    keep_mask = selector.get_support()
    selected = train_x.loc[:, keep_mask].columns.tolist()
    print(str(len(selected)), 'selected features')
    return selected
def main(train_x=None, train_y=None, test_x=None, ids=None, seed=22):
    """Drop fixed and NaN-containing columns, then run CV prediction.

    Returns (train prediction, test prediction) from predict_cv; metrics
    are printed on the train prediction.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    # Columns containing NaN in either split get dropped alongside the
    # fixed date/category columns.
    nan_cols = set(
        test_x.keys()[test_x.isna().any()].to_list()
        + train_x.keys()[train_x.isna().any()].to_list()
    )
    drop_cols = ["publishedAt", "categoryId", "collection_date"] + list(nan_cols)
    train_x = train_x.drop(drop_cols, axis=1)
    test_x = test_x.drop(drop_cols, axis=1)
    pred_train, pred_test = predict_cv(train_x, train_y, test_x, seed=seed)
    learn_lgb.output_metrics(train_y, pred_train)
    return pred_train, pred_test
def main(train_x=None, train_y=None, test_x=None, ids=None, seed=22):
    """Run cross-validated prediction with the module-level params.

    Loads the dataset via data_frame.main() when train_x is None.
    ids is unused; kept for signature compatibility.
    Returns (train prediction, test prediction).
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    return predict_cv(params, train_x, train_y, test_x, seed)
def null_importance(train_x=None, train_y=None, test_x=None, ids=None, seed=22, create=False):
    """Select features whose real importance beats a shuffled-target baseline.

    With create=False the cached scores in ./data/null_importance.csv are
    reused; otherwise the null-importance distribution is rebuilt from
    scratch (100 shuffled runs) and the cache is rewritten.
    Returns the list of selected feature names.
    """
    # Keep a feature when its actual importance exceeds the null importance
    # in at least this percentage of shuffled runs.
    THRESHOLD = 40
    if not create:
        # Fast path: reuse the previously computed scores.
        print(f"Create {create}")
        actual_importance = pd.read_csv("./data/null_importance.csv")
        imp_features = [
            feature
            for feature, score in zip(actual_importance["feature"],
                                      actual_importance["score"])
            if score >= THRESHOLD
        ]
        print(str(len(imp_features)), 'selected features')
        return imp_features
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()
    # Importances learned against the real target.
    actual_importance = get_feature_importances(train_x, train_y, shuffle=False)
    # Importance distribution under a shuffled (null) target.
    N_RUNS = 100
    null_importance = pd.DataFrame()
    for run in range(N_RUNS):
        run_df = get_feature_importances(train_x, train_y, shuffle=True, seed=run)
        run_df["run"] = run + 1
        null_importance = pd.concat([null_importance, run_df])
    scores = []
    imp_features = []
    for feature in actual_importance["feature"]:
        real_imp = actual_importance.query(
            f"feature=='{feature}'")["importance"].values
        null_imp = null_importance.query(
            f"feature=='{feature}'")["importance"].values
        # Percentage of null runs that the real importance beats.
        pct = (null_imp < real_imp).sum() / null_imp.size * 100
        scores.append(pct)
        if pct >= THRESHOLD:
            imp_features.append(feature)
    actual_importance["score"] = scores
    actual_importance.to_csv("./data/null_importance.csv", index=False)
    print(str(len(imp_features)), 'selected features')
    return imp_features