def run(debug, model_dir, kaggle=False):
    """Streaming inference loop for the Riiid competition (tags/part feature set).

    Fits the feature factories on the historical train split, then walks the
    competition test iterator: each batch's revealed labels from the previous
    batch are folded back into the feature state before predicting with the
    averaged LightGBM ensemble.

    Args:
        debug: if True, truncate each train split to 1000 rows, update the
            feature state after every batch, and dump state CSVs per iteration.
        model_dir: directory containing ``*model*.pickle`` LightGBM models.
        kaggle: switch between the Kaggle and the local input paths.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    logger = get_logger()

    # environment
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })

    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing
    # (fix: removed a redundant second get_logger() call and the unused
    #  `is_onebyone` flag)
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    for column in ["content_id", "user_id", "content_type_id",
                   "prior_question_had_explanation",
                   "tags1", "tags2", "tags3", "tags4", "tags5", "tags6",
                   ("user_id", "content_type_id"),
                   ("user_id", "prior_question_had_explanation")]:
        # only the content_id target encoder is refreshed incrementally
        is_partial_fit = column == "content_id"
        if isinstance(column, str):
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column),
                "TargetEncoder": TargetEncoder(column=column,
                                               is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column)),
                "TargetEncoder": TargetEncoder(column=list(column),
                                               is_partial_fit=is_partial_fit)
            }
    # count-only features (no target encoding)
    for column in ["part",
                   ("user_id", "tag"),
                   ("user_id", "part"),
                   ("content_type_id", "part"),
                   ("user_id", "content_id")]:
        if isinstance(column, str):
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column))
            }
    feature_factory_dict["user_id"]["MeanAggregatorTimestamp"] = \
        MeanAggregator(column="user_id", agg_column="timestamp",
                       remove_now=False)
    feature_factory_dict["user_id"]["MeanAggregatorPriorQuestionElapsedTime"] = \
        MeanAggregator(column="user_id",
                       agg_column="prior_question_elapsed_time",
                       remove_now=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoder"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    feature_factory_dict["content_id"]["MeanAggregatorPriorQuestionElapsedTime"] = \
        MeanAggregator(column="content_id",
                       agg_column="prior_question_elapsed_time",
                       remove_now=True)
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        # lectures carry answered_correctly == -1; treat them as "no label"
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        # attach question metadata to questions and lecture metadata to lectures
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0], df_question, how="left",
                     left_on="content_id", right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1], df_lecture, how="left",
                     left_on="content_id", right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    # (fix: removed the unused df_test_prev1 placeholder)
    answered_correctlies = []
    user_answers = []
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        # (fix: closed the "[time: ..." bracket in the log message)
        logger.info(
            f"[time: {int(time.time() - t)}] iteration {i}: "
            f"data_length: {len(df_test)}")

        # fold the previous batch's revealed labels into the buffered rows
        # (skipped on the very first batch, when no previous rows exist)
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # the prior-group fields arrive as stringified lists, e.g. "[1, 0]"
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x) for x in user_answer.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            # flush the buffer into the feature state once it is big enough
            update_record = 1 if debug else 150
            if len(df_test_prev) > update_record:
                df_test_prev["answered_correctly"] = answered_correctlies
                df_test_prev["user_answer"] = user_answers
                # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
                df_test_prev = df_test_prev[
                    df_test_prev["answered_correctly"] != -1]
                # NOTE: after the filter above this replace is a no-op;
                # kept for safety.
                df_test_prev["answered_correctly"] = df_test_prev[
                    "answered_correctly"].replace(-1, np.nan)
                df_test_prev["prior_question_had_explanation"] = df_test_prev[
                    "prior_question_had_explanation"].fillna(-1).astype("int8")
                feature_factory_manager.fit(df_test_prev)
                df_test_prev = pd.DataFrame()
                answered_correctlies = []
                user_answers = []

        # fetch & score the current batch
        logger.info("merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0], df_question,
                         how="left", left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1], df_lecture,
                         how="left", left_on="content_id",
                         right_on="lecture_id")
        df_test = pd.concat([w_df1, w_df2])
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info("other... ")

        # predict: average the ensemble's probabilities
        cols = models[0].feature_name()
        predicts = [model.predict(df[cols]) for model in models]
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # (fix: DataFrame.append was removed in pandas 2.0 — use pd.concat)
        df_test_prev = pd.concat([df_test_prev,
                                  df[cols + ["user_id", "tags"]]])
        if debug:
            df_test_prev.to_csv(f"{i}.csv")
def run(debug, model_dir, kaggle=False):
    """Streaming inference loop (basic count/target-encoder feature set).

    Simpler variant: no question/lecture metadata merge. Fits the feature
    factories on the train split, then iterates the competition environment,
    re-fitting on each batch's revealed labels before predicting with the
    averaged LightGBM ensemble.

    Args:
        debug: if True, truncate each train split to 1000 rows.
        model_dir: directory containing ``*model*.pickle`` LightGBM models.
        kaggle: switch between the Kaggle and the local input paths.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"
    logger = get_logger()

    # environment
    env = riiideducation.make_env()

    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing
    # (fix: removed a redundant second get_logger() call)
    feature_factory_dict = {}
    for column in ["user_id", "content_id", "content_type_id",
                   "prior_question_had_explanation"]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=column),
            "TargetEncoder": TargetEncoder(column=column)
        }
    feature_factory_dict["user_id"]["MeanAggregatorTimestamp"] = \
        MeanAggregator(column="user_id", agg_column="timestamp",
                       remove_now=False)
    feature_factory_dict["user_id"]["MeanAggregatorPriorQuestionElapsedTime"] = \
        MeanAggregator(column="user_id",
                       agg_column="prior_question_elapsed_time",
                       remove_now=True)
    feature_factory_dict["content_id"]["MeanAggregatorPriorQuestionElapsedTime"] = \
        MeanAggregator(column="content_id",
                       agg_column="prior_question_elapsed_time",
                       remove_now=True)
    for column in [("user_id", "content_type_id"),
                   ("user_id", "prior_question_had_explanation")]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=list(column)),
            "TargetEncoder": TargetEncoder(column=list(column))
        }
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        if debug:
            df = df.head(1000)
        feature_factory_manager.fit(df)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        # (fix: closed the "[time: ..." bracket in the log message)
        logger.info(
            f"[time: {int(time.time() - t)}] iteration {i}: "
            f"data_length: {len(df_test)}")

        # fold the previous batch's revealed labels back into the feature state
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # the prior-group fields arrive as stringified lists, e.g. "[1, 0]"
            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x) for x in user_answer.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            feature_factory_manager.fit(df_test_prev)

        # fetch & score the current batch
        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        logger.info("other... ")

        # any model feature the encoders never produced gets a sentinel value
        cols = models[0].feature_name()
        for col in cols:
            if col not in df.columns:
                df[col] = -99999

        # predict: average the ensemble's probabilities
        # (fix: removed a duplicated `cols = models[0].feature_name()` call)
        predicts = [model.predict(df[cols]) for model in models]
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # .copy() so next iteration's label assignment works on an owned frame
        # instead of a view of `df` (chained-assignment hazard)
        df_test_prev = df[cols + ["user_id"]].copy()
        # NOTE(review): unlike the other run() variant this dump is not gated
        # on `debug` — confirm whether writing a CSV every batch is intended.
        df_test_prev.to_csv(f"{i}.csv")
def train_lgbm_cv_newuser_with_iteration(
        df: pd.DataFrame,
        feature_factory_manager: FeatureFactoryManager,
        params: dict,
        output_dir: str,
        model_id: int,
        exp_name: str,
        drop_user_id: bool,
        categorical_feature: list = None,
        experiment_id: int = 0,
        is_debug: bool = False):
    """Train an LGBM model with a new-user holdout and an iterative valid set.

    Splits users so ~10% are entirely held out ("new users") and the rest
    contribute their last 10% of rows to validation. Builds two validation
    frames: one from a single ``all_predict`` pass and one that replays the
    test-time ``partial_predict``/``fit`` streaming in chunks of 100 rows, then
    trains on both as valid sets. Artifacts (model pickle, importance CSV, OOF
    CSV) are written to ``output_dir``; metrics go to mlflow unless
    ``is_debug``.

    Args:
        df: raw training frame (indexed by row id, one row per interaction).
        feature_factory_manager: fitted-feature pipeline; mutated by the
            streaming replay below.
        params: LightGBM training parameters.
        output_dir: destination directory for artifacts.
        model_id: suffix used in artifact file names.
        exp_name: mlflow run name.
        drop_user_id: exclude ``user_id`` from the feature list if True.
        categorical_feature: column names passed to ``lgb.train``
            (default: none).
        experiment_id: mlflow experiment id.
        is_debug: skip all mlflow logging if True.
    """
    # (fix: mutable default argument `[]` replaced with a None sentinel)
    if categorical_feature is None:
        categorical_feature = []

    if not is_debug:
        mlflow.start_run(experiment_id=experiment_id, run_name=exp_name)
        mlflow.log_param("model_id", model_id)
        mlflow.log_param("count_row", len(df))
        mlflow.log_param("count_column", len(df.columns))
        for key, value in params.items():
            mlflow.log_param(key, value)

    # columns that would leak the target (or are unusable raw) are excluded
    leak_columns = ["answered_correctly", "user_answer", "tags", "type_of",
                    "bundle_id", "previous_5_ans"]
    if drop_user_id:
        features = [x for x in df.columns
                    if x not in leak_columns + ["user_id"]]
    else:
        features = [x for x in df.columns if x not in leak_columns]

    df_imp = pd.DataFrame()
    df_imp["feature"] = features

    df1 = feature_factory_manager.all_predict(df.copy())

    # user split: ~10% become all-validation "new users"; the rest put their
    # chronologically last 10% of rows into validation
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())
    val_idx = val_idx[:1000000]

    df1 = df1.drop(
        ["user_answer", "tags", "type_of", "bundle_id", "previous_5_ans"],
        axis=1)
    # LightGBM rejects some punctuation in column names
    df1.columns = [
        x.replace("[", "_").replace("]", "_").replace("'", "_").replace(
            " ", "_").replace(",", "_") for x in df1.columns
    ]
    df_train = df1.loc[train_idx]
    df_train = df_train[df_train["answered_correctly"].notnull()]
    df_val = df1.loc[val_idx]
    df_val = df_val[df_val["answered_correctly"].notnull()]

    # valid2: replay the test-time fit/partial_predict streaming, 100 rows at
    # a time, to validate under the same conditions as inference
    feature_factory_manager.fit(df.loc[train_idx])
    df2 = []
    for i in tqdm.tqdm(range(len(val_idx) // 100)):
        w_df = df.loc[val_idx[i * 100:(i + 1) * 100]]
        df2.append(feature_factory_manager.partial_predict(w_df))
        feature_factory_manager.fit(w_df)
    df2 = pd.concat(df2)
    df2.columns = [
        x.replace("[", "_").replace("]", "_").replace("'", "_").replace(
            " ", "_").replace(",", "_") for x in df2.columns
    ]
    df2_val = df2[df2["answered_correctly"].notnull()]
    print(df_val)
    print(df2_val)
    # sanity check: both validation builds must cover the same labeled rows
    assert len(df_val) == len(df2_val)

    print(f"make_train_data len={len(train_idx)}")
    train_data = lgb.Dataset(df_train[features],
                             label=df_train["answered_correctly"])
    print(f"make_test_data len={len(val_idx)}")
    valid_data1 = lgb.Dataset(df_val[features],
                              label=df_val["answered_correctly"])
    valid_data2 = lgb.Dataset(df2_val[features],
                              label=df2_val["answered_correctly"])

    model = lgb.train(params,
                      train_data,
                      categorical_feature=categorical_feature,
                      valid_sets=[train_data, valid_data1, valid_data2],
                      verbose_eval=100)
    print(
        roc_auc_score(df_val["answered_correctly"],
                      model.predict(df_val[features])))
    print(
        roc_auc_score(df2_val["answered_correctly"],
                      model.predict(df2_val[features])))

    if not is_debug:
        mlflow.log_metric("auc_train", model.best_score["training"]["auc"])
        mlflow.log_metric("auc_val", model.best_score["valid_1"]["auc"])
        mlflow.end_run()

    df_imp["importance"] = model.feature_importance(
        "gain") / model.feature_importance("gain").sum()
    df_imp.sort_values(
        "importance",
        ascending=False).to_csv(f"{output_dir}/imp_{model_id}.csv")

    with open(f"{output_dir}/model_{model_id}_lgbm.pickle", "wb") as f:
        pickle.dump(model, f)

    # NOTE(review): OOF predictions are taken from `df` (raw columns), not
    # `df1` (engineered/renamed columns the model was trained on) — confirm
    # that every name in `features` actually exists in `df`.
    y_oof = model.predict(df.loc[val_idx][features])
    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = y_oof
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values
    df_oof.to_csv(f"{output_dir}/oof_{model_id}_lgbm.csv", index=False)
def main(params: dict, output_dir: str):
    """Build and pickle the feature-factory state for the transformer model.

    Loads the merged train set, fits the feature factories and the
    transformer-specific factory on the first 95k rows, then replays the
    remaining rows group-by-group (user_id, task_container_id) to bring the
    state fully up to date before pickling both managers into ``output_dir``.

    NOTE(review): this function reads `is_debug`, `is_make_feature_factory`
    and `model_id` which are not defined in this view — presumably
    module-level globals; confirm before reuse. The nesting below
    `if is_make_feature_factory:` is reconstructed from collapsed source.

    Args:
        params: transformer parameters; must contain "max_seq". Also dumped
            to ``transformer_param.json``.
        output_dir: destination directory for the JSON/pickle artifacts.
    """
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(500000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)

    # per-column handling for the transformer feature factory:
    # "category" -> vocabulary-encoded, "leakage_feature" -> label-only,
    # "numeric" -> passed through
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        }
    }

    # persist the transformer parameters alongside the artifacts
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        ff_for_transformer.make_dict(df=df)

        # bulk-fit on the first 95k rows, then stream the rest so the state
        # matches what incremental inference would produce
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df.iloc[:95000].copy())
        w_df = feature_factory_manager.all_predict(df.iloc[:95000].copy())
        ff_for_transformer.fit(w_df)
        for _, w_df in tqdm(df.iloc[95000:].groupby(
                ["user_id", "task_container_id"])):
            # predict without labels first (as at test time), then restore the
            # labels and fit both factories on the completed group
            ww_df = feature_factory_manager.partial_predict(
                w_df.drop(["answered_correctly", "user_answer"], axis=1))
            # NOTE(review): `group` is assigned but never used; the call is
            # kept because partial_predict may mutate the factory state
            group = ff_for_transformer.partial_predict(ww_df)
            ww_df["answered_correctly"] = w_df["answered_correctly"]
            ww_df["user_answer"] = w_df["user_answer"]
            feature_factory_manager.fit(ww_df)
            ff_for_transformer.fit(ww_df)

        # loggers are not picklable: detach them before serializing
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
df_val = df.iloc[len(train_idx):] print(df_train) df2 = pd.read_pickle(fname).sort_values(["user_id", "timestamp" ]).reset_index(drop=True) # df2 = pd.concat([pd.read_pickle(fname).head(500), pd.read_pickle(fname).tail(500)]).sort_values(["user_id", "timestamp"]).reset_index(drop=True) df2["answered_correctly"] = df2["answered_correctly"].replace(-1, np.nan) df2["prior_question_had_explanation"] = df2[ "prior_question_had_explanation"].fillna(-1).astype("int8") df2_train = feature_factory_manager.all_predict(df2.iloc[train_idx]) print(df2_train) feature_factory_manager.fit(df2.iloc[train_idx], is_first_fit=True) df2_val = [] for i in tqdm.tqdm(range(len(val_idx) // 3)): w_df = df2.iloc[val_idx[i * 3:(i + 1) * 3]] df2_val.append(feature_factory_manager.partial_predict(w_df)) feature_factory_manager.fit(w_df) df2_val = pd.concat(df2_val) df2_val = df2_val.drop(["user_answer", "tags", "type_of"], axis=1) os.makedirs(output_dir, exist_ok=True) df_val.to_csv("exp055_all.csv", index=False) df2_val.to_csv("exp055_partial.csv", index=False) params = { 'objective': 'binary', 'num_leaves': 32, 'min_data_in_leaf': 15, # 42, 'max_depth': -1, 'learning_rate': 0.3, 'boosting': 'gbdt',