示例#1
0
def run(debug, model_dir, kaggle=False):
    """Riiid inference entry point (variant with question/lecture metadata).

    Fits the feature factories on the historical training splits, then streams
    the riiideducation test iterator: each batch is joined with metadata,
    transformed, scored with an ensemble of pickled LightGBM models, and
    submitted via ``env.predict``. Labels for a batch only arrive with the
    *next* batch, so scored batches are buffered and used to incrementally
    re-fit the feature factories once their labels become available.

    Args:
        debug: when truthy, truncate inputs and dump per-iteration CSVs.
        model_dir: directory searched for ``*model*.pickle`` model files.
        kaggle: selects the Kaggle-kernel input path over the local one.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()
    # Kaggle competition environment (provides the test iterator).
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })

    # Model loading: every matching pickle is ensembled by simple mean later.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # ----- feature-factory construction -------------------------------------
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    # Count + target encoders for single columns and column pairs.
    for column in [
            "content_id", "user_id", "content_type_id",
            "prior_question_had_explanation", "tags1", "tags2", "tags3",
            "tags4", "tags5", "tags6", ("user_id", "content_type_id"),
            ("user_id", "prior_question_had_explanation")
    ]:
        # Only the high-cardinality content_id target encoder is partial-fit.
        is_partial_fit = column == "content_id"
        if isinstance(column, str):
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=column),
                "TargetEncoder":
                TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=list(column)),
                "TargetEncoder":
                TargetEncoder(column=list(column),
                              is_partial_fit=is_partial_fit)
            }

    # Count-only encoders (no target encoding) for these keys.
    for column in [
            "part", ("user_id", "tag"), ("user_id", "part"),
            ("content_type_id", "part"), ("user_id", "content_id")
    ]:
        if isinstance(column, str):
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column))
            }

    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoder"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)

    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    # ----- initial fit on the historical training splits --------------------
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        # -1 marks lecture rows (no label); map them to NaN.
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        # Join question metadata to question rows (content_type_id == 0) and
        # lecture metadata to lecture rows (content_type_id == 1).
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)

    # ----- streaming inference ----------------------------------------------
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()  # scored batches awaiting their labels
    answered_correctlies = []
    user_answers = []
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}] iteration {i}: data_length: {len(df_test)}"
        )
        # Collect the previous batch's labels (delivered with this batch).
        if len(df_test_prev) > 0:  # skipped only on the very first iteration
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # The prior-group fields are stringified Python lists (e.g.
            # "[1, 0, 1]"); strip brackets/quotes and parse the ints.
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])

        # Re-fit the factories once enough labelled rows have accumulated.
        update_record = 1 if debug else 150
        if len(df_test_prev) > update_record:
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # Lecture rows carry label -1: drop them before fitting.
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            feature_factory_manager.fit(df_test_prev)

            df_test_prev = pd.DataFrame()
            answered_correctlies = []
            user_answers = []

        # ----- transform and score the current batch ------------------------
        logger.info("merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        df_test = pd.concat([w_df1, w_df2])
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        # LightGBM feature names cannot contain spaces.
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info("other... ")

        # Predict: simple average over the model ensemble.
        cols = models[0].feature_name()
        predicts = [model.predict(df[cols]) for model in models]

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # Buffer this batch until its labels arrive with the next one.
        # (DataFrame.append is removed in pandas 2.x; use pd.concat.)
        df_test_prev = pd.concat(
            [df_test_prev, df[cols + ["user_id", "tags"]]])
        if debug:
            df_test_prev.to_csv(f"{i}.csv")
示例#2
0
def run(debug, model_dir, kaggle=False):
    """Riiid inference entry point (variant without metadata joins).

    Fits count/target feature factories on the training splits, then streams
    the riiideducation test iterator, scoring each batch with an ensemble of
    pickled LightGBM models and re-fitting the factories on the previous
    batch once its labels arrive with the following batch.

    Args:
        debug: when truthy, truncate training inputs to 1000 rows per split.
        model_dir: directory searched for ``*model*.pickle`` model files.
        kaggle: selects the Kaggle-kernel input path over the local one.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"

    logger = get_logger()
    # Kaggle competition environment (provides the test iterator).
    env = riiideducation.make_env()

    # Model loading: every matching pickle is ensembled by simple mean later.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # Feature factories: count/target encoders per column and column pair.
    feature_factory_dict = {}
    for column in [
            "user_id", "content_id", "content_type_id",
            "prior_question_had_explanation"
    ]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=column),
            "TargetEncoder": TargetEncoder(column=column)
        }
    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    for column in [("user_id", "content_type_id"),
                   ("user_id", "prior_question_had_explanation")]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=list(column)),
            "TargetEncoder": TargetEncoder(column=list(column))
        }
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    # Initial fit on the historical training splits.
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        if debug:
            df = df.head(1000)
        feature_factory_manager.fit(df)

    # Streaming inference.
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}] iteration {i}: data_length: {len(df_test)}"
        )
        # Attach the previous batch's labels (delivered with this batch) and
        # re-fit the factories on it.
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]

            # Prior-group fields are stringified Python lists; strip the
            # brackets/quotes and parse the ints.
            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ]

            feature_factory_manager.fit(df_test_prev)

        # Transform and score the current batch.
        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        logger.info("other... ")
        cols = models[0].feature_name()
        # Guard: fill any model feature the factories did not produce here.
        for col in cols:
            if col not in df.columns:
                df[col] = -99999

        # Predict: simple average over the model ensemble.
        predicts = [model.predict(df[cols]) for model in models]

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # .copy() so next iteration's label assignments write to an
        # independent frame, not a view of df (SettingWithCopy hazard).
        df_test_prev = df[cols + ["user_id"]].copy()

        df_test_prev.to_csv(f"{i}.csv")
示例#3
0
def train_lgbm_cv_newuser_with_iteration(
        df: pd.DataFrame,
        feature_factory_manager: FeatureFactoryManager,
        params: dict,
        output_dir: str,
        model_id: int,
        exp_name: str,
        drop_user_id: bool,
        categorical_feature: list = None,
        experiment_id: int = 0,
        is_debug: bool = False):
    """Train a LightGBM model with a new-user-aware split and verify that
    batch-wise ``partial_predict`` reproduces the ``all_predict`` features.

    Roughly 10% of users are held out entirely (simulating unseen users);
    for the remaining users, the last 10% of each user's rows go to
    validation. Two validation feature sets are built: one via
    ``all_predict`` and one by replaying the validation rows in 100-row
    batches through ``partial_predict``/``fit`` (mimicking inference-time
    updates); the model is evaluated on both.

    Side effects: logs params/metrics to mlflow (unless ``is_debug``) and
    writes feature importances, the pickled model, and OOF predictions
    into ``output_dir``.
    """
    # None sentinel instead of a mutable default list shared across calls.
    if categorical_feature is None:
        categorical_feature = []

    if not is_debug:
        mlflow.start_run(experiment_id=experiment_id, run_name=exp_name)

        mlflow.log_param("model_id", model_id)
        mlflow.log_param("count_row", len(df))
        mlflow.log_param("count_column", len(df.columns))

        for key, value in params.items():
            mlflow.log_param(key, value)

    # Columns never fed to the model; user_id is optionally excluded too.
    exclude = [
        "answered_correctly", "user_answer", "tags", "type_of", "bundle_id",
        "previous_5_ans"
    ]
    if drop_user_id:
        exclude.append("user_id")
    features = [x for x in df.columns if x not in exclude]

    df_imp = pd.DataFrame()
    df_imp["feature"] = features

    def _sanitize(columns):
        # LightGBM rejects JSON-special characters in feature names.
        return [
            x.replace("[", "_").replace("]", "_").replace("'", "_").replace(
                " ", "_").replace(",", "_") for x in columns
        ]

    df1 = feature_factory_manager.all_predict(df.copy())

    # Split: ~10% of users are fully held out ("new users"); the others
    # contribute their last 10% of interactions to validation.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.1:
            # every row of this user goes to validation
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    val_idx = val_idx[:1000000]  # cap validation size
    df1 = df1.drop(
        ["user_answer", "tags", "type_of", "bundle_id", "previous_5_ans"],
        axis=1)
    df1.columns = _sanitize(df1.columns)
    df_train = df1.loc[train_idx]
    df_train = df_train[df_train["answered_correctly"].notnull()]
    df_val = df1.loc[val_idx]
    df_val = df_val[df_val["answered_correctly"].notnull()]

    # Second validation set: replay validation rows in 100-row batches
    # through partial_predict/fit, exactly as at inference time.
    feature_factory_manager.fit(df.loc[train_idx])

    df2 = []
    for i in tqdm.tqdm(range(len(val_idx) // 100)):
        w_df = df.loc[val_idx[i * 100:(i + 1) * 100]]
        df2.append(feature_factory_manager.partial_predict(w_df))
        feature_factory_manager.fit(w_df)
    df2 = pd.concat(df2)
    df2.columns = _sanitize(df2.columns)
    df2_val = df2[df2["answered_correctly"].notnull()]

    print(df_val)
    print(df2_val)
    # Both validation builds must cover the same labelled rows.
    assert len(df_val) == len(df2_val)

    print(f"make_train_data len={len(train_idx)}")
    train_data = lgb.Dataset(df_train[features],
                             label=df_train["answered_correctly"])
    print(f"make_test_data len={len(val_idx)}")
    valid_data1 = lgb.Dataset(df_val[features],
                              label=df_val["answered_correctly"])
    valid_data2 = lgb.Dataset(df2_val[features],
                              label=df2_val["answered_correctly"])

    model = lgb.train(params,
                      train_data,
                      categorical_feature=categorical_feature,
                      valid_sets=[train_data, valid_data1, valid_data2],
                      verbose_eval=100)
    print(
        roc_auc_score(df_val["answered_correctly"],
                      model.predict(df_val[features])))
    print(
        roc_auc_score(df2_val["answered_correctly"],
                      model.predict(df2_val[features])))

    if not is_debug:
        mlflow.log_metric("auc_train", model.best_score["training"]["auc"])
        mlflow.log_metric("auc_val", model.best_score["valid_1"]["auc"])
        mlflow.end_run()

    # Gain importances, normalized to sum to 1.
    df_imp["importance"] = model.feature_importance(
        "gain") / model.feature_importance("gain").sum()
    df_imp.sort_values(
        "importance",
        ascending=False).to_csv(f"{output_dir}/imp_{model_id}.csv")
    with open(f"{output_dir}/model_{model_id}_lgbm.pickle", "wb") as f:
        pickle.dump(model, f)

    # OOF predictions.
    # NOTE(review): predicts on the raw df rows rather than the engineered
    # df1/df2 frames — confirm that is intentional.
    y_oof = model.predict(df.loc[val_idx][features])
    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = y_oof
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values

    df_oof.to_csv(f"{output_dir}/oof_{model_id}_lgbm.csv", index=False)
示例#4
0
def main(params: dict, output_dir: str):
    """Build and pickle feature-factory state for transformer training.

    Loads the merged training data, dumps ``params`` as JSON, and — when
    ``is_make_feature_factory`` is set — fits a ``FeatureFactoryManager``
    and a ``FeatureFactoryForTransformer``: both are bootstrapped on the
    first 95k rows, the remainder is replayed group-by-group interleaving
    predict and fit, and both objects are pickled into ``output_dir``.

    NOTE(review): ``is_debug``, ``model_id`` and ``is_make_feature_factory``
    are read from module-level globals, not parameters — confirm they are
    defined before this is called.
    """
    print("start params={}".format(params))
    logger = get_logger()
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    if is_debug:
        df = df.head(500000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    # Column roles consumed by FeatureFactoryForTransformer.
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        }
    }

    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # Feature factory: per-user duration / elapsed-time-bin / elo-rating
        # encoders, all keyed on user_id.
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        ff_for_transformer.make_dict(df=df)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        # Bootstrap both factories on the first 95k rows...
        feature_factory_manager.fit(df.iloc[:95000].copy())
        w_df = feature_factory_manager.all_predict(df.iloc[:95000].copy())
        ff_for_transformer.fit(w_df)
        # ...then replay the rest group-by-group, interleaving predict and
        # fit exactly as at inference time.
        for _, w_df in tqdm(df.iloc[95000:].groupby(
            ["user_id", "task_container_id"])):
            ww_df = feature_factory_manager.partial_predict(
                w_df.drop(["answered_correctly", "user_answer"], axis=1))
            # Return value discarded; presumably called for its internal
            # state update — TODO confirm partial_predict mutates state.
            ff_for_transformer.partial_predict(ww_df)

            ww_df["answered_correctly"] = w_df["answered_correctly"]
            ww_df["user_answer"] = w_df["user_answer"]
            feature_factory_manager.fit(ww_df)
            ff_for_transformer.fit(ww_df)
        # Loggers are detached before pickling.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
示例#5
0
    df_val = df.iloc[len(train_idx):]
    print(df_train)

    df2 = pd.read_pickle(fname).sort_values(["user_id", "timestamp"
                                             ]).reset_index(drop=True)
    # df2 = pd.concat([pd.read_pickle(fname).head(500), pd.read_pickle(fname).tail(500)]).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    df2["answered_correctly"] = df2["answered_correctly"].replace(-1, np.nan)
    df2["prior_question_had_explanation"] = df2[
        "prior_question_had_explanation"].fillna(-1).astype("int8")
    df2_train = feature_factory_manager.all_predict(df2.iloc[train_idx])
    print(df2_train)
    feature_factory_manager.fit(df2.iloc[train_idx], is_first_fit=True)
    df2_val = []
    for i in tqdm.tqdm(range(len(val_idx) // 3)):
        w_df = df2.iloc[val_idx[i * 3:(i + 1) * 3]]
        df2_val.append(feature_factory_manager.partial_predict(w_df))
        feature_factory_manager.fit(w_df)
    df2_val = pd.concat(df2_val)
    df2_val = df2_val.drop(["user_answer", "tags", "type_of"], axis=1)

    os.makedirs(output_dir, exist_ok=True)

    df_val.to_csv("exp055_all.csv", index=False)
    df2_val.to_csv("exp055_partial.csv", index=False)
    params = {
        'objective': 'binary',
        'num_leaves': 32,
        'min_data_in_leaf': 15,  # 42,
        'max_depth': -1,
        'learning_rate': 0.3,
        'boosting': 'gbdt',