示例#1
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble the feature factories and wrap them in a FeatureFactoryManager.

    Factories are registered in a nested dict: the outer key is a column name
    (or a tuple of column names for a composite key) and the inner key is a
    label for the factory. Registration order is kept exactly as written,
    since dicts preserve insertion order.

    Reads the module-level ``is_debug`` flag: when it is set, feature
    loading/saving is disabled on the manager and the flag is forwarded to the
    factories that accept it.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the factories
            that take a ``model_id``.

    Returns:
        The configured ``FeatureFactoryManager``.
    """
    logger = get_logger()

    def to_column_arg(column):
        # A single column is handed over as a str, a composite key as a list.
        return column if isinstance(column, str) else list(column)

    feature_factory_dict = {}

    for column in ["user_id", "content_id"]:
        is_partial_fit = column in ("content_id", "user_id")
        feature_factory_dict[column] = {
            "TargetEncoder":
            TargetEncoder(column=to_column_arg(column),
                          is_partial_fit=is_partial_fit)
        }

    # Aliases of the inner dicts; writing through them updates
    # feature_factory_dict in place.
    user_factories = feature_factory_dict["user_id"]
    content_factories = feature_factory_dict["content_id"]

    user_factories["DurationPreviousContent"] = DurationPreviousContent(
        is_partial_fit=True)
    user_factories["PastNTimestampEncoder"] = PastNFeatureEncoder(
        column="timestamp",
        past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        agg_funcs=["vslast"],
        remove_now=False)
    user_factories["StudyTermEncoder2"] = StudyTermEncoder2(
        is_partial_fit=True)
    user_factories[
        "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
    user_factories["CountEncoder"] = CountEncoder(column="user_id",
                                                  is_partial_fit=True)
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder":
        UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }
    user_factories["PastNUserAnswerHistory"] = PastNUserAnswerHistory(
        past_n=2, min_size=300)

    for column in [("user_id", "prior_question_had_explanation"),
                   ("content_id", "prior_question_had_explanation"),
                   ("part", "prior_question_had_explanation"),
                   ("user_id", "part", "prior_question_had_explanation")]:
        column_factories = feature_factory_dict.setdefault(column, {})
        # The label embeds the original key (str or tuple repr), not the
        # normalised list, so the feature names stay unchanged.
        column_factories[
            f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(
                column=to_column_arg(column),
                agg_column="duration_previous_content_cap100k",
                remove_now=False)
        column_factories[
            f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
                column=to_column_arg(column),
                agg_column="study_time",
                remove_now=False)

    user_factories["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    user_factories["UserContentNowRateEncoder"] = UserContentNowRateEncoder(
        column="part", target=[1, 2, 3, 4, 5, 6, 7], rate_func="elo")
    user_factories["PreviousAnswer2"] = PreviousAnswer2(
        groupby="user_id",
        column="content_id",
        is_debug=is_debug,
        model_id=model_id,
        n=300)
    user_factories["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=5, is_partial_fit=True)

    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    user_factories[
        "QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories[
        "QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")

    content_factories["CorrectVsIncorrectMeanEncoderContent-Duration100k"] = \
        CorrectVsIncorrectMeanEncoder(groupby="content_id",
                                      column="duration_previous_content_cap100k",
                                      min_size=300)
    # NOTE(review): registered under "content_id" and labelled "Content-...",
    # yet it groups by "part" -- confirm this is intended.
    content_factories["CorrectVsIncorrectMeanEncoderContent-UserIdTargetEnc"] = \
        CorrectVsIncorrectMeanEncoder(groupby="part",
                                      column="target_enc_user_id",
                                      min_size=300)

    user_factories[
        "PreviousContentAnswerTargetEncoder"] = PreviousContentAnswerTargetEncoder(
            min_size=300)
    feature_factory_dict["post"] = {
        "DurationFeaturePostProcess": DurationFeaturePostProcess()
    }

    use_feature_cache = not is_debug
    return FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict,
        logger=logger,
        split_num=split_num,
        model_id=model_id,
        load_feature=use_feature_cache,
        save_feature=use_feature_cache)
示例#2
0
文件: ex_279.py 项目: kurupical/riiid
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble the feature factories and wrap them in a FeatureFactoryManager.

    Factories are registered in a nested dict: the outer key is a column name
    (or a tuple of column names for a composite key) and the inner key is a
    label for the factory. Registration order is kept exactly as written,
    since dicts preserve insertion order.

    Reads the module-level ``is_debug`` flag: when it is set, feature
    loading/saving is disabled on the manager and the flag is forwarded to the
    factories that accept it.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the factories
            that take a ``model_id``.

    Returns:
        The configured ``FeatureFactoryManager``.
    """
    logger = get_logger()

    def to_column_arg(column):
        # A single column is handed over as a str, a composite key as a list.
        return column if isinstance(column, str) else list(column)

    feature_factory_dict = {}

    for column in ["user_id", "content_id", ("last_lecture", "content_id")]:
        # Only the two plain id columns are fitted incrementally; the
        # composite key compares unequal to both strings.
        is_partial_fit = column in ("content_id", "user_id")
        feature_factory_dict[column] = {
            "TargetEncoder":
            TargetEncoder(column=to_column_arg(column),
                          is_partial_fit=is_partial_fit)
        }

    # Alias of the inner dict; writing through it updates
    # feature_factory_dict["user_id"] in place.
    user_factories = feature_factory_dict["user_id"]

    user_factories["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    user_factories["PastNTimestampEncoder"] = PastNFeatureEncoder(
        column="timestamp",
        past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        agg_funcs=["vslast"],
        remove_now=False)
    user_factories["Past1ContentTypeId"] = PastNFeatureEncoder(
        column="content_type_id",
        past_ns=[5, 15],
        agg_funcs=["mean"],
        remove_now=False)
    user_factories["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    user_factories[
        "ElapsedTimeVsShiftDiffEncoder"] = ElapsedTimeVsShiftDiffEncoder()
    user_factories["CountEncoder"] = CountEncoder(column="user_id",
                                                  is_partial_fit=True)
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder":
        UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }

    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        column_factories = feature_factory_dict.setdefault(column, {})
        # The label embeds the original key (str or tuple repr), not the
        # normalised list, so the feature names stay unchanged.
        column_factories[
            f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(
                column=to_column_arg(column),
                agg_column="shiftdiff_timestamp_by_user_id_cap200k",
                remove_now=False)
        column_factories[
            f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
                column=to_column_arg(column),
                agg_column="study_time",
                remove_now=False)

    user_factories["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])

    user_factories["PreviousAnswer2"] = PreviousAnswer2(
        groupby="user_id",
        column="content_id",
        is_debug=is_debug,
        model_id=model_id,
        n=300)
    user_factories["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=5, is_partial_fit=True)

    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    user_factories[
        "QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories[
        "QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")
    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    use_feature_cache = not is_debug
    return FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict,
        logger=logger,
        split_num=split_num,
        model_id=model_id,
        load_feature=use_feature_cache,
        save_feature=use_feature_cache)
示例#3
0
def main(params: dict, output_dir: str):
    """Train a SAKTModel on the ``train_0`` split and write its artefacts.

    Steps, in order:
      1. Load ``split10/train_0.pickle``, build features and binned columns.
      2. Split rows into train/validation per user and build SAKTDatasets
         (or load previously pickled datasets when ``load_pickle`` is set).
      3. Train for ``epochs`` epochs, then score the validation loader.
      4. Write ``transformers1.csv``, ``transformers.pth`` and
         ``transformer_param.json`` into ``output_dir``.
      5. When ``is_make_feature_factory`` is set, fit the feature factories on
         ``train_merged.pickle`` and pickle them into ``output_dir``.

    Relies on module-level names not passed as arguments: ``is_debug``,
    ``load_pickle``, ``is_make_feature_factory``, ``dropout``, ``epochs`` and
    ``device``.

    Args:
        params: Hyper-parameters. Keys read here: ``"max_seq"``,
            ``"batch_size"``, ``"embed_dim"``, ``"lr"`` and
            ``"num_warmup_steps"``. The whole dict is logged and dumped to
            JSON.
        output_dir: Directory that receives the output files (not created
            here).
    """
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "study_time_bin300": {"type": "category"},
        "diff_mean_study_time_by_user_id_bin300": {"type": "category"},
        "past2_timestamp_vslast_bin300": {"type": "category"},
        "past3_timestamp_vslast_bin300": {"type": "category"},
        "past4_timestamp_vslast_bin300": {"type": "category"},
        "past5_timestamp_vslast_bin300": {"type": "category"},
        "rating_diff_content_user_id_bin500": {"type": "category"},
    }

    # Datasets are rebuilt unless they are going to be loaded from pickle
    # (debug runs always rebuild).
    build_datasets = not load_pickle or is_debug

    if build_datasets:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_dict["user_id"][
            "StudyTermEncoder"] = StudyTermEncoder2()
        feature_factory_dict["user_id"][
            "MeanAggregatorStudyTimebyUserId"] = MeanAggregator(
                column="user_id", agg_column="study_time", remove_now=False)

        feature_factory_dict["user_id"][
            "PastNTimestampEncoder"] = PastNFeatureEncoder(
                column="timestamp",
                past_ns=[2, 3, 4, 5],
                agg_funcs=["vslast"],
                remove_now=False)
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)

        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"].values
        ]

        def clip_to_bin(x):
            # Integer-divide by 1000, then clamp to [-150, 150].
            x = x // 1000
            if x > 150:
                return 150
            if x < -150:
                return -150
            return x

        # (source column, binned column) pairs, all binned the same way.
        binned_columns = [
            ("study_time", "study_time_bin300"),
            ("diff_mean_study_time_by_user_id",
             "diff_mean_study_time_by_user_id_bin300"),
            ("past2_timestamp_vslast", "past2_timestamp_vslast_bin300"),
            ("past3_timestamp_vslast", "past3_timestamp_vslast_bin300"),
            ("past4_timestamp_vslast", "past4_timestamp_vslast_bin300"),
            ("past5_timestamp_vslast", "past5_timestamp_vslast_bin300"),
            ("rating_diff_content_user_id",
             "rating_diff_content_user_id_bin500"),
        ]
        for source_col, binned_col in binned_columns:
            df[binned_col] = [clip_to_bin(x) for x in df[source_col].values]

        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300", "study_time_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300", "past2_timestamp_vslast_bin300",
            "past3_timestamp_vslast_bin300", "past4_timestamp_vslast_bin300",
            "past5_timestamp_vslast_bin300",
            "rating_diff_content_user_id_bin500",
            "diff_mean_study_time_by_user_id_bin300"
        ]].fillna(0)
        print(df.head(10))

        print("data preprocess")

        train_idx = []
        val_idx = []
        np.random.seed(0)
        for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
            if np.random.random() < 0.01:
                # all val
                val_idx.extend(w_df.index.tolist())
            else:
                # First 95% of the user's rows train, the remainder validate.
                train_num = int(len(w_df) * 0.95)
                train_idx.extend(w_df[:train_num].index.tolist())
                val_idx.extend(w_df[train_num:].index.tolist())
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])
    if build_datasets:
        df["is_val"] = 0
        # Single .loc call: the chained form df["is_val"].loc[...] = 1 writes
        # to a temporary and leaves df untouched under pandas Copy-on-Write.
        df.loc[val_idx, "is_val"] = 1
        # Explicit copy so the column assignments below do not target a slice
        # of df.
        w_df = df[df["is_val"] == 0].copy()
        # Number each user's rows into chunks of max_seq, counted from the
        # end, and fold the chunk number into user_id.
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if build_datasets:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model139", exist_ok=True)
    if not is_debug and not load_pickle:
        with open("../input/feature_engineering/model139/train.pickle",
                  "wb") as fp:
            pickle.dump(dataset_train, fp)
        with open("../input/feature_engineering/model139/val.pickle",
                  "wb") as fp:
            pickle.dump(dataset_val, fp)

    if not is_debug and load_pickle:
        with open("../input/feature_engineering/model139/train.pickle",
                  "rb") as fp:
            dataset_train = pickle.load(fp)
        with open("../input/feature_engineering/model139/val.pickle",
                  "rb") as fp:
            dataset_val = pickle.load(fp)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)

    # Parameters whose name contains one of the no_decay fragments get
    # weight_decay 0.0; all others get 0.01.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    # NOTE(review): model.eval() is not called here; presumably train_epoch
    # leaves the model in eval mode -- confirm.
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()

            output = model(item, device)
            # Only the last position of each sequence is scored.
            preds.extend(torch.nn.Sigmoid()(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as fp:
        json.dump(params, fp)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        # Loggers are cleared on every factory and on the manager before
        # pickling (presumably because they cannot be pickled -- confirm).
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as fp:
            pickle.dump(feature_factory_manager, fp)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as fp:
            pickle.dump(ff_for_transformer, fp)