def make_feature_factory_manager(split_num, model_id=None):
    """Build the FeatureFactoryManager for the target-encoding feature set.

    The registry maps a grouping key (a column name, or a tuple of column
    names) to a dict of ``{factory_name: factory}``. Entries are inserted in
    a fixed order.

    NOTE(review): insertion order presumably matters to
    FeatureFactoryManager (several aggregators read columns such as
    ``target_enc_content_id`` or ``user_count_bin``) - confirm before
    reordering anything here.
    NOTE(review): ``is_debug`` is not a parameter; it is read from the
    enclosing/module scope.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``PreviousAnswer2`` only.

    Returns:
        The configured ``FeatureFactoryManager``.
    """
    logger = get_logger()

    factories = {
        "tags": {"TagsSeparator": TagsSeparator(is_partial_fit=True)},
    }

    def as_column(key):
        # Single columns are passed as str, compound keys as a fresh list.
        return key if isinstance(key, str) else list(key)

    def add_target_enc_means(keys, agg_column):
        # Registers one MeanAggregator over `agg_column` per grouping key,
        # creating the key's bucket on first use. The factory name embeds
        # the key exactly as written (tuples keep their tuple repr).
        for key in keys:
            bucket = factories.setdefault(key, {})
            bucket[f"MeanAggregatorTargetEncContentIdBy{key}"] = MeanAggregator(
                column=as_column(key),
                agg_column=agg_column,
                remove_now=False)

    for key in [
            "user_id",
            "content_id",
            "part",
            ("user_id", "prior_question_had_explanation"),
            ("user_id", "part"),
            ("content_id", "prior_question_had_explanation"),
    ]:
        factories[key] = {
            "TargetEncoder": TargetEncoder(
                column=as_column(key),
                # Only the plain user/content encoders are partially fitted.
                is_partial_fit=key in ("content_id", "user_id"))
        }

    by_user = factories["user_id"]
    by_content = factories["content_id"]

    by_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    by_user["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(
        groupby="user_id", column="content_id")
    for key in ["user_id", "content_id"]:
        factories[key][
            f"MeanAggregatorPriorQuestionElapsedTimeby{key}"] = MeanAggregator(
                column=key,
                agg_column="prior_question_elapsed_time",
                remove_now=True)

    by_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    by_content["ContentLevelEncoder2UserId"] = ContentLevelEncoder(
        vs_column="user_id", is_partial_fit=True)
    by_user["MeanAggregatorContentLevel"] = MeanAggregator(
        column="user_id",
        agg_column="content_level_user_id",
        remove_now=False)
    by_user["CountEncoder"] = CountEncoder(column="user_id",
                                           is_partial_fit=True)
    by_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    factories["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    for pair in [
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
            ("prior_question_had_explanation", "user_count_bin"),
    ]:
        factories[pair] = {"TargetEncoder": TargetEncoder(column=list(pair))}

    by_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    by_user["FirstColumnEncoderContentId"] = FirstColumnEncoder(
        column="content_id", astype="int16", is_partial_fit=True)
    by_user["FirstColumnEncoderPart"] = FirstColumnEncoder(
        column="part", astype="int8", is_partial_fit=True)

    add_target_enc_means(
        [
            "user_id",
            "user_count_bin",
            "first_column_content_id",
            "first_column_part",
            ("user_id", "part"),
        ],
        agg_column="target_enc_content_id")
    # The factory names below still say "TargetEncContentId" although the
    # aggregated column is target_enc_user_id; the names are kept verbatim
    # because they are runtime keys.
    add_target_enc_means(
        [
            "content_id",
            "part",
            "tags1",
            "tags2",
            "prior_question_had_explanation",
            ("content_id", "prior_question_had_explanation"),
        ],
        agg_column="target_enc_user_id")

    factories["user_count_bin"][
        "CategoryLevelEncoderUserCountBin"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="user_count_bin",
            categories=[0])
    factories["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    factories[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder":
        TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    by_user["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                 column="content_id",
                                                 is_debug=is_debug,
                                                 model_id=model_id)

    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=logger,
                                 split_num=split_num)
def make_feature_factory_manager(split_num, size, window, model_id=None):
    """Build the FeatureFactoryManager for the study-time / rating feature set.

    The registry maps a grouping key (a column name, or a tuple of column
    names) to a dict of ``{factory_name: factory}``. Entries are inserted in
    a fixed order.

    NOTE(review): insertion order presumably matters to
    FeatureFactoryManager - confirm before reordering anything here.
    NOTE(review): ``is_debug`` is not a parameter; it is read from the
    enclosing/module scope.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        size: Unused by this function; kept for interface compatibility.
        window: Unused by this function; kept for interface compatibility.
        model_id: Forwarded to ``PreviousAnswer2``, the two table encoders
            and ``FeatureFactoryManager``.

    Returns:
        The configured ``FeatureFactoryManager``; feature loading and saving
        are enabled only when ``is_debug`` is falsy.
    """
    logger = get_logger()

    def as_column(key):
        # Single columns are passed as str, compound keys as a fresh list.
        return key if isinstance(key, str) else list(key)

    factories = {
        "tags": {"TagsSeparator": TagsSeparator(is_partial_fit=True)},
    }
    # Both keys are plain column names, so the encoders are always built
    # from the str name with partial fitting enabled.
    for key in ["user_id", "content_id"]:
        factories[key] = {
            "TargetEncoder": TargetEncoder(column=key, is_partial_fit=True)
        }

    by_user = factories["user_id"]

    by_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    by_user["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    # UserLevelEncoder2 / ContentLevelEncoder / the content-level
    # MeanAggregator were commented out in this variant and are not
    # registered.
    by_user["CountEncoder"] = CountEncoder(column="user_id",
                                           is_partial_fit=True)
    by_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    factories["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    for pair in [("user_id", "user_count_bin"),
                 ("content_id", "user_count_bin")]:
        factories[pair] = {"TargetEncoder": TargetEncoder(column=list(pair))}
    factories[("user_id", "part")] = {
        "UserContentRateEncoder":
        UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }

    # Two mean aggregators per grouping key; the factory name embeds the key
    # exactly as written (tuples keep their tuple repr).
    for key in ["user_id", "content_id", "part", ("user_id", "part")]:
        bucket = factories.setdefault(key, {})
        bucket[f"MeanAggregatorPriorQuestionElapsedTimeby{key}"] = MeanAggregator(
            column=as_column(key),
            agg_column="prior_question_elapsed_time",
            remove_now=True)
        bucket[f"MeanAggregatorStudyTimeby{key}"] = MeanAggregator(
            column=as_column(key),
            agg_column="study_time",
            remove_now=True)

    by_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    factories["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    factories[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder":
        TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    by_user["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                 column="content_id",
                                                 is_debug=is_debug,
                                                 model_id=model_id,
                                                 n=500)
    by_user["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)
    factories["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    by_user["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=100)
    by_user["QuestionQuestionTableEncoder"] = QuestionQuestionTableEncoder(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    by_user["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")
    factories["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)
def make_feature_factory_manager(split_num):
    """Build the FeatureFactoryManager for the count + target encoding set.

    The registry maps a grouping key (a column name, or a tuple of column
    names) to a dict of ``{factory_name: factory}``. Entries are inserted in
    a fixed order.

    NOTE(review): insertion order presumably matters to
    FeatureFactoryManager - confirm before reordering anything here.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.

    Returns:
        The configured ``FeatureFactoryManager``.
    """
    logger = get_logger()

    def as_column(key):
        # Single columns are passed as str, compound keys as a fresh list.
        return key if isinstance(key, str) else list(key)

    def count_and_target(key, **encoder_kwargs):
        # CountEncoder is built before TargetEncoder, matching the order in
        # which both appear in every bucket.
        return {
            "CountEncoder": CountEncoder(column=as_column(key),
                                         **encoder_kwargs),
            "TargetEncoder": TargetEncoder(column=as_column(key),
                                           **encoder_kwargs),
        }

    factories = {}
    for key in [
            "content_id",
            "user_id",
            "prior_question_had_explanation",
            ("user_id", "part"),
            ("content_id", "prior_question_had_explanation"),
    ]:
        # Only the plain user/content encoders are partially fitted.
        factories[key] = count_and_target(
            key, is_partial_fit=key in ("content_id", "user_id"))

    by_user = factories["user_id"]

    by_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    by_user["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(
        groupby="user_id", column="content_id")
    for key in ["user_id", "content_id"]:
        factories[key][
            f"MeanAggregatorPriorQuestionElapsedTimeby{key}"] = MeanAggregator(
                column=key,
                agg_column="prior_question_elapsed_time",
                remove_now=True)
    by_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    by_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    for key in [
            "user_count_bin",
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
            ("prior_question_had_explanation", "user_count_bin"),
    ]:
        factories[key] = count_and_target(key)

    by_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    factories["user_count_bin"][
        "CategoryLevelEncoderUserCountBin"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="user_count_bin",
            categories=[0])
    factories["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder()
    }
    factories[("part", "prior_question_elapsed_time_bin")] = count_and_target(
        ("part", "prior_question_elapsed_time_bin"))
    factories[("user_id", "content_id")] = {
        "PreviousAnswer2": PreviousAnswer2(column=["user_id", "content_id"])
    }

    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=logger,
                                 split_num=split_num)
} feature_factory_dict[("prior_question_had_explanation", "user_count_bin")] = { "CountEncoder": CountEncoder(column=["prior_question_had_explanation", "user_count_bin"]), "TargetEncoder": TargetEncoder(column=["prior_question_had_explanation", "user_count_bin"]) } feature_factory_dict["user_id"]["CategoryLevelEncoderPart"] = CategoryLevelEncoder(groupby_column="user_id", agg_column="part", categories=[1, 2, 3, 4, 5, 6, 7]) feature_factory_dict["user_count_bin"]["CategoryLevelEncoderUserCountBin"] = \ CategoryLevelEncoder(groupby_column="user_id", agg_column="user_count_bin", categories=[0, 1, 2, 3, 4, 5]) feature_factory_dict["prior_question_elapsed_time"] = { "PriorQuestionElapsedTimeBinningEncoder": PriorQuestionElapsedTimeBinningEncoder() } feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = { "CountEncoder": CountEncoder(column=["part", "prior_question_elapsed_time_bin"]), "TargetEncoder": TargetEncoder(column=["part", "prior_question_elapsed_time_bin"]) } feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict, logger=logger, split_num=10) df = feature_factory_manager.all_predict(df) os.makedirs(output_dir, exist_ok=True) params = { 'objective': 'binary', 'num_leaves': 32, 'min_data_in_leaf': 15, # 42,
# Characters stripped from the serialized "[1, 0, ...]" answer lists.
_ANSWER_LIST_NOISE = str.maketrans("", "", "[]' ")


def _parse_int_list(raw):
    """Parse a serialized list such as ``"[1, 0, 1]"`` into a list of ints."""
    return [int(token) for token in raw.translate(_ANSWER_LIST_NOISE).split(",")]


def run(debug, model_dir, kaggle=False):
    """Fit the feature pipeline on the training splits and serve predictions.

    Steps, in order:

    1. Load the question/lecture metadata and every pickled model matching
       ``{model_dir}/*model*.pickle``.
    2. Build the feature factories and fit them on each training split.
    3. Iterate over ``env.iter_test()``: once enough previous rows are
       buffered, label them with the reported answers and fit on them; then
       transform the current group, average the models' predictions and
       submit them through ``env.predict``.

    Args:
        debug: When truthy, only the first 1000 rows of each training split
            are used, and buffered test rows are fitted as soon as more than
            1 row is pending (instead of more than 50).
        model_dir: Directory searched for ``*model*.pickle`` files.
        kaggle: Selects the glob pattern used to find the training splits.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()

    # environment
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })

    # model loading
    # NOTE: pickle.load executes arbitrary code from the file; model_dir must
    # only contain trusted model files.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing
    def as_column(key):
        # Single columns are passed as str, compound keys as a fresh list.
        return key if isinstance(key, str) else list(key)

    def count_and_target(key, **encoder_kwargs):
        # CountEncoder is built before TargetEncoder, matching the order in
        # which both appear in every bucket.
        return {
            "CountEncoder": CountEncoder(column=as_column(key),
                                         **encoder_kwargs),
            "TargetEncoder": TargetEncoder(column=as_column(key),
                                           **encoder_kwargs),
        }

    # NOTE(review): insertion order of the registry presumably matters to
    # FeatureFactoryManager - confirm before reordering anything here.
    feature_factory_dict = {"tags": {"TagsSeparator": TagsSeparator()}}
    for key in [
            "content_id",
            "user_id",
            "part",
            "prior_question_had_explanation",
            "tags1",
            "tags2",
            ("user_id", "prior_question_had_explanation"),
            ("user_id", "part"),
            ("content_id", "prior_question_had_explanation"),
    ]:
        # Only the plain user/content encoders are partially fitted.
        feature_factory_dict[key] = count_and_target(
            key, is_partial_fit=key in ("content_id", "user_id"))

    by_user = feature_factory_dict["user_id"]
    by_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    by_user["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(
        groupby="user_id", column="content_id")
    for key in ["user_id", "content_id"]:
        feature_factory_dict[key][
            f"MeanAggregatorPriorQuestionElapsedTimeby{key}"] = MeanAggregator(
                column=key,
                agg_column="prior_question_elapsed_time",
                remove_now=True)
    by_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    by_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    for key in [
            "user_count_bin",
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
            ("prior_question_had_explanation", "user_count_bin"),
    ]:
        feature_factory_dict[key] = count_and_target(key)

    by_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id",
        agg_column="part",
        categories=[1, 2, 3, 4, 5, 6, 7])
    feature_factory_dict["user_count_bin"][
        "CategoryLevelEncoderUserCountBin"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="user_count_bin",
            categories=[0, 1, 2, 3, 4, 5])
    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder()
    }
    feature_factory_dict[(
        "part", "prior_question_elapsed_time_bin")] = count_and_target(
            ("part", "prior_question_elapsed_time_bin"))

    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    # Initial fit on every training split.
    for fname in glob.glob(files_dir):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df = df[df["answered_correctly"] != -1]
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    answered_correctlies = []
    user_answers = []
    # Buffered rows are fitted once more than this many are pending.
    update_record = 1 if debug else 50
    i = 0
    t = time.time()
    for df_test, df_sample_prediction in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )

        # Update with the previous group's data. The first row of the current
        # group carries the previous group's answers; this is skipped on the
        # first iteration, when nothing has been buffered yet.
        if len(df_test_prev) > 0:
            first_row = df_test.iloc[0]
            answered_correctlies.extend(
                _parse_int_list(first_row["prior_group_answers_correct"]))
            user_answers.extend(
                _parse_int_list(first_row["prior_group_responses"]))

        if len(df_test_prev) > update_record:
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # Rows labelled -1 are dropped before fitting.
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            feature_factory_manager.fit(df_test_prev)

            df_test_prev = pd.DataFrame()
            answered_correctlies = []
            user_answers = []

        # Fetch and compute the current group's data.
        logger.info("merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # NOTE(review): merge() returns a fresh integer index, so after
        # concat + sort_index the row order may differ from the order of the
        # prior_group_* lists when a group contains lectures - confirm.
        df_test = pd.concat([w_df1, w_df2]).sort_values(
            ["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info("other... ")

        # predict: average the per-row predictions of all loaded models.
        cols = models[0].feature_name()
        predicts = [model.predict(df[cols]) for model in models]
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)

        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        df_test_prev = pd.concat([df_test_prev, df[cols + ["user_id", "tags"]]])
        if i < 5:
            # Dump the buffer of the first iterations for inspection.
            df_test_prev.to_csv(f"{i}.csv")
def make_feature_factory_manager(split_num, model_id=None):
    """Build the FeatureFactoryManager for the full feature set.

    The registry maps a grouping key (a column name, or a tuple of column
    names) to a dict of ``{factory_name: factory}``. Entries are inserted in
    a fixed order.

    NOTE(review): insertion order presumably matters to
    FeatureFactoryManager (several aggregators read columns such as
    ``target_enc_content_id`` or ``shiftdiff_timestamp_by_user_id``) -
    confirm before reordering anything here.
    NOTE(review): ``is_debug`` is not a parameter; it is read from the
    enclosing/module scope.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``PreviousAnswer2``,
            ``QuestionLectureTableEncoder2`` and ``FeatureFactoryManager``.

    Returns:
        The configured ``FeatureFactoryManager``; feature loading and saving
        are enabled only when ``is_debug`` is falsy.
    """
    logger = get_logger()

    def as_column(key):
        # Single columns are passed as str, compound keys as a fresh list.
        return key if isinstance(key, str) else list(key)

    factories = {}
    for key in [
            "user_id",
            "content_id",
            "part",
            ("user_id", "prior_question_had_explanation"),
            ("user_id", "part"),
            ("content_id", "prior_question_had_explanation"),
    ]:
        # Only the plain user/content encoders are partially fitted.
        partial = key in ("content_id", "user_id")
        factories[key] = {
            "CountEncoder": CountEncoder(column=as_column(key),
                                         is_partial_fit=partial),
            "TargetEncoder": TargetEncoder(column=as_column(key),
                                           is_partial_fit=partial),
        }

    by_user = factories["user_id"]
    by_content = factories["content_id"]

    by_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    for key in ["user_id", "content_id"]:
        factories[key][
            f"MeanAggregatorPriorQuestionElapsedTimeby{key}"] = MeanAggregator(
                column=key,
                agg_column="prior_question_elapsed_time",
                remove_now=True)
    by_content["MeanAggregatorShiftDiffTimestamp"] = MeanAggregator(
        column="content_id",
        agg_column="shiftdiff_timestamp_by_user_id",
        remove_now=False)
    for agg_column in ["target_enc_user_id", "prior_question_elapsed_time"]:
        by_user[
            f"MeanAggregatorContentIdUserAnswer{agg_column}"] = MeanAggregator2(
                column=["content_id", "user_answer"],
                agg_column=agg_column,
                remove_now=True)

    by_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    by_content["ContentLevelEncoder2UserId"] = ContentLevelEncoder(
        vs_column="user_id", is_partial_fit=True)
    by_user["MeanAggregatorContentLevel"] = MeanAggregator(
        column="user_id",
        agg_column="content_level_user_id",
        remove_now=False)
    # The user_id CountEncoder(is_partial_fit=True) is already registered by
    # the first loop, so it is not constructed a second time here.
    by_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    factories["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    for pair in [
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
            ("prior_question_had_explanation", "user_count_bin"),
    ]:
        factories[pair] = {"TargetEncoder": TargetEncoder(column=list(pair))}

    by_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    by_user["FirstColumnEncoderContentId"] = FirstColumnEncoder(
        agg_column="content_id", astype="int16", is_partial_fit=True)
    by_user["FirstColumnEncoderPart"] = FirstColumnEncoder(
        agg_column="part", astype="int8", is_partial_fit=True)

    # One MeanAggregator over target_enc_content_id per grouping key; the
    # factory name embeds the key exactly as written (tuples keep their
    # tuple repr).
    for key in [
            "user_id",
            "user_count_bin",
            "first_column_content_id",
            "first_column_part",
            ("user_id", "part"),
    ]:
        bucket = factories.setdefault(key, {})
        bucket[f"MeanAggregatorTargetEncContentIdBy{key}"] = MeanAggregator(
            column=as_column(key),
            agg_column="target_enc_content_id",
            remove_now=False)

    # The factory name below still says "TargetEncContentId" although the
    # aggregated column is target_enc_user_id; it is kept verbatim because
    # it is a runtime key. The bucket already exists from the first loop.
    content_explanation = ("content_id", "prior_question_had_explanation")
    factories[content_explanation][
        f"MeanAggregatorTargetEncContentIdBy{content_explanation}"] = MeanAggregator(
            column=list(content_explanation),
            agg_column="target_enc_user_id",
            remove_now=False)

    factories["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    factories[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder":
        TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    by_user["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                 column="content_id",
                                                 is_debug=is_debug,
                                                 model_id=model_id,
                                                 n=1000)
    by_user["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)
    factories["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    by_user["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100)
    factories["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)