def inference(TARGET, FEATURES, prior_question_elapsed_time_mean,
              features_dicts, train_mean_dict, scaler):
    """Serve Riiid test-time predictions with the saved Keras model.

    Streams batches from the competition API; before each prediction the
    previous batch's revealed labels are folded back into the mutable
    per-user/per-question state dictionaries.

    Args:
        TARGET: name of the label column predictions are written into.
        FEATURES: ordered list of feature columns fed to the network.
        prior_question_elapsed_time_mean: train mean used to impute missing
            ``prior_question_elapsed_time``.
        features_dicts: dict of mutable state dicts, updated in place.
        train_mean_dict: per-feature train means for NaN imputation.
        scaler: fitted scaler applied to the feature matrix.
    """
    import ast  # local import: safe parsing of the stringified label lists

    net = keras.models.load_model('model.h5', compile=False)
    # Get feature dict
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    # answered_correctly_uq = features_dicts["answered_correctly_uq"]
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_up_count = features_dicts['answered_correctly_up_count']
    answered_correctly_up_sum = features_dicts['answered_correctly_up_sum']
    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    questions_df = pd.read_pickle('questions_df.pkl')
    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The API delivers labels as a stringified list (e.g. "[0, 1]").
            # ast.literal_eval parses it without eval()'s arbitrary-code risk.
            previous_test_df[TARGET] = ast.literal_eval(
                test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, answered_correctly_u_sum,
                            answered_correctly_u_count,
                            answered_correctly_q_sum,
                            answered_correctly_q_count, timestamp_u_incorrect,
                            answered_correctly_up_count,
                            answered_correctly_up_sum)
        test_df['prior_question_had_explanation'] = (
            test_df.prior_question_had_explanation.fillna(False).astype('int8'))
        # Plain assignment instead of chained inplace fillna: same result,
        # avoids pandas SettingWithCopy pitfalls.
        test_df['prior_question_elapsed_time'] = test_df[
            'prior_question_elapsed_time'].fillna(
                prior_question_elapsed_time_mean)
        question_cols = [
            'question_id', 'part', 'tag_1', 'answered_correctly_q_mean',
            'answered_correctly_q_std', 'answered_correctly_p_mean',
            'answered_correctly_p_std', 'answered_correctly_b_mean',
            'answered_correctly_b_std', 'answered_correctly_tag_1_mean',
            'answered_correctly_tag_1_std'
        ]
        test_df = pd.merge(test_df,
                           questions_df[question_cols],
                           left_on='content_id',
                           right_on='question_id',
                           how='left')
        # Keep the full batch (incl. lectures) so next iteration can attach
        # the revealed labels positionally.
        previous_test_df = test_df.copy()
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(
            drop=True)
        test_df[TARGET] = 0
        test_df = add_features(test_df, answered_correctly_u_count,
                               answered_correctly_u_sum, elapsed_time_u_sum,
                               explanation_u_sum, timestamp_u,
                               timestamp_u_incorrect,
                               answered_correctly_q_count,
                               answered_correctly_q_sum, elapsed_time_q_sum,
                               explanation_q_sum, answered_correctly_up_count,
                               answered_correctly_up_sum, update=False)
        # Impute per-feature train means; iterate FEATURES directly instead of
        # materializing test_df[FEATURES] just to read its columns.
        for col in FEATURES:
            test_df[col] = test_df[col].fillna(train_mean_dict[col])
        X_test = test_df[FEATURES].values
        X_test = scaler.transform(X_test)
        test_df[TARGET] = net.predict(X_test)
        set_predict(test_df[['row_id', TARGET]])
    print('Job Done')
def run(model_dir, verbose=False):
    """Run the Riiid test-time loop with a KurupicalModel.

    Each batch is predicted, and the previous batch (whose labels the API
    has now revealed) is folded back into the model state first.
    """
    model = KurupicalModel(model_dir=model_dir, verbose=verbose)
    # environment
    env = riiideducation.make_env()
    logger = get_logger()
    prev_batch = None
    for batch, _sample in env.iter_test():
        if verbose:
            logger.info("inference!")
        # First iteration has no previous batch to learn from.
        if prev_batch is not None:
            model.update(prev_batch, batch)
        preds, prev_batch = model.predict(batch)
        # Only question rows (content_type_id == 0) are scored.
        submission = batch[batch["content_type_id"] == 0][["row_id"]]
        submission["answered_correctly"] = preds
        env.predict(submission)
def run(debug, model_dir, kaggle=False):
    """Riiid inference loop that builds its feature factories from scratch.

    Fits count/target encoders and aggregators on the training split files,
    then streams test batches, periodically refitting the factories from
    labels revealed by the API, and predicts with the mean of a pickled
    model ensemble.

    Args:
        debug: if truthy, truncate training data and refit after every batch.
        model_dir: directory holding pickled ``*model*.pickle`` ensemble members.
        kaggle: switch between Kaggle-kernel and local input paths.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # data preprocessing
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    # Count + target encoders for single columns and column pairs.
    for column in [
            "content_id", "user_id", "content_type_id",
            "prior_question_had_explanation", "tags1", "tags2", "tags3",
            "tags4", "tags5", "tags6", ("user_id", "content_type_id"),
            ("user_id", "prior_question_had_explanation")
    ]:
        is_partial_fit = column == "content_id"
        # NOTE(review): unused below; also a substring test for str columns
        # vs. a membership test for tuples — confirm intent before using.
        is_onebyone = "content_id" in column
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column),
                "TargetEncoder": TargetEncoder(column=column,
                                               is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column)),
                "TargetEncoder": TargetEncoder(column=list(column),
                                               is_partial_fit=is_partial_fit)
            }
    # Count-only encoders.
    for column in [
            "part", ("user_id", "tag"), ("user_id", "part"),
            ("content_type_id", "part"), ("user_id", "content_id")
    ]:
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column))
            }
    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoder"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)
    # Warm-up: fit the factories on every training split file.
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        # -1 marks lecture rows with no label.
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        # Attach question metadata to question rows and lecture metadata to
        # lecture rows, then restore chronological per-user order.
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    df_test_prev1 = pd.DataFrame()  # NOTE(review): never used afterwards
    answered_correctlies = []
    user_answers = []
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update state with the previous batch's now-revealed labels.
        if len(df_test_prev) > 0:  # skipped on the very first iteration
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # Labels arrive as stringified lists, e.g. "[0, 1, -1]".
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])
            if debug:
                update_record = 1
            else:
                update_record = 150
            # Refit the factories only once enough rows have accumulated.
            if len(df_test_prev) > update_record:
                df_test_prev["answered_correctly"] = answered_correctlies
                df_test_prev["user_answer"] = user_answers
                # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
                df_test_prev = df_test_prev[
                    df_test_prev["answered_correctly"] != -1]
                df_test_prev["answered_correctly"] = df_test_prev[
                    "answered_correctly"].replace(-1, np.nan)
                df_test_prev["prior_question_had_explanation"] = df_test_prev[
                    "prior_question_had_explanation"].fillna(-1).astype("int8")
                feature_factory_manager.fit(df_test_prev)
                df_test_prev = pd.DataFrame()
                answered_correctlies = []
                user_answers = []
        # Fetch & featurize the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        df_test = pd.concat([w_df1, w_df2])
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info(f"other... ")
        # predict: mean of the ensemble members' outputs
        predicts = []
        cols = models[0].feature_name()
        for model in models:
            predicts.append(model.predict(df[cols]))
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
        # code requires pandas < 2 (pd.concat is the replacement).
        df_test_prev = df_test_prev.append(df[cols + ["user_id", "tags"]])
        if debug:
            df_test_prev.to_csv(f"{i}.csv")
def run(debug, model_dir, kaggle=False):
    """Riiid inference loop using a pre-fitted, pickled FeatureFactoryManager.

    Unlike the sibling runner that rebuilds its factories from the training
    split, this variant restores them from ``feature_factory_manager.pickle``
    and only refits incrementally from labels revealed by the API.

    Args:
        debug: if truthy, refit after every batch (threshold lowered to 1).
        model_dir: directory with pickled models and the factory manager.
        kaggle: switch between Kaggle-kernel and local input paths.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # load feature_factory_manager
    logger = get_logger()
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    # Re-attach live loggers, presumably because they do not survive
    # pickling — TODO confirm against FeatureFactoryManager.
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = logger
    feature_factory_manager.logger = logger
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    answered_correctlies = []
    user_answers = []
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update state with the previous batch's now-revealed labels.
        if len(df_test_prev) > 0:  # skipped on the very first iteration
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # Labels arrive as stringified lists, e.g. "[0, 1, -1]".
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])
            if debug:
                update_record = 1
            else:
                update_record = 50
            # Refit only once enough labelled rows have accumulated.
            if len(df_test_prev) > update_record:
                df_test_prev["answered_correctly"] = answered_correctlies
                df_test_prev["user_answer"] = user_answers
                # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
                df_test_prev = df_test_prev[
                    df_test_prev["answered_correctly"] != -1]
                df_test_prev["answered_correctly"] = df_test_prev[
                    "answered_correctly"].replace(-1, np.nan)
                df_test_prev["prior_question_had_explanation"] = df_test_prev[
                    "prior_question_had_explanation"].fillna(-1).astype("int8")
                feature_factory_manager.fit(df_test_prev)
                df_test_prev = pd.DataFrame()
                answered_correctlies = []
                user_answers = []
        # Fetch & featurize the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index() restores the API's original row order after the
        # per-user chronological sort.
        df_test = pd.concat([w_df1, w_df2]).sort_values(
            ["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info(f"other... ")
        # predict: mean of the ensemble members' outputs
        predicts = []
        cols = models[0].feature_name()
        for model in models:
            predicts.append(model.predict(df[cols]))
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
        # code requires pandas < 2 (pd.concat is the replacement).
        df_test_prev = df_test_prev.append(df[cols + ["user_id", "tags"]])
        if i < 5:
            df_test_prev.to_csv(f"{i}.csv")
# Kaggle submission script: serve predictions with a pre-trained SAINT model.
import riiideducation

# Create the competition environment and its test-batch iterator up front.
env = riiideducation.make_env()
iter_test = env.iter_test()

import sys
import logging

# Make the bundled model package (shipped as a Kaggle dataset) importable.
PATH = '/kaggle/input/riiid-saint-model'
sys.path.append(PATH)
from riiid.utils import configure_console_logging, check_versions
from riiid.saint.model import SaintModel

configure_console_logging()
check_versions()
logging.info('Load model')
MODEL_ID = 'saint_20210101_132425'
# Restore the serialized pipeline, then pull the network weights from GCS.
model: SaintModel = SaintModel.load(PATH, MODEL_ID)
model.load_model_from_path('gs://riiid-models/{}_model'.format(MODEL_ID))
for test, _ in iter_test:
    # Fold the incoming batch (and its prior-batch labels) into model state,
    # then predict and submit.
    test = model.update(test)
    _, predictions = model.predict(test)
    env.predict(predictions)
def run(debug, model_dir, update_record, kaggle=False):
    """Riiid inference loop for the SAKT transformer.

    Loads transformer weights and the pickled transformer feature-factory
    state from *model_dir*, streams test batches from the competition API,
    and periodically refits the factory with revealed labels.

    Args:
        debug: if truthy, refit after every batch (update_record forced to 1).
        model_dir: directory with ``transformer_param.json``,
            ``*transformer*.pth`` weights and the pickled factory manager.
        update_record: accumulated-row threshold that triggers a refit.
        kaggle: unused in this variant; kept for signature parity with the
            sibling runners.
    """
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # params
    with open(f"{model_dir}/transformer_param.json", "r") as f:
        params = json.load(f)
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*transformer*.pth"):
        model = SAKTModel(13782,
                          embed_dim=params["embed_dim"],
                          max_seq=params["max_seq"])
        # Fix: a stray extra torch.load(model_path) used to run here with its
        # result discarded — pure wasted I/O.
        model.load_state_dict(torch.load(model_path))
        model.to(device)
        # Fix: run inference in eval mode (disables dropout etc.), as the
        # sibling transformer runner in this file already does.
        model.eval()
        models.append(model)
    # load feature_factory_manager
    logger = get_logger()
    ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
    with open(ff_manager_path_for_transformer, "rb") as f:
        feature_factory_manager_for_transformer = pickle.load(f)
    feature_factory_manager_for_transformer.logger = logger
    iter_test = env.iter_test()
    df_test_prev = []
    df_test_prev_rows = 0
    answered_correctlies = []
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Update state with the previous batch's now-revealed labels.
        if df_test_prev_rows > 0:  # skipped on the very first iteration
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # Labels arrive as stringified lists, e.g. "[0, 1, -1]".
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])
            if debug:
                update_record = 1
            # Refit only once enough labelled rows have accumulated.
            if df_test_prev_rows > update_record:
                logger.info("------ fitting ------")
                logger.info("concat df")
                df_test_prev = pd.concat(df_test_prev)
                df_test_prev["answered_correctly"] = answered_correctlies
                df_test_prev["user_answer"] = user_answers
                # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
                df_test_prev = df_test_prev[
                    df_test_prev["answered_correctly"] != -1]
                df_test_prev["answered_correctly"] = df_test_prev[
                    "answered_correctly"].replace(-1, np.nan)
                df_test_prev["prior_question_had_explanation"] = df_test_prev[
                    "prior_question_had_explanation"].fillna(-1).astype("int8")
                logger.info("fit data")
                feature_factory_manager_for_transformer.fit(df_test_prev)
                df_test_prev = []
                df_test_prev_rows = 0
                answered_correctlies = []
                user_answers = []
        # Fetch & featurize the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index() restores the API's original row order afterwards.
        df_test = pd.concat([w_df1, w_df2]).sort_values(
            ["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = df_test
        group = feature_factory_manager_for_transformer.partial_predict(
            df_test[df_test["content_type_id"] == 0])
        logger.info(f"------ predict ------")
        dataset_val = SAKTDataset(group,
                                  13782,
                                  predict_mode=True,
                                  max_seq=params["max_seq"])
        dataloader_val = DataLoader(dataset_val,
                                    batch_size=1024,
                                    shuffle=False,
                                    num_workers=1)
        predicts = []
        # NOTE(review): only `model` — the last network loaded above — is used
        # here although several may have been collected in `models`; confirm
        # whether an ensemble average was intended.
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            for d in dataloader_val:
                x = d[0].to(device).long()
                target_id = d[1].to(device).long()
                part = d[2].to(device).long()
                output, atten_weight = model(x, target_id, part)
                # Sigmoid of the last sequence position = P(correct).
                predicts.extend(torch.nn.Sigmoid()(
                    output[:, -1]).view(-1).data.cpu().numpy().tolist())
        logger.info("------ other ------")
        df_sample_prediction = df[df["content_type_id"] == 0][["row_id"]]
        df_sample_prediction["answered_correctly"] = predicts
        env.predict(df_sample_prediction)
        df_test_prev.append(df)
        df_test_prev_rows += len(df)
        if i < 5:
            df.to_csv(f"{i}.csv")
def run(debug, model_dir, kaggle=False, rewrite=False):
    """Riiid inference that caches per-user history as files under ../work_csv.

    With ``rewrite`` the training split is re-transformed and stored one
    directory per user; while serving, each labelled batch is appended to the
    owning users' directories and re-read to build features for new batches.

    Args:
        debug: truncate training data to 1000 rows when rewriting the cache.
        model_dir: directory with pickled ``*model*.pickle`` ensemble members.
        kaggle: switch between Kaggle-kernel and local input paths.
        rewrite: wipe and rebuild the per-user cache before serving.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.feather"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.feather"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # data preprocessing
    data_dir = "../work_csv"
    if rewrite:
        if os.path.isdir(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir, exist_ok=True)
        for model_id, fname in enumerate(glob.glob(files_dir)):
            logger.info(f"loading... {fname}")
            df = pd.read_pickle(fname)
            if debug:
                df = df.head(1000)
            df = transform(df)
            for user_id, w_df in tqdm.tqdm(df.groupby("user_id")):
                os.makedirs(f"{data_dir}/{user_id}/", exist_ok=True)
                w_df.to_pickle(f"{data_dir}/{user_id}/original.feather")
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Persist the previous batch now that its labels are revealed.
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # Labels arrive as stringified lists, e.g. "[0, 1, -1]".
            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ]
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            for user_id, df_prev in df_test_prev.groupby("user_id"):
                os.makedirs(f"{data_dir}/{user_id}/", exist_ok=True)
                # Fix: was len(f"{data_dir}/{user_id}") — the length of the
                # path STRING — so every batch overwrote the same file.
                # Counting existing files yields a fresh, increasing id.
                data_id = len(glob.glob(f"{data_dir}/{user_id}/*"))
                # Fix: was saved with a .pickle suffix, but the reader below
                # globs *.feather, so saved history was never read back.
                df_prev.to_pickle(f"{data_dir}/{user_id}/{data_id}.feather")
        # Featurize the current batch against each user's stored history.
        dfs = []
        df_nows = []
        for user_id, df_now in df_test.groupby("user_id"):
            # logger.info(f"[time: {int(time.time() - t)}dataload")
            fnames = glob.glob(f"{data_dir}/{user_id}/*.feather")
            if len(fnames) > 0:
                read_dfs = [pd.read_pickle(x) for x in fnames]
                df = pd.concat(read_dfs + [df_now]).reset_index(drop=True)
            else:
                df = df_now[:]
            # Placeholder labels for the not-yet-answered rows.
            df["user_answer"] = -1
            df["answered_correctly"] = -1
            df = df.astype(data_types_dict)
            df = transform(df)
            cols = models[0].feature_name()
            for col in cols:
                if col not in df.columns:
                    df[col] = -99999
            # Keep only the freshly arrived rows for prediction.
            dfs.append(df[cols].iloc[-len(df_now):].drop("row_id",
                                                         axis=1,
                                                         errors="ignore"))
            df_nows.append(df_now)
        # predict
        df_test_prev = pd.concat(dfs)
        logger.info(f"[time: {int(time.time() - t)}model")
        predicts = []
        for model in models:
            predicts.append(model.predict(df_test_prev))
        df_nows = pd.concat(df_nows)
        # Ensemble mean over models.
        df_nows["answered_correctly"] = np.array(predicts).transpose().mean(
            axis=1)
        df_sample_prediction = pd.merge(
            df_sample_prediction[["row_id"]],
            df_nows[["row_id", "answered_correctly"]],
            how="inner")
        env.predict(df_sample_prediction)
def run(debug, model_dir, update_record, kaggle=False):
    """Local simulation of the Riiid serving loop (no API submission).

    Replays the first 1M rows of ``train_0.pickle`` in chunks of 20 rows to
    exercise the feature-factory refit / partial_predict cycle offline.

    Args:
        debug: if truthy, refit after every chunk (update_record forced to 1).
        model_dir: directory with pickled models and the factory manager.
        update_record: accumulated-row threshold that triggers a refit.
        kaggle: selects the Kaggle input path for the split files.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # load feature_factory_manager
    logger = get_logger()
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    # Re-attach live loggers after unpickling.
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = logger
    feature_factory_manager.logger = logger
    iter_test = env.iter_test()
    df_test_prev = []
    df_test_prev_rows = 0
    answered_correctlies = []
    user_answers = []
    i = 0
    # Replay source: first 1M training rows, chunked below.
    df_all = pd.read_pickle(
        f"{os.path.dirname(files_dir)}/train_0.pickle").head(1000000)
    # .drop(
    # ["question_id", "bundle_id", "correct_answer", "part", "lecture_id", "tag", "part", "type_of"], axis=1
    # )
    for idx in range(len(df_all) // 20):
        # NOTE: i mirrors idx here (both advance by 1 per chunk).
        df_test = df_all.iloc[i * 20:(i + 1) * 20]
        i += 1
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Previous-batch label handling from the API variant, disabled here
        # because the replayed training rows already carry their labels.
        """
        if df_test_prev_rows > 0:  # skipped on the very first iteration
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
            user_answers.extend([int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
        """
        if debug:
            update_record = 1
        # Refit only once enough rows have accumulated.
        if df_test_prev_rows > update_record:
            logger.info("------ fitting ------")
            logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            # df_test_prev["answered_correctly"] = answered_correctlies
            # df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            # NOTE(review): the appended frames carry PREDICTED
            # answered_correctly values (see bottom of loop), so this refit
            # consumes predictions, not ground truth — confirm intent.
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")
            logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)
            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & featurize the current chunk.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index() restores the chunk's original row order afterwards.
        df_test = pd.concat([w_df1, w_df2]).sort_values(
            ["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info(f"------ predict ------")
        # predict: mean of the ensemble members' outputs
        predicts = []
        cols = models[0].feature_name()
        w_df = df[cols]
        for model in models:
            predicts.append(model.predict(w_df))
        logger.info("------ other ------")
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        # NOTE(review): built but never submitted — this runner has no
        # env.predict call (offline simulation).
        df_sample_prediction = df[df["content_type_id"] == 0][[
            "answered_correctly"
        ]]
        df_test_prev.append(df[cols + ["user_id", "tags", "answered_correctly"]])
        df_test_prev_rows += len(df)
        if i < 5:
            df.to_csv(f"{i}.csv")
def inference(TARGET, FEATURES, model, prior_question_elapsed_time_mean,
              features_dicts, lectures_df, q_taglist_df):
    """Serve Riiid test-time predictions with a fitted model plus lecture feats.

    Streams batches from the competition API; before each prediction the
    previous batch's revealed labels are folded back into the mutable state
    dictionaries, and lecture-based features are attached per batch.

    Args:
        TARGET: name of the label column predictions are written into.
        FEATURES: ordered list of feature columns fed to ``model.predict``.
        model: fitted model exposing ``predict``.
        prior_question_elapsed_time_mean: train mean used to impute missing
            ``prior_question_elapsed_time``.
        features_dicts: dict of mutable state dicts, updated in place.
        lectures_df: lecture metadata for ``add_lectures_feats``.
        q_taglist_df: question tag lists for ``add_lectures_feats``.
    """
    import ast  # local import: safe parsing of the stringified label lists

    # Get feature dict
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_up_count = features_dicts['answered_correctly_up_count']
    answered_correctly_up_sum = features_dicts['answered_correctly_up_sum']
    answered_correctly_tag1_count = features_dicts[
        'answered_correctly_tag1_count']
    answered_correctly_tag1_sum = features_dicts['answered_correctly_tag1_sum']
    lect_dict = features_dicts['lect_dict']
    tag_list_dict = features_dicts['tag_list_dict']
    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    questions_df = pd.read_pickle('questions_df.pkl')
    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The API delivers labels as a stringified list (e.g. "[0, 1]").
            # ast.literal_eval parses it without eval()'s arbitrary-code risk.
            previous_test_df[TARGET] = ast.literal_eval(
                test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, answered_correctly_u_sum,
                            answered_correctly_u_count,
                            answered_correctly_q_count, timestamp_u_incorrect,
                            answered_correctly_up_count,
                            answered_correctly_up_sum,
                            answered_correctly_tag1_count,
                            answered_correctly_tag1_sum)
        test_df['prior_question_had_explanation'] = (
            test_df.prior_question_had_explanation.fillna(False).astype('int8'))
        # Plain assignment instead of chained inplace fillna: same result,
        # avoids pandas SettingWithCopy pitfalls.
        test_df['prior_question_elapsed_time'] = test_df[
            'prior_question_elapsed_time'].fillna(
                prior_question_elapsed_time_mean)
        question_cols = [
            'question_id', 'part', 'tag_1', 'answered_correctly_q_mean',
            'answered_correctly_q_std', 'answered_correctly_p_mean',
            'answered_correctly_p_std', 'answered_correctly_b_mean',
            'answered_correctly_b_std', 'answered_correctly_tag_1_mean',
            'answered_correctly_tag_1_std'
        ]
        test_df = pd.merge(test_df,
                           questions_df[question_cols],
                           left_on='content_id',
                           right_on='question_id',
                           how='left')
        # Keep the full batch (incl. lectures) so next iteration can attach
        # the revealed labels positionally.
        previous_test_df = test_df.copy()
        test_df = add_lectures_feats(test_df, lect_dict, tag_list_dict,
                                     lectures_df, q_taglist_df)
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(
            drop=True)
        test_df[TARGET] = 0
        test_df = add_features(test_df, answered_correctly_u_count,
                               answered_correctly_u_sum, elapsed_time_u_sum,
                               explanation_u_sum, timestamp_u,
                               timestamp_u_incorrect,
                               answered_correctly_q_count, elapsed_time_q_sum,
                               explanation_q_sum, answered_correctly_up_count,
                               answered_correctly_up_sum,
                               answered_correctly_tag1_count,
                               answered_correctly_tag1_sum, update=False)
        test_df[TARGET] = model.predict(test_df[FEATURES])
        set_predict(test_df[['row_id', TARGET]])
    print('Job Done')
def run(debug, model_dir, kaggle=False):
    """Riiid inference loop with a small, hand-built set of feature factories.

    Simpler sibling of the other runners: four single-column encoders, three
    aggregators and two pair encoders, fitted on the split files, then
    refitted from each revealed batch (no accumulation threshold).

    Args:
        debug: if truthy, truncate each training file to 1000 rows.
        model_dir: directory holding pickled ``*model*.pickle`` ensemble members.
        kaggle: switch between Kaggle-kernel and local input paths.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # data preprocessing
    logger = get_logger()
    feature_factory_dict = {}
    # Count + target encoders for the base columns.
    for column in [
            "user_id", "content_id", "content_type_id",
            "prior_question_had_explanation"
    ]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=column),
            "TargetEncoder": TargetEncoder(column=column)
        }
    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    # Pairwise encoders.
    for column in [("user_id", "content_type_id"),
                   ("user_id", "prior_question_had_explanation")]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=list(column)),
            "TargetEncoder": TargetEncoder(column=list(column))
        }
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)
    # Warm-up: fit the factories on every training split file.
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        if debug:
            df = df.head(1000)
        feature_factory_manager.fit(df)
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update state with the previous batch's now-revealed labels.
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # Labels arrive as stringified lists, e.g. "[0, 1, -1]".
            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ]
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            feature_factory_manager.fit(df_test_prev)
        # Fetch & featurize the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        logger.info(f"other... ")
        cols = models[0].feature_name()
        # Backfill any feature column the factories did not produce.
        for col in cols:
            if col not in df.columns:
                df[col] = -99999
        # predict: mean of the ensemble members' outputs
        predicts = []
        cols = models[0].feature_name()  # NOTE(review): duplicate of above
        for model in models:
            predicts.append(model.predict(df[cols]))
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # Carry the featurized batch forward for the next refit.
        df_test_prev = df[cols + ["user_id"]]
        df_test_prev.to_csv(f"{i}.csv")
def run(debug, model_dir, update_record, kaggle=False):
    """Serve Riiid predictions with a SAKT transformer, incrementally
    re-fitting two pickled feature-factory managers once more than
    ``update_record`` rows of labeled history have accumulated.

    NOTE(review): ``kaggle`` is accepted but never read in this variant.
    Relies on module-level ``device`` (torch device) — not defined here.
    """
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv",
                              dtype={"bundle_id": "int32",
                                     "question_id": "int32",
                                     "correct_answer": "int8",
                                     "part": "int8"})
    df_lecture = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv",
                             dtype={"lecture_id": "int32",
                                    "tag": "int16",
                                    "part": "int8"})
    # params
    with open(f"{model_dir}/transformer_param.json", "r") as f:
        params = json.load(f)

    # model loading
    # NOTE(review): 13938 here vs 13939 for SAKTDataset below — presumably
    # vocab size vs. padding index; confirm against the model definition.
    model_path = f"{model_dir}/transformers.pth"
    model = SAKTModel(13938, embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"], cont_emb=8)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    # load feature_factory_manager (two managers: one feeds the transformer
    # sequence features, one the tabular transform).
    logger = get_logger()
    ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
    with open(ff_manager_path_for_transformer, "rb") as f:
        feature_factory_manager_for_transformer = pickle.load(f)
    feature_factory_manager_for_transformer.logger = logger
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    feature_factory_manager.logger = logger

    iter_test = env.iter_test()
    df_test_prev = []          # list of past batches awaiting a fit
    df_test_prev_rows = 0      # total rows buffered in df_test_prev
    answered_correctlies = []  # labels for the buffered rows
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Update previous data: labels for the prior batch arrive in the
        # current batch's prior_group_* columns as stringified lists.
        if df_test_prev_rows > 0:  # if-guard so the first iteration is skipped
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
            user_answers.extend([int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
        if debug:
            update_record = 1  # force a fit on every iteration when debugging
        # Re-fit only after enough labeled rows have accumulated.
        if df_test_prev_rows > update_record:
            logger.info("------ fitting ------")
            logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            # df_test_prev = df_test_prev[df_test_prev["answered_correctly"] != -1]
            # Lecture rows carry -1; masked to NaN for the tabular fit and
            # restored to -1 for the transformer fit.
            df_test_prev["answered_correctly"] = df_test_prev["answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev["prior_question_had_explanation"].fillna(-1).astype("int8")
            logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)
            df_test_prev["answered_correctly"] = df_test_prev["answered_correctly"].replace(np.nan, -1)
            feature_factory_manager_for_transformer.fit(df_test_prev)
            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & compute the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question, how="left",
                         left_on="content_id", right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture, how="left",
                         left_on="content_id", right_on="lecture_id")
        # sort_index after sort_values restores the original row order.
        df_test = pd.concat([w_df1, w_df2]).sort_values(["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        # Cap task_container_id at 300 for the embedding table.
        df_test["task_container_id_bin300"] = [x if x < 300 else 300 for x in df_test["task_container_id"].values]
        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test["prior_question_had_explanation"].astype("float16").fillna(-1).astype("int8")
        df_test = feature_factory_manager.partial_predict(df_test)
        # Transformer features are built from question rows only.
        group = feature_factory_manager_for_transformer.partial_predict(df_test[df_test["content_type_id"] == 0])
        logger.info(f"------ predict ------")
        dataset_val = SAKTDataset(group, 13939,
                                  predict_mode=True, max_seq=params["max_seq"])
        dataloader_val = DataLoader(dataset_val, batch_size=1024, shuffle=False, num_workers=1)
        predicts = []
        with torch.no_grad():
            for item in dataloader_val:
                x = item["x"].to(device).long()
                target_id = item["target_id"].to(device).long()
                part = item["part"].to(device).long()
                label = item["label"].to(device).float()  # NOTE(review): unused below
                elapsed_time = item["elapsed_time"].to(device).long()
                duration_previous_content = item["duration_previous_content"].to(device).long()
                prior_question_had_explanation = item["prior_q"].to(device).long()
                user_answer = item["user_answer"].to(device).long()
                rate_diff = item["rate_diff"].to(device).float()
                container_id = item["container_id"].to(device).long()
                prev_ans_idx = item["previous_answer_index_content_id"].to(device).long()
                prev_answer_content_id = item["previous_answer_content_id"].to(device).long()
                output = model(x, target_id, part, elapsed_time,
                               duration_previous_content,
                               prior_question_had_explanation, user_answer,
                               rate_diff, container_id, prev_ans_idx,
                               prev_answer_content_id)
                # Last sequence position holds the prediction for the new row.
                predicts.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
        logger.info("------ other ------")
        df_sample_prediction = df_test[df_test["content_type_id"] == 0][["row_id"]]
        df_sample_prediction["answered_correctly"] = predicts
        env.predict(df_sample_prediction)
        df_test_prev.append(df_test)
        df_test_prev_rows += len(df_test)
        if i < 5:
            df_test.to_csv(f"{i}.csv")  # debug dump of the first batches
        if i == 3:
            # After warm-up, swap in a no-op logger to cut I/O overhead.
            class EmptyLogger:
                def __init__(self):
                    pass

                def info(self, s):
                    pass
            logger = EmptyLogger()
def run(debug, model_dir, kaggle=False):
    """Build the full feature-factory configuration, fit it over all
    training splits, and pickle the fitted manager to
    ``feature_factory_manager.pickle``.  No predictions are made here.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    logger = get_logger()
    # environment
    # NOTE(review): env is created but never used in this fit-only variant.
    env = riiideducation.make_env()
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    # NOTE(review): models are loaded but never used below — dead work?
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # data preprocessing
    logger = get_logger()  # NOTE(review): duplicate of the call above
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    # Count/target encoders for single columns and column pairs; only
    # content_id/user_id encoders support partial (incremental) fitting.
    for column in [
            "content_id", "user_id", "part", "prior_question_had_explanation",
            "tags1", "tags2", ("user_id", "prior_question_had_explanation"),
            ("user_id", "part")
    ]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder":
                TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=list(column), is_partial_fit=is_partial_fit),
                "TargetEncoder":
                TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"][
        "ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id",
                                                        column="timestamp",
                                                        is_partial_fit=True)
    feature_factory_dict["user_id"][
        "ShiftDiffEncoderContentId"] = ShiftDiffEncoder(groupby="user_id",
                                                        column="content_id")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][
            f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
                column=column,
                agg_column="prior_question_elapsed_time",
                remove_now=True)
    feature_factory_dict["user_id"][
        "UserLevelEncoder2ContentId"] = UserLevelEncoder2(
            vs_column="content_id")
    feature_factory_dict["user_id"][
        "UserCountBinningEncoder"] = UserCountBinningEncoder(
            is_partial_fit=True)
    feature_factory_dict["user_count_bin"] = {}
    feature_factory_dict["user_count_bin"]["CountEncoder"] = CountEncoder(
        column="user_count_bin")
    feature_factory_dict["user_count_bin"]["TargetEncoder"] = TargetEncoder(
        column="user_count_bin")
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["user_id", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["content_id", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict["user_id"][
        "CategoryLevelEncoderPart"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="part",
            categories=[1, 2, 3, 4, 5, 6, 7])
    feature_factory_dict["user_count_bin"]["CategoryLevelEncoderUserCountBin"] = \
        CategoryLevelEncoder(groupby_column="user_id",
                             agg_column="user_count_bin",
                             categories=[0, 1, 2, 3, 4, 5])
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)
    # Fit on each split: drop lecture rows (-1 labels), merge question and
    # lecture metadata, then full-fit the manager.
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df = df[df["answered_correctly"] != -1]
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        # df = feature_factory_manager.feature_factory_dict["content_id"]["TargetEncoder"].all_predict(df)
        feature_factory_manager.fit(df, is_first_fit=True)
    # Strip loggers before pickling (they are not picklable / not needed).
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = None
    feature_factory_manager.logger = None
    with open(f"feature_factory_manager.pickle", "wb") as f:
        pickle.dump(feature_factory_manager, f)
    return
def run(debug, model_dir, update_record, kaggle=False):
    """Serve Riiid predictions with a 50/50 blend of pickled LightGBM models
    and CatBoost models, incrementally re-fitting the pickled feature-factory
    manager once more than ``update_record`` labeled rows have accumulated.

    NOTE(review): ``files_dir`` and ``debug`` are computed/accepted but never
    used in this variant.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models_lgbm = []
    for model_path in glob.glob(f"{model_dir}/*lgbm*.pickle"):
        with open(model_path, "rb") as f:
            models_lgbm.append(pickle.load(f))
    models_cat = []
    # NOTE(review): this params dict is never used — the CatBoost models are
    # loaded fully-trained below; candidate for deletion.
    params = {
        'n_estimators': 12000,
        'learning_rate': 0.3,
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'random_seed': 0,
        'metric_period': 50,
        'od_wait': 400,
        'task_type': 'GPU',
        'max_depth': 8,
        "verbose": 100
    }
    for model_path in glob.glob(f"{model_dir}/*catboost"):
        models_cat.append(CatBoostClassifier().load_model(model_path,
                                                          format="cbm"))
    print(models_cat[0].get_best_iteration())
    # load feature_factory_manager and re-attach loggers (stripped at pickle time)
    logger = get_logger()
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = logger
    feature_factory_manager.logger = logger
    iter_test = env.iter_test()
    df_test_prev = []          # buffered past batches awaiting a fit
    df_test_prev_rows = 0      # rows buffered so far
    answered_correctlies = []  # their labels, parsed from the next batch
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        # logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Update previous data: labels for prior rows arrive as stringified
        # lists in the current batch's prior_group_* columns.
        if df_test_prev_rows > 0:  # if-guard so the first iteration is skipped
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])
        # Re-fit only after enough labeled rows have accumulated.
        if df_test_prev_rows > update_record:
            # logger.info("------ fitting ------")
            # logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            # Lecture rows (-1 labels) are dropped, so the replace that
            # follows is a no-op kept from an earlier revision.
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")
            # logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)
            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & compute the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        # logger.info(f"merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index after sort_values restores the API's original row order.
        df_test = pd.concat([w_df1,
                             w_df2]).sort_values(["user_id",
                                                  "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        # logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df = feature_factory_manager.partial_predict(df_test)
        # Sanitize column names: LightGBM rejects JSON special characters.
        df.columns = [
            x.replace("[", "_").replace("]", "_").replace("'", "_").replace(
                " ", "_").replace(",", "_") for x in df.columns
        ]
        # predict
        # logger.info(f"predict lgbm...")
        predicts_lgbm = []
        # Both model families were trained on the CatBoost feature list.
        cols = models_cat[0].feature_names_
        w_df = df[cols]
        for model in models_lgbm:
            predicts_lgbm.append(model.predict(w_df))
        pred_lgbm = np.array(predicts_lgbm).mean(axis=0)
        # logger.info(f"predict cat...")
        predicts_cat = []
        for model in models_cat:
            predicts_cat.append(
                model.predict_proba(w_df.values)[:, 1].flatten())
        pred_cat = np.array(predicts_cat).mean(axis=0)
        # logger.info("other...")
        # Equal-weight blend of the two ensembles.
        df["answered_correctly"] = pred_lgbm * 0.5 + pred_cat * 0.5
        df_sample_prediction = df[df["content_type_id"] == 0][[
            "row_id", "answered_correctly"
        ]]
        env.predict(df_sample_prediction)
        df_test_prev.append(df[cols + ["user_id", "tags"]])
        df_test_prev_rows += len(df)
        if i < 5:
            df.to_csv(f"{i}.csv")  # debug dump of the first batches
def inference(TARGET, FEATURES, sakt_model, lgb_model,
              prior_question_elapsed_time_mean, features_dicts):
    """Serve Riiid predictions as a 0.6/0.4 blend of an LGBM model and a
    SAKT attention model, maintaining per-user/question running statistics.

    NOTE(review): depends on module-level globals not visible here:
    ``question_file``, ``group``, ``MAX_SEQ``,
    ``prior_question_elapsed_time_mean_sakt``, ``question_cols``,
    ``skills``, ``model``, ``device``.  ``sakt_model`` is accepted but the
    loop calls the global ``model`` — presumably they are the same object;
    confirm at the call site.
    """
    # Get feature dict: unpack running-statistic dictionaries once up front.
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    answered_correctly_uq = features_dicts["answered_correctly_uq"]
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_up_count = features_dicts['answered_correctly_up_count']
    answered_correctly_up_sum = features_dicts['answered_correctly_up_sum']
    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    questions_df = pd.read_pickle(question_file)
    questions_df.part = questions_df.part.astype(np.int8)
    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # Labels for the previous batch arrive as a stringified list.
            # NOTE(review): eval() on API-provided text — trusted here only
            # because the competition API controls the payload.
            previous_test_df[TARGET] = eval(
                test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, answered_correctly_u_sum,
                            answered_correctly_u_count,
                            answered_correctly_q_sum,
                            answered_correctly_q_count, timestamp_u_incorrect,
                            answered_correctly_uq, answered_correctly_up_count,
                            answered_correctly_up_sum)
            # Keep question rows only (content_type_id == 0).
            previous_test_df = previous_test_df[
                previous_test_df.content_type_id == False]
            # Per-user history tuples for the SAKT sequence model.
            prev_group = previous_test_df[[
                'user_id', 'content_id', 'answered_correctly', 'part',
                'prior_question_elapsed_time_sakt'
            ]].groupby('user_id').apply(lambda r: (r['content_id'].values, r[
                'answered_correctly'].values, r['part'].values, r[
                    'prior_question_elapsed_time_sakt'].values))
            # Append new history to each user's sequence, truncated to the
            # last MAX_SEQ interactions.
            for prev_user_id in prev_group.index:
                if prev_user_id in group.index:
                    group[prev_user_id] = (
                        np.append(group[prev_user_id][0],
                                  prev_group[prev_user_id][0])[-MAX_SEQ:],
                        np.append(group[prev_user_id][1],
                                  prev_group[prev_user_id][1])[-MAX_SEQ:],
                        np.append(group[prev_user_id][2],
                                  prev_group[prev_user_id][2])[-MAX_SEQ:],
                        np.append(group[prev_user_id][3],
                                  prev_group[prev_user_id][3])[-MAX_SEQ:])
                else:
                    group[prev_user_id] = (prev_group[prev_user_id][0],
                                           prev_group[prev_user_id][1],
                                           prev_group[prev_user_id][2],
                                           prev_group[prev_user_id][3])
        test_df[
            'prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(
                False).astype('int8')
        # SAKT variant of elapsed time: hours, clipped to [0, 16], int16.
        test_df['prior_question_elapsed_time_sakt'] = test_df[
            'prior_question_elapsed_time'] / 3600
        test_df.prior_question_elapsed_time_sakt.fillna(
            prior_question_elapsed_time_mean_sakt, inplace=True)
        test_df.prior_question_elapsed_time_sakt.clip(lower=0,
                                                      upper=16,
                                                      inplace=True)
        test_df['prior_question_elapsed_time_sakt'] = test_df[
            'prior_question_elapsed_time_sakt'].astype(np.int16)
        test_df['prior_question_elapsed_time'].fillna(
            prior_question_elapsed_time_mean, inplace=True)
        test_df = pd.merge(test_df,
                           questions_df[question_cols],
                           left_on='content_id',
                           right_on='question_id',
                           how='left')
        # Keep the merged frame (with lectures) for next iteration's update.
        previous_test_df = test_df.copy()
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(
            drop=True)
        # SAKT predictions for the question rows.
        test_dataset = TestDataset(group, test_df, skills)
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=51200,
                                     shuffle=False)
        outs = []
        for item in test_dataloader:
            x = item[0].to(device).long()
            p = item[1].to(device).long()
            e_time = item[2].to(device).long()
            target_id = item[3].to(device).long()
            with torch.no_grad():
                output, att_weight = model(x, p, e_time, target_id)
            # Last sequence position holds the prediction for the new row.
            outs.extend(
                torch.sigmoid(output)[:, -1].view(-1).data.cpu().numpy())
        # Tabular features for LGBM (read-only: update=False).
        test_df[TARGET] = 0
        test_df = add_features(test_df, answered_correctly_u_count,
                               answered_correctly_u_sum, elapsed_time_u_sum,
                               explanation_u_sum, timestamp_u,
                               timestamp_u_incorrect,
                               answered_correctly_q_count,
                               answered_correctly_q_sum, elapsed_time_q_sum,
                               explanation_q_sum, answered_correctly_uq,
                               answered_correctly_up_count,
                               answered_correctly_up_sum, update=False)
        # Blend: 60% LGBM, 40% SAKT.
        test_df[TARGET] = lgb_model.predict(
            test_df[FEATURES]) * 0.6 + np.array(outs) * 0.4
        set_predict(test_df[['row_id', TARGET]])
    print('Job Done')
def run(debug, model_dir, kaggle=False):
    """Fit a preprocessing Pipeline on the training splits, then serve
    predictions through the Riiid API with an ensemble of pickled LGBM models.

    NOTE(review): log f-string below is missing its closing ']' (same defect
    as the sibling run() variant).
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"
    logger = get_logger()
    # environment
    env = riiideducation.make_env()
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    # data preprocessing: fit the pipeline over every training split.
    pipeline = Pipeline(logger=logger)
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        if debug:
            df = df.head(1000)
        pipeline.fit(df)
    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update previous data: labels for the prior batch arrive in the
        # current batch's prior_group_* columns as stringified lists.
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ]
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            pipeline.fit(df_test_prev)
        # Fetch & compute the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"transform... ")
        df = pipeline.partial_transform(df_test)
        logger.info(f"other... ")
        cols = models[0].feature_name()
        # Backfill any feature the transform did not produce with a sentinel.
        for col in cols:
            if col not in df.columns:
                df[col] = -99999
        # predict: simple mean over the ensemble.
        predicts = []
        cols = models[0].feature_name()
        for model in models:
            predicts.append(model.predict(df[cols]))
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # NOTE(review): unlike the sibling variant, user_id is NOT retained
        # here — verify pipeline.fit() does not need it on the next pass.
        df_test_prev = df[cols]