"p2w_trans", "CAFE20_RECENCY_SRKIT", "max_MC_rev", "CAFE20_P1Y_VISITS_DAY", "max_MC_Quantity", labels, ] catfeatures = [ "CAFE20_gender", "CAFE20_region", "CAFE20_levels", "is_festival_user", "level_use", "is_LAST_2YEAR_DD_ACTIVE", "cafe_tag_is_mop_available", "is_merch_user", "p4week_active", "is_LAST_1YEAR_DD_ACTIVE", "msr_lifestatus", "IS_SR_KIT_USER", "member_monetary" ] # 数据预处理 df_train, df_btest = data_clean2(df) df_train = df_train[select_columns] df_btest = df_btest[select_columns] for cats in catfeatures: df_train[cats] = df_train[cats].astype(int) df_btest[cats] = df_btest[cats].astype(int) # # 抽样 # df_train = df_train.sample(n=None, frac=0.1, replace=False, weights=None, # random_state=0, axis=0) # df_btest = df_btest.sample(n=None, frac=0.1, replace=False, weights=None, # random_state=0, axis=0) print( '正/负',
'DD_end_gap', 'MC_end_gap', 'p2w_amt', 'cafe_tag_p6m_merch_qty', 'MCoffer_red', 'p2w_trans', 'CAFE20_RECENCY_SRKIT', labels, ] catfeatures = [ 'is_festival_user', 'is_LAST_2YEAR_DD_ACTIVE', 'cafe_tag_is_mop_available', 'IS_SR_KIT_USER' ] # 数据预处理 df_train = data_clean2(df) df_btest = data_clean2(df_btest) df_train = df_train[select_columns] df_btest = df_btest[select_columns] for cats in catfeatures: df_train[cats] = df_train[cats].astype(int) df_btest[cats] = df_btest[cats].astype(int) # # 抽样 # df_train = df_train.sample(n=None, frac=0.1, replace=False, weights=None, # random_state=0, axis=0) # df_btest = df_btest.sample(n=None, frac=0.1, replace=False, weights=None, # random_state=0, axis=0) print(
"class_rank_fillna", "student_province_byphone", "subject_ids", "student_grade_lpo", "school_background", "is_first_trail", "class_background_label", 'student_grade', # "exam_year", "coil_in" ] # 数据预处理 df_train, df_btest = data_clean2(df, min_date="2018-08-21", mid_date="2018-12-03", max_date="2018-12-18", label=labels) for f in cat_features: df_train[f] = df_train[f].astype(int) df_btest[f] = df_btest[f].astype(int) df_train = df_train[select_columns] df_btest = df_btest[select_columns] # 抽样 df_train = df_train.sample(n=None, frac=0.1, replace=False, weights=None, random_state=0, axis=0)
from data_treatment import data_clean2 from data_treatment import load_data_new import joblib import pandas as pd import matplotlib.pyplot as plt if __name__ == '__main__': sql = " " df = load_data_new(sql, filename="btest.csv") df = data_clean2(df) labels = "target_is_DD_ACTIVE" select_columns = [ 'is_festival_user', 'is_LAST_2YEAR_DD_ACTIVE', 'cafe_tag_is_mop_available', 'IS_SR_KIT_USER', 'level_use', 'skr_rate', 'merch_rate', 'active_index', 'cafe_tag_p6m_food_qty', 'DD_rev', 'svc_revenue', 'SR_KIT_NUM', 'cafe_tag_p3m_merch_party_size', 'CAFE20_VISIT_MERCH', 'CAFE20_AMT', 'cafe_tag_p3m_food_qty', 'p3m_weekday_trans', 'max_DD_rev', 'DD_end_gap',
# Load the modelling dataset, restrict it to students that have exactly one
# row in `trail_pigeon`, clean it, drop the columns listed in `drop_features`,
# and initialise the state used by the one-way ANOVA step.
# `sql`, `load_data_new`, `data_clean2` and `pd` are expected to be in scope
# from earlier in the script.
df = load_data_new(sql, filename="df_20190215.csv")

# Per-student row counts in trail_pigeon.
student_sql = """SELECT student_id,count(student_id) from trail_pigeon GROUP BY student_id"""
student_mul = load_data_new(student_sql, filename="student_mul.csv")

# Students that appear exactly once.
student_ids = student_mul[student_mul["count(student_id)"] == 1]

# Filter on `student_ids`, not `student_mul`: the previous code computed
# `student_ids` but then filtered on the unfiltered `student_mul`, so the
# count == 1 restriction was never applied.
# NOTE(review): presumably the single-record restriction is the intent, since
# `student_ids` had no other use here -- confirm with the data owner.
df = df[df["student_id"].isin(student_ids["student_id"])]

label_by_contract = "is_pigeon"
labels = label_by_contract

# Data preprocessing
df = data_clean2(
    df,
    min_date="2018-05-01",
    mid_date="2018-09-15",
    max_date="2018-09-30",
    label=labels,
)
print("data_count", len(df))

# Columns removed before the analysis below.
drop_features = [
    "order_id",
    "lesson_plan_id",
    "student_id",
    "student_no",
    "recent_scores",
    "order_apply_time",
    "adjust_start_time",
]
df = df.drop(drop_features, axis=1)

# One-way ANOVA
# Result frame, counter and column list; presumably consumed by the code that
# follows this fragment (not visible here).
df_anova_single = pd.DataFrame()
i = 0
columns = list(df.columns)