예제 #1
0
def data_split(data, threads, split_flag):
    """Shuffle *data* and split it into up to *threads* contiguous chunks.

    Each returned chunk is a DataFrame tagged with a column named
    *split_flag* holding the chunk's index.

    :param data: input rows (anything ``pd.DataFrame`` accepts)
    :param threads: desired number of chunks
    :param split_flag: name of the column that tags each chunk
    :return: list of non-empty DataFrame chunks
    """
    data = shuffle(pd.DataFrame(data))
    total = len(data)
    # BUG FIX: the original step was round(total / threads, 2), so the last
    # cut point int(threads * sep) could fall short of `total` (e.g.
    # total=10, threads=3 -> last point 9) and silently drop trailing rows.
    # Integer cut points with the final point pinned to `total` guarantee
    # every row lands in exactly one chunk.
    split_point = [int(i * total / threads) for i in range(threads)]
    split_point.append(total)
    logger.info("CutPoint: " + str(split_point))
    # Keep only non-empty slices (empty ones occur when threads > total).
    res = []
    for start, end in zip(split_point, split_point[1:]):
        content = data[start:end]
        if len(content) != 0:
            res.append(content)
    # Tag each surviving chunk with its (consecutive) index.
    res_final = []
    for ss, chunk in enumerate(res):
        chunk = pd.DataFrame(chunk)
        chunk[split_flag] = ss
        res_final.append(chunk)
    return res_final
예제 #2
0
def main():
    """Split the cleaned data by booking date, pre-filter variables with
    MyExecutor, then fit an auto-sklearn classifier and emit a report."""
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    df['book_date'] = pd.to_datetime(df['book_date'])
    # NOTE(review): 2017-07-20 satisfies both masks, so rows booked on that
    # exact date appear in BOTH train and test — confirm this is intended.
    trainSet = df[(df['book_date'] >= '2017-04-01')
                  & (df['book_date'] <= '2017-07-20')].reset_index(drop=True)
    testSet = df[(df['book_date'] >= '2017-07-20')
                 & (df['book_date'] <= '2017-08-31')].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = RandomForestClassifier(
        n_estimators=10,
        max_features=10,
        max_depth=4,
        min_samples_split=0.05,
    )
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = myexe.get_result()
    # Keep variables selected more than 7 times across runs.
    leftVaris = leftVaris[leftVaris.values > 7].keys()
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    # BUG FIX: y_test previously copied the feature columns
    # (testSet[leftVaris]) instead of the target column, matching no other
    # example in this file and breaking any downstream scoring.
    y_test = testSet['fpd'].copy()
    # AutoSklearn stage:
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=62,
        per_run_time_limit=60,
        include_estimators=['adaboost'],
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    getReport(cls, trainSet, X_train, y_train, testSet, X_test, y_test)
예제 #3
0
def drop_by_iv(df, y, p=10):
    """Select variables by information value (IV).

    Fans ``calc_iv`` out over a process pool, then keeps every column whose
    IV reaches 0.02.

    :param df: input DataFrame
    :param y: target column name passed through to ``calc_iv``
    :param p: worker-process count (keep below cpu cores / threads)
    :return: list of selected column names, always including "fpd"
    """
    logger.info("Start filter variables by IV, Current thread: {}".format(p))
    pool = Pool(processes=p)
    pending = [
        pool.apply_async(calc_iv, args=(df, y, col))
        for col in list(df.columns)
    ]
    pool.close()
    pool.join()
    iv_table = pd.DataFrame([job.get() for job in pending])
    selected_vars = list(iv_table[iv_table.IV >= 0.02]["columns"])
    # The target itself must always survive the filter.
    selected_vars.append("fpd")
    return selected_vars
예제 #4
0
 def build_model(self, mymodel, train_x, train_y, test_x, test_y):
     """Fit *mymodel* on the training split and score it on the test split.

     :param mymodel: sklearn-style estimator (fit / predict_proba /
         feature_importances_)
     :param train_x: training features
     :param train_y: training labels
     :param test_x: test features
     :param test_y: test labels
     :return: (test AUC, DataFrame of the features with importance > 0)
     """
     estimator = mymodel
     estimator.fit(train_x, train_y)
     # Column "fpd1" is the positive-class probability.
     proba = pd.DataFrame(estimator.predict_proba(test_x),
                          columns=["fpd0", "fpd1"])
     test_auc = metrics.roc_auc_score(test_y, proba["fpd1"])
     left_variables = train_x.columns[np.where(
         estimator.feature_importances_ > 0)].tolist()
     logger.info(left_variables)
     logger.info(len(left_variables))
     importance_table = pd.DataFrame(
         list(zip(train_x.columns, estimator.feature_importances_)),
         columns=["variable", "importance"])
     used_feature_importance = importance_table[
         importance_table.importance > 0]
     return test_auc, used_feature_importance
예제 #5
0
def main():
    """Train on May–July 2017, test on August 2017: pre-filter variables
    with MyExecutor, then fit an auto-sklearn classifier and report."""
    df = pd.read_csv("data/model_mix/clean_data.csv", encoding="utf8")
    train_months = ['2017-05', '2017-06', '2017-07']
    trainSet = df[df["book_mon"].isin(train_months)].reset_index(drop=True)
    testSet = df[df["book_mon"] == "2017-08"].reset_index(drop=True)

    logger.info("============================Data is ready!============================")
    clf = XGBClassifier(
        n_estimators=10,
        max_features=10,
        max_depth=4,
        min_samples_split=0.05,
    )
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = myexe.get_result()
    # Keep variables selected more than 7 times across runs.
    leftVaris = leftVaris[leftVaris.values > 7].keys()
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # AutoSklearn stage:
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=62,
        per_run_time_limit=60,
        include_estimators=['adaboost'],
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67}
    )
    getReport(cls, trainSet, X_train, y_train, testSet, X_test, y_test)
예제 #6
0
 def feature_select(self, sub_train_set, y):
     """Filter the candidate variable set: drop known-useless columns, then
     keep only those passing the IV threshold.

     :param sub_train_set: DataFrame of candidate variables
     :param y: target column name (unused here; drop_by_iv is given "fpd")
     :return: list of selected column names
     """
     # BUG FIX (log text only): "filer" -> "filter".
     logger.info("Start filter Variables total: {}".format(len(sub_train_set.columns)))
     tmp1 = drop_useless(sub_train_set, 'pre_apply_no', 'book_date', 'book_mon')
     sub_train_set = sub_train_set[tmp1]
     tmp2 = drop_by_iv(sub_train_set, "fpd", 2)
     logger.info("Stop filter Variables total: {}".format(len(tmp2)))
     return tmp2
예제 #7
0
def main():
    """Load the cleaned data and run the full training loop with a shallow,
    entropy-based decision tree."""
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    logger.info("============================Data is ready!============================")
    clf = DecisionTreeClassifier(max_features=1,
                                 min_weight_fraction_leaf=0.05,
                                 min_samples_split=0.05,
                                 criterion="entropy",
                                 max_leaf_nodes=5)
    # NOTE(review): `myExecutor` (lower-case m) differs from the
    # `MyExecutor` class used by the other examples — confirm the name.
    executor = myExecutor(df, "fpd", clf)
    executor.train_all()
예제 #8
0
def main():
    """Date-based train/test split, a fixed hand-picked variable list, then
    a TPOT pipeline search with a final report."""
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    df['book_date'] = pd.to_datetime(df['book_date'])
    train_mask = ((df['book_date'] >= '2017-04-01')
                  & (df['book_date'] <= '2017-07-20'))
    test_mask = ((df['book_date'] >= '2017-07-20')
                 & (df['book_date'] <= '2017-08-31'))
    trainSet = df[train_mask].reset_index(drop=True)
    testSet = df[test_mask].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = XGBClassifier(learning_rate=0.01,
                        max_depth=7,
                        min_child_weight=15,
                        n_estimators=100,
                        nthread=1,
                        subsample=0.6500000000000001)
    myexe = MyExecutor(df, "fpd", clf)
    # Pre-computed variable selection (originally produced by
    # myexe.get_result(); kept as a literal list here).
    leftVaris = [
        'hl_call_domesitc_cnt_2m', 'hl_contact_early_morning_cnt_5m',
        'hl_phone_silent_frequentcy', 'hl_contact_night_pct',
        'hl_transactions_total_amt_5m', 'hl_region_call_cnt_max_uniq_num_cnt',
        'hl_region_call_out_cnt_max_avg_call_in_time',
        'hl_contact_morning_cnt_5m',
        'hl_region_call_in_time_max_avg_call_in_time',
        'hl_transactions_total_amt_2m', 'hl_contact_night_cnt_5m',
        'hl_phone_num_used_time_months',
        'hl_region_call_cnt_max_avg_callin_time',
        'hl_region_call_in_time_max_uniq_num_cnt',
        'hl_region_call_in_cnt_max_avg_call_out_time',
        'hl_transactions_min_5m',
        'hl_region_call_out_time_max_avg_call_out_time'
    ]

    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # TPOT stage:
    pipeline_optimizer = TPOTClassifier(generations=5,
                                        population_size=20,
                                        cv=4,
                                        random_state=42,
                                        verbosity=2)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test,
              y_test)
예제 #9
0
def judge_function_v1(result):
    """Aggregate per-model variable importances, weighting by test AUC.

    1. Test AUC / KS performance
    2. A variable selected repeatedly across B1~B10
    3. Business-logic review of variables

    :param result: return value of train_all — iterable of
        (auc, importance DataFrame) pairs
    :return: single concatenated DataFrame with an extra
        "importance_plus" column (importance * that model's AUC)
    """
    # TODO: judge by mean/variance of the 10 test AUCs per variable set
    weighted_frames = []
    for model_res in result:
        auc, var_importance = model_res[0], model_res[1]
        logger.info(auc)
        # NOTE: mutates the input frame in place (same as before).
        var_importance["importance_plus"] = var_importance["importance"] * auc
        weighted_frames.append(var_importance)
    return pd.concat(weighted_frames)
예제 #10
0
def main():
    """Month-based train/test split, MyExecutor variable filtering on the
    TRAIN set only, then a TPOT pipeline search with a final report."""
    df = pd.read_csv("data/model_mix/clean_data.csv", encoding="utf8")
    train_months = ['2017-05', '2017-06', '2017-07']
    trainSet = df[df["book_mon"].isin(train_months)].reset_index(drop=True)
    testSet = df[df["book_mon"] == "2017-08"].reset_index(drop=True)

    logger.info(
        "============================Data is ready!============================"
    )
    clf = RandomForestClassifier(
        n_estimators=10,
        max_features=10,
        max_depth=4,
        min_samples_split=0.05,
    )
    myexe = MyExecutor(trainSet, "fpd", clf)
    leftVaris = myexe.get_result(20)
    banner = "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    print(banner)
    print(leftVaris)
    print(banner)

    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # TPOT stage:
    pipeline_optimizer = TPOTClassifier(generations=5,
                                        population_size=20,
                                        cv=4,
                                        random_state=42,
                                        n_jobs=1,
                                        verbosity=2)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test,
              y_test)
예제 #11
0
def main(kwargs):
    """Run one parameterized TPOT experiment and persist its artifacts.

    :param kwargs: dict with keys "feature_model", "feature_num",
        "generations", "population_size", "scoring", "cv", "subsample",
        "n_jobs", "max_eval_time_mins" and "uid" (per-run output dir name).
    """
    import shutil  # local import: only needed for result-dir cleanup

    df = pd.read_csv("data/model_mix/clean_data.csv", encoding="utf8")
    trainSet = df[df["book_mon"].isin(['2017-05', '2017-06',
                                       '2017-07'])].reset_index(drop=True)
    testSet = df[df["book_mon"] == "2017-08"].reset_index(drop=True)

    logger.info(
        "============================Data is ready!============================"
    )
    clf = kwargs["feature_model"]
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = myexe.get_result(kwargs["feature_num"])
    print(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    print(leftVaris)
    print(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # TPOT stage:
    pipeline_optimizer = TPOTClassifier(
        generations=int(kwargs["generations"]),
        population_size=int(kwargs["population_size"]),
        scoring=kwargs["scoring"],
        cv=int(kwargs["cv"]),
        subsample=float(kwargs["subsample"]),
        n_jobs=int(kwargs["n_jobs"]),
        max_eval_time_mins=int(kwargs["max_eval_time_mins"]),
        random_state=random.randint(1, 100))
    pipeline_optimizer.fit(X_train, y_train)
    trainKS, testKS, abs_trainKS_testKS, trainAUC, testAUC, abs_trainAUC_testAUC = \
        getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test, y_test)

    # Persist the run artifacts under tpot_result/<uid>/.
    result_dir = "tpot_result/{}".format(kwargs["uid"])
    # BUG FIX: the original test was `kwargs["uid"] is os.listdir(...)` — an
    # identity comparison against a freshly-built list, always False — and
    # it called os.removedirs(), which only removes EMPTY directories. As a
    # result a pre-existing result dir made os.mkdir() raise. Detect the
    # directory properly and remove it recursively before recreating it.
    if os.path.isdir(result_dir):
        shutil.rmtree(result_dir)
    os.mkdir(result_dir)
    pipeline_optimizer.export(
        'tpot_result/{}/tpot_exported_pipeline.py'.format(kwargs["uid"]))
    with open('tpot_result/{}/vars'.format(kwargs["uid"]), "w+") as f1:
        f1.write(str(leftVaris))
    report = pd.DataFrame([
        {
            "trainKS": trainKS,
            "testKS": testKS,
            "abs_trainKS_testKS": abs_trainKS_testKS,
            "trainAUC": trainAUC,
            "testAUC": testAUC,
            "abs_trainAUC_testAUC": abs_trainAUC_testAUC
        },
    ])
    report.to_csv('tpot_result/{}/report.csv'.format(kwargs['uid']),
                  index=False,
                  encoding="utf8")
예제 #12
0
 def feature_select(self, sub_train_set, y):
     """Filter the candidate variable set by dropping known-useless columns.

     :param sub_train_set: DataFrame of candidate variables
     :param y: target column name (unused in this variant)
     :return: list of column names kept by drop_useless
     """
     # BUG FIX (log text only): "filer" -> "filter".
     logger.info("Start filter Variables total: {}".format(
         len(sub_train_set.columns)))
     tmp1 = drop_useless(sub_train_set, 'pre_apply_no', 'book_date',
                         'book_mon')
     return tmp1