예제 #1
0
def _model_predict(all_feature,
                   predict_feature,
                   predict_col,
                   num_boost_round=1000):
    """Fit a LightGBM model and predict ``predict_col`` for ``predict_feature``.

    Rows of ``all_feature`` whose ``"<k>_test"`` flag equals 0 are used for
    training and rows whose flag equals 1 for validation / early stopping,
    where ``k`` is the position in ``tool.types`` of the group that contains
    ``predict_col``.  For targets listed in ``enum_col`` or ``ext_enum_col``
    the values are re-encoded as consecutive class ids, values that occur
    fewer than ``number_limit`` times are left out of training, and the
    predicted class ids are mapped back to the original values.

    Args:
        all_feature: DataFrame holding the target column, the feature columns
            and the ``"<k>_test"`` flag columns.
        predict_feature: DataFrame of the rows to predict; it must contain
            every column that is dropped from ``all_feature``.
        predict_col: Name of the target column.
        num_boost_round: Maximum number of boosting rounds.

    Returns:
        ``(predict_target, test_s)``: the predictions for ``predict_feature``
        as a numpy array and the validation score.  When only one target
        value survives the rare-value filter, that value is returned for
        every row together with a score of ``1``.

    Raises:
        ValueError: If ``predict_col`` is not part of any ``tool.types`` group.
    """
    # Columns that must not be used as features: every column of the group
    # that contains the target, plus the four test-flag columns.
    del_cols = None
    test_label_col = None
    for index, group in enumerate(tool.types):
        if predict_col in group:
            del_cols = list(group)
            test_label_col = str(index) + "_test"
            break
    if del_cols is None:
        raise ValueError(
            "predict_col {!r} is not in any group of tool.types".format(
                predict_col))
    del_cols.extend(["0_test", "1_test", "2_test", "3_test"])

    is_bool = predict_col in bool_col
    is_enum = predict_col in enum_col or predict_col in ext_enum_col

    k_v = {}
    if is_enum:
        # Leave target values with too few samples out of training.
        number_limit = large_limit_col.get(predict_col, 10)
        # Per-row number of occurrences of the row's target value.  Rows with
        # a NaN target get a NaN count and never pass the ">=" filter below.
        value_count = all_feature[predict_col].map(
            all_feature[predict_col].value_counts())
        is_test = all_feature[test_label_col] == 1
        del_test_size = int((is_test & (value_count < number_limit)).sum())
        print(predict_col, "del_test_size:", del_test_size)

        # The complete validation set, including rows with rare values; the
        # final score is computed on it.
        test_feature_org = all_feature[is_test]
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop(del_cols, axis=1))
        print("test_x_org", test_x_org.shape)

        # Explicit copy so the re-encoding below never writes into the
        # caller's frame.
        all_feature = all_feature[value_count >= number_limit].copy()

        # Convert target values to class ids 0..n-1.
        all_y = sorted(set(all_feature[predict_col]))

        if len(all_y) == 1:
            # Only one value is left: return it directly as the prediction.
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1

        v_k = {v: k for k, v in enumerate(all_y)}
        k_v = dict(enumerate(all_y))
        all_feature[predict_col] = np.array(
            [v_k[v] for v in all_feature[predict_col]])

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop(del_cols, axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop(del_cols, axis=1))
    predict_x = np.array(predict_feature.drop(del_cols, axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape, "predict_x",
          predict_x.shape)

    lgb_params = {
        'boosting_type': 'gbdt',
        'learning_rate': 0.02,
        'num_leaves': 256,
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        'min_data_in_leaf': 40,
        'num_threads': num_threads,
        'verbosity': 0
    }
    if is_bool:
        lgb_params["objective"] = "binary"
        lgb_params["metric"] = "binary_error"
        lgb_params["is_unbalance"] = True
        eval_metric = None
    elif is_enum:
        lgb_params["objective"] = "multiclass"
        lgb_params["metric"] = "multi_error"
        # Class ids are 0..len(all_y)-1, so this equals max(label) + 1.
        lgb_params["num_class"] = len(all_y)
        eval_metric = None
    else:
        lgb_params["objective"] = lgb_obj
        eval_metric = tool.lgb_metric

    train_set = lgb.Dataset(train_x, label=train_y)
    valid_set = lgb.Dataset(test_x, label=test_y)
    temp_model = lgb.train(lgb_params,
                           train_set,
                           num_boost_round=num_boost_round,
                           valid_sets=[valid_set],
                           feval=eval_metric,
                           early_stopping_rounds=50,
                           verbose_eval=False)
    test_pred = temp_model.predict(test_x)

    # Convert probabilities to labels.
    if is_bool:
        test_pred = np.where(test_pred > 0.5, 1, 0)
    elif is_enum:
        # Score on the original, complete validation set.
        if del_test_size > 0:
            test_pred = temp_model.predict(test_x_org)
        test_y = test_y_org
        # Most probable class per row, mapped back to the original value.
        test_pred = np.array(
            [k_v[i] for i in np.argmax(test_pred, axis=1)])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # Rounding to two, one or zero decimals may score better.  ``None`` means
    # "do not round"; 0 is a valid choice and must not be treated as falsy.
    round_decimals = None
    for decimals in (2, 1, 0):
        rounded_score = tool.regression_score(test_y,
                                              np.round(test_pred, decimals))
        if test_s < rounded_score - threshold:
            round_decimals = decimals
            test_s = rounded_score

    print("best iteration: ", temp_model.best_iteration)
    print("test score: ", test_s)

    predict_target = np.array(temp_model.predict(predict_x))
    if is_enum:
        predict_target = np.array(
            [k_v[i] for i in np.argmax(predict_target, axis=1)])
    elif is_bool:
        predict_target = np.where(predict_target > 0.5, 1, 0)

    if round_decimals is not None:
        predict_target = np.round(predict_target, round_decimals)

    return predict_target, test_s
def _model_predict(all_feature,
                   predict_feature,
                   predict_col,
                   num_boost_round=1000):
    """Fit an XGBoost model and predict ``predict_col`` for ``predict_feature``.

    Rows of ``all_feature`` whose ``"<k>_test"`` flag equals 0 are used for
    training and rows whose flag equals 1 for validation / early stopping,
    where ``k`` is the position in ``tool.types`` of the group that contains
    ``predict_col``.  For targets listed in ``enum_col`` or ``ext_enum_col``
    the values are re-encoded as consecutive class ids, values that occur
    fewer than ``number_limit`` times are left out of training, and the
    predicted class ids are mapped back to the original values.

    Args:
        all_feature: DataFrame holding the target column, the feature columns
            and the ``"<k>_test"`` flag columns.
        predict_feature: DataFrame of the rows to predict; it must contain
            every column that is dropped from ``all_feature``.
        predict_col: Name of the target column.
        num_boost_round: Maximum number of boosting rounds.

    Returns:
        ``(predict_target, test_s)``: the predictions for ``predict_feature``
        as a numpy array and the validation score.  When only one target
        value survives the rare-value filter, that value is returned for
        every row together with a score of ``1``.

    Raises:
        ValueError: If ``predict_col`` is not part of any ``tool.types`` group.
    """
    # Columns that must not be used as features: every column of the group
    # that contains the target, plus the four test-flag columns.
    del_cols = None
    test_label_col = None
    for index, group in enumerate(tool.types):
        if predict_col in group:
            del_cols = list(group)
            test_label_col = str(index) + "_test"
            break
    if del_cols is None:
        raise ValueError(
            "predict_col {!r} is not in any group of tool.types".format(
                predict_col))
    del_cols.extend(["0_test", "1_test", "2_test", "3_test"])

    is_bool = predict_col in bool_col
    is_enum = predict_col in enum_col or predict_col in ext_enum_col

    k_v = {}
    if is_enum:
        # Leave target values with too few samples out of training.
        number_limit = large_limit_col.get(predict_col, 10)
        # Per-row number of occurrences of the row's target value.  Rows with
        # a NaN target get a NaN count and never pass the ">=" filter below.
        value_count = all_feature[predict_col].map(
            all_feature[predict_col].value_counts())
        is_test = all_feature[test_label_col] == 1
        del_test_size = int((is_test & (value_count < number_limit)).sum())
        print(predict_col, "del_test_size:", del_test_size)

        # The complete validation set, including rows with rare values; the
        # final score is computed on it.
        test_feature_org = all_feature[is_test]
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop(del_cols, axis=1))
        print("test_x_org", test_x_org.shape)

        # Explicit copy so the re-encoding below never writes into the
        # caller's frame.
        all_feature = all_feature[value_count >= number_limit].copy()

        # Convert target values to class ids 0..n-1.
        all_y = sorted(set(all_feature[predict_col]))

        if len(all_y) == 1:
            # Only one value is left: return it directly as the prediction.
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1

        v_k = {v: k for k, v in enumerate(all_y)}
        k_v = dict(enumerate(all_y))
        all_feature[predict_col] = np.array(
            [v_k[v] for v in all_feature[predict_col]])

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop(del_cols, axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop(del_cols, axis=1))
    predict_x = np.array(predict_feature.drop(del_cols, axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape, "predict_x",
          predict_x.shape)

    params = {
        'booster': 'gbtree',
        'eta': 0.02,
        'max_depth': 8,  # 5 4 3
        'colsample_bytree': 0.9,  # 0.8 0.7
        'subsample': 0.8,
        'min_child_weight': 40,  # 2 3
        'silent': 1,
        'nthread': 4,
        'tree_method': 'gpu_hist',
        "gpu_id": 0,
        "seed": 0
    }
    if is_bool:
        params["objective"] = "binary:logistic"
        params["eval_metric"] = "error"
        # NOTE(review): "is_unbalance" is a LightGBM parameter; XGBoost's
        # counterpart appears to be "scale_pos_weight" - confirm whether
        # this key has any effect here.
        params["is_unbalance"] = True
        feval = None
    elif is_enum:
        params["objective"] = "multi:softmax"
        params["eval_metric"] = "merror"
        # Class ids are 0..len(all_y)-1, so this equals max(label) + 1.
        params["num_class"] = len(all_y)
        feval = None
    else:
        params["objective"] = "reg:linear"
        feval = tool.xgb_metric

    train_set = xgb.DMatrix(train_x, label=train_y)
    valid_set = xgb.DMatrix(test_x, label=test_y)
    temp_model = xgb.train(params,
                           train_set,
                           num_boost_round=num_boost_round,
                           evals=[(valid_set, "validate")],
                           feval=feval,
                           # The built-in "error" / "merror" metrics are error
                           # rates and must be minimised; only the custom
                           # metric is maximised, as before.
                           maximize=feval is not None,
                           early_stopping_rounds=200,
                           verbose_eval=False)
    test_pred = temp_model.predict(valid_set)

    # Convert the raw model output to labels.
    if is_bool:
        test_pred = np.where(test_pred > 0.5, 1, 0)
    elif is_enum:
        # Score on the original, complete validation set.
        if del_test_size > 0:
            valid_set = xgb.DMatrix(test_x_org)
            test_pred = temp_model.predict(valid_set)
        test_y = test_y_org
        # "multi:softmax" yields class ids as floats; map them back to the
        # original values.
        test_pred = np.array([k_v[int(i)] for i in test_pred])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # Rounding to two, one or zero decimals may score better.  ``None`` means
    # "do not round"; 0 is a valid choice and must not be treated as falsy.
    round_decimals = None
    for decimals in (2, 1, 0):
        rounded_score = tool.regression_score(test_y,
                                              np.round(test_pred, decimals))
        if test_s < rounded_score - threshold:
            round_decimals = decimals
            test_s = rounded_score

    print("best iteration: ", temp_model.best_iteration)
    print("test score: ", test_s)

    predict_set = xgb.DMatrix(predict_x)
    predict_target = np.array(temp_model.predict(predict_set))
    if is_enum:
        predict_target = np.array([k_v[int(i)] for i in predict_target])
    elif is_bool:
        predict_target = np.where(predict_target > 0.5, 1, 0)

    if round_decimals is not None:
        predict_target = np.round(predict_target, round_decimals)

    return predict_target, test_s
예제 #3
0
def interpolate_predict(method="index"):
    """Fill missing values by per-turbine interpolation and write the results.

    For every ``wtid`` in 1..33 and every column in ``var_col`` the rows
    flagged as test rows are blanked out, re-interpolated and scored against
    their true values; afterwards the really missing values of the column are
    interpolated.  Scores are written to ``./result/<method>_score.csv`` and
    the filled rows that had missing values (``count_miss > 0``) to
    ``./result/<method>_result.csv``.

    Args:
        method: Interpolation method passed to ``Series.interpolate``; it is
            also used in the output file names.
    """
    start = datetime.datetime.now()
    data = pd.read_hdf(data_path)

    # Collect the per-turbine frames and concatenate once at the end instead
    # of re-copying the growing result on every iteration.
    filled_frames = []
    score_df = pd.DataFrame()
    score_df["var"] = var_col
    for i in tqdm(range(1, 34)):
        # Explicit copy: the columns of ``sub`` are overwritten below and
        # must not alias ``data``.
        sub = data[data["wtid"] == i].copy()
        score_temp = []
        for var in var_col:
            sub1 = sub[pd.notna(sub[var])].reset_index(drop=True)
            # Position of the tool.types group that contains ``var``; it
            # selects the matching "<index>_test" flag column.
            index = 0
            for index, t in enumerate(tool.types):
                if var in t:
                    break
            col_name = str(index) + "_test"
            sub2 = sub1[[var]].copy()
            is_test = sub1[col_name] == 1
            # Hide the test values and let the interpolation rebuild them.
            sub1.loc[is_test, var] = np.nan
            sub1[var] = sub1[var].interpolate(method=method)

            true_value = sub2.loc[is_test, var]
            predict_value = sub1.loc[is_test, var]
            # Number of decimals to round to, or None for no rounding.
            round_decimals = None
            if var in category_col:
                predict_value = np.array(predict_value).astype(int)
                true_value = np.array(true_value).astype(int)
                score = tool.label_score(true_value, predict_value)
            else:
                score = tool.regression_score(true_value, predict_value)
                # Rounding to two or one decimals may score better.
                for decimals in (2, 1):
                    rounded_score = tool.regression_score(
                        true_value, np.round(predict_value, decimals))
                    if score < rounded_score - threshold:
                        score = rounded_score
                        round_decimals = decimals
            score_temp.append(score)

            # Actual prediction: interpolate the truly missing values.
            sub[var] = sub[var].interpolate(method=method)
            if round_decimals is not None:
                sub[var] = np.round(sub[var], round_decimals)

        filled_frames.append(sub)
        score_df[str(i)] = score_temp

    final_result = pd.concat(filled_frames, axis=0, ignore_index=True)

    score_df.set_index("var", inplace=True)
    score_df = score_df.T
    score_df.reset_index(inplace=True)
    score_df.rename(columns={"index": "wtid"}, inplace=True)
    score_df.to_csv("./result/{}_score.csv".format(method),
                    encoding="utf8",
                    index=False,
                    float_format='%.4f')

    final_result = final_result[final_result["count_miss"] > 0]
    final_result = final_result[head_col]
    final_result = final_result.sort_values(["wtid", "ts"])
    final_result = final_result.astype({var: int for var in category_col})
    final_result.to_csv("./result/{}_result.csv".format(method),
                        encoding="utf8",
                        index=False,
                        float_format='%.2f')
    end = datetime.datetime.now()
    print("finish", method, "interpolate_predict time: ", end - start)
예제 #4
0
def _model_predict(all_feature, predict_feature, predict_col):
    """Fit a random forest and predict ``predict_col`` for ``predict_feature``.

    Rows of ``all_feature`` whose ``"<k>_test"`` flag equals 0 are used for
    training and rows whose flag equals 1 for validation, where ``k`` is the
    position in ``tool.types`` of the group that contains ``predict_col``.
    Missing feature values are replaced by ``-1000`` on copies of the inputs,
    so the caller's frames are left unmodified.  For targets listed in
    ``enum_col`` or ``ext_enum_col`` the values are re-encoded as consecutive
    class ids, values that occur fewer than ``number_limit`` times are left
    out of training, and the predicted class ids are mapped back to the
    original values.

    Args:
        all_feature: DataFrame holding the target column, the feature columns
            and the ``"<k>_test"`` flag column.
        predict_feature: DataFrame of the rows to predict; it must contain
            every column that is dropped from ``all_feature``.
        predict_col: Name of the target column.

    Returns:
        ``(predict_target, test_s)``: the predictions for ``predict_feature``
        as a numpy array and the validation score.  When only one target
        value survives the rare-value filter, that value is returned for
        every row together with a score of ``1``.

    Raises:
        ValueError: If ``predict_col`` is not part of any ``tool.types`` group.
    """
    # Columns that must not be used as features: every column of the group
    # that contains the target, plus that group's test-flag column.
    del_cols = None
    test_label_col = None
    for index, group in enumerate(tool.types):
        if predict_col in group:
            del_cols = list(group)
            test_label_col = str(index) + "_test"
            break
    if del_cols is None:
        raise ValueError(
            "predict_col {!r} is not in any group of tool.types".format(
                predict_col))
    del_cols.append(test_label_col)

    # Replace missing values in every column except the target and the test
    # flag.  ``fillna`` with a dict returns new frames, so the caller's data
    # is not overwritten with the -1000 sentinel.
    fill_values = {
        c: -1000
        for c in all_feature.columns
        if c != predict_col and c != test_label_col
    }
    all_feature = all_feature.fillna(value=fill_values)
    predict_feature = predict_feature.fillna(value=fill_values)

    is_enum = predict_col in enum_col or predict_col in ext_enum_col

    k_v = {}
    if is_enum:
        # Leave target values with too few samples out of training.
        number_limit = large_limit_col.get(predict_col, 10)
        # Per-row number of occurrences of the row's target value.  Rows with
        # a NaN target get a NaN count and never pass the ">=" filter below.
        value_count = all_feature[predict_col].map(
            all_feature[predict_col].value_counts())
        is_test = all_feature[test_label_col] == 1
        del_test_size = int((is_test & (value_count < number_limit)).sum())
        print(predict_col, "del_test_size:", del_test_size)

        # The complete validation set, including rows with rare values; the
        # final score is computed on it.
        test_feature_org = all_feature[is_test]
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop(del_cols, axis=1))
        print("test_x_org", test_x_org.shape)

        all_feature = all_feature[value_count >= number_limit].copy()

        # Convert target values to class ids 0..n-1.
        all_y = sorted(set(all_feature[predict_col]))

        if len(all_y) == 1:
            # Only one value is left: return it directly as the prediction.
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1

        v_k = {v: k for k, v in enumerate(all_y)}
        k_v = dict(enumerate(all_y))
        all_feature[predict_col] = np.array(
            [v_k[v] for v in all_feature[predict_col]])

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop(del_cols, axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop(del_cols, axis=1))
    predict_x = np.array(predict_feature.drop(del_cols, axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape, "predict_x",
          predict_x.shape)

    if predict_col in category_col or predict_col in ext_enum_col:
        temp_model = RandomForestClassifier(n_jobs=num_threads)
    else:
        temp_model = RandomForestRegressor(n_jobs=num_threads)
    temp_model.fit(train_x, train_y)
    test_pred = temp_model.predict(test_x)

    # Map predicted class ids back to the original target values.
    if is_enum:
        # Score on the original, complete validation set.
        if del_test_size > 0:
            test_pred = temp_model.predict(test_x_org)
        test_y = test_y_org
        test_pred = np.array([k_v[i] for i in test_pred])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # Rounding to two, one or zero decimals may score better.  ``None`` means
    # "do not round"; 0 is a valid choice and must not be treated as falsy.
    round_decimals = None
    for decimals in (2, 1, 0):
        rounded_score = tool.regression_score(test_y,
                                              np.round(test_pred, decimals))
        if test_s < rounded_score - threshold:
            round_decimals = decimals
            test_s = rounded_score

    print("test score: ", test_s)

    predict_target = np.array(temp_model.predict(predict_x))
    if is_enum:
        predict_target = np.array([k_v[i] for i in predict_target])
    if round_decimals is not None:
        predict_target = np.round(predict_target, round_decimals)

    return predict_target, test_s
예제 #5
0
def top_predict():
    """Fill missing values with the most frequent value and write the results.

    For every ``wtid`` in 1..33 and every column in ``var_col`` the most
    frequent training value is used as the prediction.  If its validation
    score exceeds 0.1 and a second most frequent value exists, that value is
    tried as well and used when it scores higher.  The filled rows that had
    missing values (``count_miss > 0``) are written to
    ``./result/top_result.csv`` and the validation scores to
    ``./result/top_score.csv``.
    """
    data = pd.read_hdf(data_path)

    score_df = pd.DataFrame()
    score_df["var"] = list(var_col)
    # Collect the per-turbine frames and concatenate once at the end instead
    # of re-copying the growing result on every iteration.
    filled_frames = []
    start = datetime.datetime.now()

    for wtid in tqdm(range(1, 34)):
        # Explicit copy: values are written into ``use_data`` below and it
        # must not alias ``data``.
        use_data = data[data["wtid"] == wtid].copy()
        test_scores = []

        for var in var_col:
            train_data = use_data[pd.notna(use_data[var])]
            predict_data = use_data[pd.isna(use_data[var])]

            # Position of the tool.types group that contains ``var``; it
            # selects the matching "<index>_test" flag column.
            index = 0
            for index, t in enumerate(tool.types):
                if var in t:
                    break
            test_label_col = str(index) + "_test"

            train_feature = train_data[train_data[test_label_col] == 0]
            # Distinct training values, most frequent first.
            top_values = train_feature[var].value_counts().index
            test_feature = train_data[train_data[test_label_col] == 1]
            test_y = np.array(test_feature[var])

            if var in category_col:
                score_func = tool.label_score
            else:
                score_func = tool.regression_score

            # Use the most frequent value.
            test_pred = np.array([top_values[0]] * len(test_y))
            predict_y = np.array([top_values[0]] * len(predict_data))
            test_score = score_func(test_y, test_pred)

            # Check the second most frequent value.
            if test_score > 0.1 and len(top_values) > 1:
                test_pred2 = np.array([top_values[1]] * len(test_y))
                test_score2 = score_func(test_y, test_pred2)
                if test_score2 > test_score:
                    test_score = test_score2
                    predict_y = np.array([top_values[1]] * len(predict_data))

            test_scores.append(test_score)
            use_data.loc[predict_data.index, var] = predict_y

        score_df[str(wtid)] = test_scores
        filled_frames.append(use_data[use_data["count_miss"] > 0])

    final_result = pd.concat(filled_frames, axis=0, ignore_index=True)
    final_result = final_result[head_col]
    final_result = final_result.sort_values(["wtid", "ts"])
    final_result.to_csv("./result/top_result.csv",
                        encoding="utf8",
                        index=False,
                        float_format='%.2f')

    score_df.set_index("var", inplace=True)
    score_df = score_df.T
    score_df.reset_index(inplace=True)
    score_df.rename(columns={"index": "wtid"}, inplace=True)
    score_df.to_csv("./result/top_score.csv",
                    encoding="utf8",
                    index=False,
                    float_format='%.4f')
    end = datetime.datetime.now()
    print("finish top_predict time: ", end - start, "\n")