Example #1
def get_standardscaler(samples_path, channelid):
    data_manager = DataManager(channelid=channelid)
    data_manager.loadLabeledPoint(samples_path)
    X_res, Y_res = resamples(data_manager.data)
    scaler = preprocessing.StandardScaler().fit(X_res)
    # persist the fitted scaler so prediction code can reuse it
    pickle.dump(scaler, open(workdir + "scaler_{}.pkl".format(channelid), "wb"))
    return scaler
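
The persisted scaler can later be reloaded and applied at prediction time. A minimal sketch, assuming the same workdir and channelid used above (the values here are placeholders):

import pickle

import numpy as np

workdir = "./"   # hypothetical working directory
channelid = 4    # channel id used when the scaler was saved

# reload the scaler persisted by get_standardscaler
with open(workdir + "scaler_{}.pkl".format(channelid), "rb") as f:
    scaler = pickle.load(f)

# standardize new feature vectors with the training-time statistics
X_new = np.random.rand(5, scaler.mean_.shape[0])
X_new_std = scaler.transform(X_new)
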
Example #2
    def run(self,
            samples_path,
            model_save_path,
            algo="ffm",
            params=None,
            df_path=None,
            channelid=4,
            phase="train"):
        if df_path is None and not os.path.exists(
                samples_path + ".pkl") and not os.path.exists(samples_path):
            print(samples_path + " does not exist")
            return

        pipelineModel = None
        if df_path is None:
            """
            实际上这个分支已经不支持了
            """
            data_manager = DataManager(channelid=channelid,
                                       config_param=config_param)
            data_manager.loadLabeledPoint(samples_path)
            x, y = CtrExperiment.resamples(data_manager.data)
            pipelineModel = CtrExperiment.experiment(x, y, algo, params=params)
        else:
            config_param[algo]["phase"] = phase
            CtrExperiment.experiment(None,
                                     None,
                                     algo,
                                     params=params,
                                     df_path=df_path)

        if pipelineModel is not None:
            CtrExperiment.saveModel2PMMLFormat(pipelineModel, model_save_path)
            pickle.dump(
                pipelineModel,
                open(
                    config_param["workdir"][platform.system()] + algo +
                    "_{}.pkl".format(channelid), "wb"))
Example #3
    pos_samples = data_manager_instance.data["pos_samples"]["X"]
    neg_samples = data_manager_instance.data["neg_samples"]["X"]

    pos_columns = (pos_samples != 0).sum(0) / pos_samples.shape[0]
    neg_columns = (neg_samples != 0).sum(0) / neg_samples.shape[0]

    x = linspace(0, pos_columns.shape[0], pos_columns.shape[0])
    plt.plot(x, pos_columns, 'r', linewidth=2)
    # plt.plot(x, neg_columns, 'b', linewidth=2)
    plt.xlabel(r'feature', fontsize=16)
    plt.ylabel(r'nonzero ratio', fontsize=16)
    plt.savefig(config_param["workdir"][platform.system()] +
                "explore_nonz-pos.png")

    plt.figure()
    plt.plot(x, neg_columns, 'b', linewidth=2)
    plt.xlabel(r'feature', fontsize=16)
    plt.ylabel(r'nonzero ratio', fontsize=16)
    plt.savefig(config_param["workdir"][platform.system()] +
                "explore_nonz-neg.png")


if __name__ == "__main__":
    channelid = 4
    data_manager_instance = DataManager(
        channelid=channelid,
        workdir=config_param["workdir"][platform.system()])

    explore_nonz(config_param["workdir"][platform.system()] +
                 "samples-optimization_{}.labeledpoint".format(channelid))
Example #4
    with open(workdir + "ffm_train.txt", "w", encoding="utf-8") as file_write:
        for idx in pos_list[:int(pos_cnt * 0.7)]:
            file_write.write(ffm_data["pos_samples"][idx] + "\n")
        for idx in samples_index[:int(neg_cnt * 0.7)]:
            file_write.write(ffm_data["neg_samples"][idx] + "\n")

    with open(workdir + "ffm_vaild.txt", "w", encoding="utf-8") as file_write:
        for idx in pos_list[int(pos_cnt * 0.7):int(pos_cnt * 0.8)]:
            file_write.write(ffm_data["pos_samples"][idx] + "\n")
        for idx in samples_index[int(neg_cnt * 0.7):int(neg_cnt * 0.8)]:
            file_write.write(ffm_data["neg_samples"][idx] + "\n")

    with open(workdir + "ffm_test.txt", "w", encoding="utf-8") as file_write:
        for idx in pos_list[int(pos_cnt * 0.8):]:
            file_write.write(ffm_data["pos_samples"][idx] + "\n")
        for idx in samples_index[int(neg_cnt * 0.8):]:
            file_write.write(ffm_data["neg_samples"][idx] + "\n")


if __name__ == "__main__":
    config_param = yaml.safe_load(open("config.yml", "r", encoding="utf-8"))
    workdir = config_param["workdir"][platform.system()]
    dm_instance = DataManager(4, workdir)
    ffm_field_idx_path = workdir + "field_idx_4.ffm"

    generate_ffm_data(
        workdir + "samples-optimization_{}.labeledpoint".format(4),
        ffm_format_path=workdir + "samples-optimization_{}.ffm".format(4))
    make_ffm_train_vaild_test_data(ffm_format_path=workdir +
                                   "samples-optimization_{}.ffm".format(4))
Example #5
    def experiment(x, y, algo, params=None, df_path=None):
        x_train, x_test, y_train, y_test = None, None, None, None
        if df_path is None:
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=42)
            print("train samples dim", x_train.shape, "test samples dim",
                  x_test.shape)

            # normalize features to [0, 1]; note the scaler is fitted on the
            # full dataset, so test-set statistics leak into the scaling
            min_max_scaler = preprocessing.MinMaxScaler().fit(x)
            x_train = np.around(min_max_scaler.transform(x_train), 4)
            x_test = np.around(min_max_scaler.transform(x_test), 4)

        print("runing algorithm:" + algo)

        if algo == "gbdt":
            return GbdtExperiment.train(x_train, y_train, params)
        elif algo == "lr":
            clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
            clf_l2_LR.fit(x_train, y_train)
            y_pre_train = clf_l2_LR.predict(x_train)
            # print(classification_report(y_train, y_pre_train, target_names=["exposure", "click"]))
        elif algo == "gbdt_plus_lr":
            return GbdtPlusLrExperiment.experiment(x_train, x_test, y_train,
                                                   y_test, params)
            # # todo 类别变量与连续变量分开
            # X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
            # params_ = params.copy()
            # params_["n_estimators"] = 40
            # pipelineModel = GbdtExperiment.train(X_train, y_train, params_)
            # gbt_enc = OneHotEncoder()
            # gbt_enc.fit(pipelineModel.named_steps['gbtclassifier'].apply(X_train)[:, :, 0])
            #
            # grd_lm = LogisticRegression(max_iter=300)
            # grd_lm.fit(gbt_enc.transform(pipelineModel.named_steps['gbtclassifier'].apply(X_train_lr)[:, :, 0]),
            #            y_train_lr)
            #
            # y_pred_grd_lm = grd_lm.predict(
            #     gbt_enc.transform(pipelineModel.named_steps['gbtclassifier'].apply(X_test)[:, :, 0]))  # [:, 1]
            # print(classification_report(y_test, y_pred_grd_lm, target_names=["exposure", "click"]))

        elif algo == "xgboost":
            pipelineModel = XgboostExperiment.experiment(
                x_train, y_train, params)
            evaluateOnTrainAndTest(y_train, y_test,
                                   pipelineModel.predict(x_train),
                                   pipelineModel.predict(x_test))
            return pipelineModel
        elif algo == "xgboost_plus_fm":
            pass
        elif algo == "xgboost_plus_ffm":
            xgboost_plust_ffm = XgboostPlusFFMExperiment(
                config_params=config_param)
            phase = config_param[algo]["phase"]
            if phase.startswith("train"):
                xgboost_plust_ffm.experiment()
            else:
                xgboost_plust_ffm.experiment(
                    DataManager.load_dataframe(df_path, 10000000))
        elif algo == "lightgbm_plus_ffm":
            lgbm_plust_ffm = LightGbmPlusFFMExperiment(
                config_params=config_param)
            phase = config_param[algo]["phase"]
            if phase.startswith("train"):
                lgbm_plust_ffm.experiment()
            else:
                lgbm_plust_ffm.experiment(
                    DataManager.load_dataframe(df_path, 10000000))
        elif algo == "ffm":
            ffm = FFMExperiment(config_params=config_param)
            phase = config_param[algo]["phase"]
            if phase.startswith("train") or phase == "emsemble":
                ffm.experiment()
            else:
                ffm.experiment(DataManager.load_dataframe(df_path, 10000000))
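
As noted in the comment above, MinMaxScaler is fitted on the full dataset before splitting, which leaks test-set statistics into the scaling. A leakage-free variant fits on the training split only; a minimal sketch with synthetic data:

import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# fit the scaler on the training split only, then apply to both splits
min_max_scaler = preprocessing.MinMaxScaler().fit(x_train)
x_train = np.around(min_max_scaler.transform(x_train), 4)
x_test = np.around(min_max_scaler.transform(x_test), 4)
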
Example #6
         samples["neg_samples"]["Y"].shape[0]))

    scaler = preprocessing.StandardScaler().fit(X)
    x_scaler = scaler.transform(X)

    pipelineModel = pickle.load(open(workdir + "xgboost_4.pkl", "rb"))

    y_pre_train = pipelineModel.named_steps['xgbclassifier'].predict_proba(
        x_scaler)

    pred_click_num = sum([
        1 if y_pre_train[i][1] >= 0.5 else 0
        for i in range(y_pre_train.shape[0])
    ])

    print("ctr_pre_avg = ", pred_click_num / y_pre_train.shape[0])


if __name__ == "__main__":
    workdir = "E:/Work/jobs/data/DSP/CTR预估/samples/"
    if platform.system() == "Linux":
        workdir = "/data/kongyy/ctr/"
    config_param = yaml.safe_load(open("config.yml", "r", encoding="utf-8"))

    channelid = '4'

    data_manager = DataManager(channelid=channelid, workdir=workdir)
    data_manager.loadLabeledPoint(
        workdir + "samples-optimization_labeledpoint_{}".format(channelid))
    get_avg_ctr()
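
The thresholded count above is equivalent to the mean of a boolean mask over the positive-class probability column; a small vectorized sketch with dummy probabilities:

import numpy as np

# dummy predict_proba output: column 1 is the positive-class probability
y_pre_train = np.array([[0.9, 0.1], [0.3, 0.7], [0.4, 0.6]])

# fraction of samples whose click probability reaches the 0.5 threshold
ctr_pre_avg = float(np.mean(y_pre_train[:, 1] >= 0.5))
print("ctr_pre_avg = ", ctr_pre_avg)  # 0.666...
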
Example #7
from dsp.ctr.data_manager import DataManager
from dsp.utils.data_utils import *

# Test whether the feature vectors generated by Python match those generated by Java


if __name__ == "__main__":
    config_param = yaml.safe_load(open("config.yml", "r", encoding="utf-8"))
    if platform.system() == "Linux":
        workdir = config_param["work_dir"]["Linux"]
    else:
        workdir = config_param["work_dir"]["Windows"]

    channelid = 4
    predict_day = "2018-08-09"
    data_manager = DataManager(channelid=channelid, workdir=workdir)

    if not os.path.exists(workdir + "/samples-optimization_test_std"):
        std_raw_samples(workdir + "/samples-optimization_test")
    data_manager.load_raw_fields(workdir + "samples-optimization_test_std",
                                 workdir + "samples-optimization_test_labeledpoint_{}".format(channelid),
                                 {
                                     "creativeid": workdir + "ctr_dsp_creativeid_statistics.csv",
                                     "adid": workdir + "ctr_dsp_adid_statistics.csv",
                                     "advertiserid": workdir + "ctr_dsp_advertiserid_statistics.csv",
                                 },
                                 predict_day
                                 )
    print("end")