Пример #1
0
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate the configured model and write a submission file.

    A fresh ``DeepFM`` instance is built and fitted for every fold. Its
    predictions on the fold's validation rows become the out-of-fold
    predictions, and its test predictions are averaged over all folds.

    Args:
        dfTrain: Training frame; must expose ``shape`` and be accepted by
            ``DataParser.parse``.
        dfTest: Test frame with the same requirements.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DeepFM``. Must contain ``"epoch"``
            and ``"module_name"`` (plus ``"use_fm"`` / ``"use_deep"`` when the
            module is ``"DeepFM"``). ``"feature_size"`` and ``"field_size"``
            are written into this dict in place.

    Returns:
        ``(y_train_meta, y_test_meta)``: out-of-fold predictions of shape
        ``(n_train, 1)`` and fold-averaged test predictions of shape
        ``(n_test, 1)``.

    Raises:
        ValueError: If ``dfm_params`` does not select a supported model.
    """
    # Resolve the report label before any training happens: previously an
    # unsupported configuration only failed (with UnboundLocalError) after
    # every fold had already been trained.
    module_name = dfm_params["module_name"]
    if module_name == "DeepFM":
        if dfm_params["use_fm"] and dfm_params["use_deep"]:
            clf_str = "DeepFM"
        elif dfm_params["use_fm"]:
            clf_str = "FM"
        elif dfm_params["use_deep"]:
            clf_str = "DNN"
        else:
            raise ValueError(
                "module_name 'DeepFM' requires use_fm and/or use_deep")
    elif module_name in ("LR", "WideDeep"):
        clf_str = module_name
    else:
        raise ValueError("unsupported module_name: %r" % (module_name,))

    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    def _get(rows, indices):
        # Row selection on plain Python lists (no fancy indexing available).
        return [rows[i] for i in indices]

    n_folds = len(folds)
    gini_results_cv = np.zeros(n_folds, dtype=float)
    gini_results_epoch_train = np.zeros((n_folds, dfm_params["epoch"]),
                                        dtype=float)
    gini_results_epoch_valid = np.zeros((n_folds, dfm_params["epoch"]),
                                        dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_ = _get(Xi_train, train_idx)
        Xv_train_ = _get(Xv_train, train_idx)
        y_train_ = _get(y_train, train_idx)
        Xi_valid_ = _get(Xi_train, valid_idx)
        Xv_valid_ = _get(Xv_train, valid_idx)
        y_valid_ = _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        # Summed here, divided by the fold count after the loop.
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(n_folds)

    # save result
    gini_mean = gini_results_cv.mean()
    gini_std = gini_results_cv.std()
    print("%s: %.5f (%.5f)" % (clf_str, gini_mean, gini_std))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_mean, gini_std)
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
Пример #2
0
    def eval_on_dev(split_vector_data):
        """Score the model on ``dev_data`` in mini-batches.

        Reads ``dev_data``, ``graph_hyper_params``, ``sess`` and ``pred_val``
        from the enclosing scope. ``dev_data`` must support ``len``, row
        slicing and ``['label'].values`` (presumably a pandas DataFrame;
        confirm against the caller).

        Args:
            split_vector_data: Passed through unchanged to ``get_fed_dict``.

        Returns:
            ``(auc, gini)``: ROC AUC from ``metrics`` and ``gini_norm`` over
            all dev predictions.
        """
        batch_size = graph_hyper_params['batch_size']
        n_rows = len(dev_data)
        # Ceiling division: true division yields a float that range() rejects
        # on Python 3, and floor division would silently drop the last,
        # partially filled batch from the metrics.
        n_batches = (n_rows + batch_size - 1) // batch_size

        auc_true, auc_pre = [], []
        for index in tqdm(range(n_batches)):
            start = index * batch_size
            end = min(start + batch_size, n_rows)
            b_dev_data = dev_data[start:end]
            fed_dict = get_fed_dict(b_dev_data, split_vector_data)
            pred_value = sess.run([pred_val], feed_dict=fed_dict)

            pre_real_val = np.array(pred_value).reshape((-1))
            # extend() avoids rebuilding both lists on every batch.
            auc_true.extend(b_dev_data['label'].values)
            auc_pre.extend(pre_real_val.tolist())

        fpr, tpr, _ = metrics.roc_curve(auc_true, auc_pre, pos_label=1)
        return metrics.auc(fpr, tpr), gini_norm(auc_true, auc_pre)
Пример #3
0
        def eval_on_dev(split_vector_data):
            """Score the model on ``dev_data`` in mini-batches, with NaN diagnostics.

            Reads ``dev_data``, ``graph_hyper_params``, ``feature_conf_dict``,
            ``sess``, ``pred_val`` and ``network_params`` from the enclosing
            scope. When a batch produces NaN predictions, the four
            ``network_params`` tensors fetched for that batch are printed.

            Args:
                split_vector_data: Passed through unchanged to ``get_fed_dict``.

            Returns:
                ``(auc, gini)``: ROC AUC from ``metrics`` and ``gini_norm``
                over all dev predictions.
            """
            batch_size = graph_hyper_params['batch_size']
            n_rows = len(dev_data)
            # Ceiling division: true division yields a float that range()
            # rejects on Python 3, and floor division would silently drop the
            # last, partially filled batch from the metrics.
            n_batches = (n_rows + batch_size - 1) // batch_size

            auc_true, auc_pre = [], []
            for index in tqdm(range(n_batches)):
                start = index * batch_size
                end = min(start + batch_size, n_rows)
                b_dev_data = dev_data[start:end]
                fed_dict = get_fed_dict(b_dev_data, split_vector_data, feature_conf_dict)
                pred_value, pre_pred_value, final_vec, uu, vv = sess.run(
                    [pred_val, network_params[0], network_params[1],
                     network_params[2], network_params[3]],
                    feed_dict=fed_dict)

                pre_real_val = np.array(pred_value).reshape((-1))
                # extend() avoids rebuilding both lists on every batch.
                auc_true.extend(b_dev_data['label'].values)
                auc_pre.extend(pre_real_val.tolist())

                if np.isnan(pre_real_val).any():
                    # Single-argument print() calls emit the same text on
                    # Python 2 and 3 (the old print statements were Python 2
                    # only); the double space mirrors the old output.
                    print('contain nan:  %s' % (np.array(pre_pred_value).reshape((-1)),))
                    print(np.array(final_vec).reshape((-1)))
                    print(np.array(uu).reshape((-1)))
                    print(np.array(vv).reshape((-1)))

            fpr, tpr, _ = metrics.roc_curve(auc_true, auc_pre, pos_label=1)
            auc_v, gni = metrics.auc(fpr, tpr), gini_norm(auc_true, auc_pre)

            # Report the two largest and two smallest predictions as a quick
            # sanity check on the output range (needs at least two rows).
            ranked = np.sort(np.array(auc_pre))
            print('dev_pre_top2=%.4f %.4f min2=%.4f %.4f' %
                  (ranked[-1], ranked[-2], ranked[0], ranked[1]))
            return auc_v, gni
Пример #4
0
def val(model, dataloader):
    """Compute the mean loss and normalized gini of ``model`` on a validation set.

    The model is switched to eval mode and run with gradient tracking
    disabled; it is always switched back to train mode afterwards, even if a
    batch raises. Uses the module-level ``device`` and ``criterion``.

    Args:
        model: Callable as ``model(Xi_batch, Xv_batch)`` that also provides
            ``eval()`` and ``train()``.
        dataloader: Iterable of ``(Xi_batch, Xv_batch, y_batch)`` tensors.

    Returns:
        ``(loss_val, gini_val)``: the mean of the per-batch losses, and
        ``gini_norm`` of the labels against the sigmoid of the model outputs.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    import torch  # local import so the block does not rely on file-level names

    batch_losses = []
    y_true = []
    y_pre = []

    model.eval()
    try:
        # Validation never calls backward(), so skip building autograd graphs.
        with torch.no_grad():
            for Xi_batch, Xv_batch, y_batch in dataloader:
                Xi_batch = Xi_batch.to(device)
                Xv_batch = Xv_batch.to(device)
                y_batch = y_batch.to(device)

                outputs = model(Xi_batch, Xv_batch)
                loss = criterion(outputs, y_batch)

                y_true.append(y_batch.detach().cpu().numpy())
                # gini is computed on probabilities, not on the raw outputs.
                prob = torch.sigmoid(outputs)
                y_pre.append(prob.detach().cpu().numpy())

                batch_losses.append(loss.item())
    finally:
        # Hand the model back in training mode regardless of failures.
        model.train()

    if not batch_losses:
        raise ValueError("dataloader yielded no batches")

    loss_val = sum(batch_losses) / len(batch_losses)

    y_true = np.concatenate(y_true, axis=0)
    y_pre = np.concatenate(y_pre, axis=0)
    gini_val = gini_norm(y_true, y_pre)

    return loss_val, gini_val
Пример #5
0
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, save_path: str, past_epoch: int = 0):
    """Continue training a single DeepFM across all folds, checkpointing after each.

    Unlike a per-fold setup, one ``DeepFM`` instance is created up front
    (optionally restored from ``save_path + '-' + str(past_epoch)``) and
    ``fit`` is called on it once per fold.

    Args:
        dfTrain: Training frame; must expose ``shape`` and be accepted by
            ``DataParser.parse``.
        dfTest: Test frame with the same requirements.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DeepFM``; must contain ``"epoch"``,
            ``"use_fm"`` and ``"use_deep"``. ``"feature_size"`` and
            ``"field_size"`` are written into it in place.
        save_path: Checkpoint path prefix handed to ``dfm.saver``.
        past_epoch: Epoch count of the checkpoint to restore; ``0`` skips the
            restore.

    Returns:
        ``(y_train_meta, y_test_meta, dfm)``: out-of-fold predictions,
        fold-averaged test predictions and the trained model.
    """
    feat_dict = FeatureDictionary(
        dfTrain=dfTrain,
        dfTest=dfTest,
        numeric_cols=config.NUMERIC_COLS,
        ignore_cols=config.IGNORE_COLS,
    )
    parser = DataParser(feat_dict=feat_dict)
    Xi_all, Xv_all, y_all = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)

    dfm_params["feature_size"] = feat_dict.feat_dim
    dfm_params["field_size"] = len(Xi_all[0])

    oof_pred = np.zeros((dfTrain.shape[0], 1), dtype=float)
    test_pred = np.zeros((dfTest.shape[0], 1), dtype=float)

    def pick(rows, positions):
        # Row selection on plain Python lists.
        return [rows[p] for p in positions]

    fold_gini = np.zeros(len(folds), dtype=float)

    dfm = DeepFM(**dfm_params)
    if past_epoch != 0:
        checkpoint = save_path + '-' + str(past_epoch)
        dfm.saver.restore(dfm.sess, checkpoint)

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        Xi_tr, Xv_tr, y_tr = (pick(part, train_idx) for part in (Xi_all, Xv_all, y_all))
        Xi_va, Xv_va, y_va = (pick(part, valid_idx) for part in (Xi_all, Xv_all, y_all))

        dfm.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)

        oof_pred[valid_idx, 0] = dfm.predict(Xi_va, Xv_va)
        # Summed here, divided by the fold count after the loop.
        test_pred[:, 0] += dfm.predict(Xi_test, Xv_test)

        fold_gini[fold_no] = gini_norm(y_va, oof_pred[valid_idx])

        # The checkpoint step counts every epoch trained so far.
        step = past_epoch + dfm_params["epoch"] * (fold_no + 1)
        dfm.saver.save(dfm.sess, save_path, global_step=step)

    test_pred /= float(len(folds))

    # save result
    fm_on = dfm_params["use_fm"]
    deep_on = dfm_params["use_deep"]
    if fm_on:
        clf_str = "DeepFM" if deep_on else "FM"
    elif deep_on:
        clf_str = "DNN"

    gini_mean = fold_gini.mean()
    gini_std = fold_gini.std()
    print("%s: %.5f (%.5f)" % (clf_str, gini_mean, gini_std))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_mean, gini_std)
    _make_submission(ids_test, test_pred, filename)

    # The single model's per-epoch history spans all folds, so it is plotted
    # as one long curve (one row of epoch * n_folds points).
    total_epochs = dfm_params["epoch"] * len(folds)
    train_curve = np.zeros((1, total_epochs), dtype=float)
    valid_curve = np.zeros((1, total_epochs), dtype=float)
    train_curve[0] = dfm.train_result
    valid_curve[0] = dfm.valid_result
    _plot_fig(train_curve, valid_curve, clf_str)

    return oof_pred, test_pred, dfm
Пример #6
0
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate DeepFM and write averaged test predictions to a submission.

    A new ``DeepFM`` is built and fitted for every fold; its validation
    predictions fill the out-of-fold array and its test predictions are
    averaged over the folds.

    Args:
        dfTrain: Training frame; must expose ``shape`` and be accepted by
            ``DataParser.parse``.
        dfTest: Test frame with the same requirements.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DeepFM``; must contain ``"epoch"``,
            ``"use_fm"`` and ``"use_deep"``. ``"feature_size"`` and
            ``"field_size"`` are written into it in place.

    Returns:
        ``(y_train_meta, y_test_meta)``: out-of-fold predictions of shape
        ``(n_train, 1)`` and fold-averaged test predictions of shape
        ``(n_test, 1)``.
    """
    feat_dict = FeatureDictionary(
        dfTrain=dfTrain,
        dfTest=dfTest,
        numeric_cols=config.NUMERIC_COLS,
        ignore_cols=config.IGNORE_COLS,
    )
    parser = DataParser(feat_dict=feat_dict)
    Xi_all, Xv_all, y_all = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)

    dfm_params["feature_size"] = feat_dict.feat_dim
    dfm_params["field_size"] = len(Xi_all[0])

    oof_pred = np.zeros((dfTrain.shape[0], 1), dtype=float)
    test_pred = np.zeros((dfTest.shape[0], 1), dtype=float)

    def pick(rows, positions):
        # Row selection on plain Python lists.
        return [rows[p] for p in positions]

    n_folds = len(folds)
    fold_gini = np.zeros(n_folds, dtype=float)
    curve_shape = (n_folds, dfm_params["epoch"])
    train_curve = np.zeros(curve_shape, dtype=float)
    valid_curve = np.zeros(curve_shape, dtype=float)

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        Xi_tr, Xv_tr, y_tr = (pick(part, train_idx) for part in (Xi_all, Xv_all, y_all))
        Xi_va, Xv_va, y_va = (pick(part, valid_idx) for part in (Xi_all, Xv_all, y_all))

        # Example parameter set recorded by the original author:
        #   use_fm=True, use_deep=True, embedding_size=8, dropout_fm=[1.0, 1.0],
        #   deep_layers=[32, 32], dropout_deep=[0.5, 0.5, 0.5], epoch=30,
        #   batch_size=1024, learning_rate=0.001, optimizer_type='adam',
        #   batch_norm=1, batch_norm_decay=0.995, l2_reg=0.01, verbose=True,
        #   random_seed=2017, feature_size=259, field_size=39
        model = DeepFM(**dfm_params)
        model.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)

        oof_pred[valid_idx, 0] = model.predict(Xi_va, Xv_va)
        test_pred[:, 0] += model.predict(Xi_test, Xv_test)

        fold_gini[fold_no] = gini_norm(y_va, oof_pred[valid_idx])
        # Per the original author's note, train_result holds the training-set
        # gini for each epoch.
        train_curve[fold_no] = model.train_result
        valid_curve[fold_no] = model.valid_result

    # test_pred holds the sum over folds; dividing turns it into the mean.
    test_pred /= float(n_folds)

    # save result
    fm_on = dfm_params["use_fm"]
    deep_on = dfm_params["use_deep"]
    if fm_on:
        clf_str = "DeepFM" if deep_on else "FM"
    elif deep_on:
        clf_str = "DNN"

    gini_mean = fold_gini.mean()
    gini_std = fold_gini.std()
    print("%s: %.5f (%.5f)" % (clf_str, gini_mean, gini_std))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_mean, gini_std)
    _make_submission(ids_test, test_pred, filename)

    _plot_fig(train_curve, valid_curve, clf_str)

    return oof_pred, test_pred
Пример #7
0
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Run k-fold training of DeepFM and produce a submission plus learning curves.

    Args:
        dfTrain: Training frame; must expose ``shape`` and ``dtypes`` and be
            accepted by ``DataParser.parse``.
        dfTest: Test frame; must expose ``shape``.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DeepFM``; must contain ``"epoch"``,
            ``"use_fm"`` and ``"use_deep"``. ``'feature_size'`` and
            ``'field_size'`` are written into it in place.

    Returns:
        ``(y_train_meta, y_test_meta)``: out-of-fold predictions of shape
        ``(n_train, 1)`` and fold-averaged test predictions of shape
        ``(n_test, 1)``.
    """
    feature_dict = FeatureDictionary(
        dfTrain=dfTrain,
        dfTest=dfTest,
        numeric_cols=config.NUMERIC_COLS,
        ignore_cols=config.IGNORE_COLS,
    )
    parser = DataParser(feat_dict=feature_dict)
    # Per the original author's notes: Xi_* hold column (feature) indices and
    # Xv_* hold the corresponding values.
    Xi_all, Xv_all, y_all = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)

    print(dfTrain.dtypes)

    dfm_params['feature_size'] = feature_dict.feat_dim
    dfm_params['field_size'] = len(Xi_all[0])

    oof_pred = np.zeros((dfTrain.shape[0], 1), dtype=float)
    test_pred = np.zeros((dfTest.shape[0], 1), dtype=float)

    def select(rows, wanted):
        # Row selection on plain Python lists.
        return [rows[k] for k in wanted]

    fold_count = len(folds)
    cv_gini = np.zeros(fold_count, dtype=float)
    epoch_gini_train = np.zeros((fold_count, dfm_params['epoch']), dtype=float)
    epoch_gini_valid = np.zeros((fold_count, dfm_params['epoch']), dtype=float)

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        Xi_tr = select(Xi_all, train_idx)
        Xv_tr = select(Xv_all, train_idx)
        y_tr = select(y_all, train_idx)
        Xi_va = select(Xi_all, valid_idx)
        Xv_va = select(Xv_all, valid_idx)
        y_va = select(y_all, valid_idx)

        model = DeepFM(**dfm_params)
        model.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)

        oof_pred[valid_idx, 0] = model.predict(Xi_va, Xv_va)
        # Summed here, divided by the fold count after the loop.
        test_pred[:, 0] += model.predict(Xi_test, Xv_test)

        cv_gini[fold_no] = gini_norm(y_va, oof_pred[valid_idx])
        epoch_gini_train[fold_no] = model.train_result
        epoch_gini_valid[fold_no] = model.valid_result

    test_pred /= float(fold_count)

    # save result
    fm_on = dfm_params["use_fm"]
    deep_on = dfm_params["use_deep"]
    if fm_on:
        clf_str = "DeepFM" if deep_on else "FM"
    elif deep_on:
        clf_str = "DNN"

    mean_gini = cv_gini.mean()
    std_gini = cv_gini.std()
    print("%s: %.5f (%.5f)" % (clf_str, mean_gini, std_gini))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, mean_gini, std_gini)
    _make_submission(ids_test, test_pred, filename)

    _plot_fig(epoch_gini_train, epoch_gini_valid, clf_str)

    return oof_pred, test_pred
Пример #8
0
def _run_base_model_dfm(Xi_train, Xv_train, y_train, Xi_test, Xv_test,
                        ids_test, cate_cnt, folds, dfm_params):
    """Cross-validate a DCN model on pre-parsed inputs and write a submission.

    Args:
        Xi_train: Training inputs; must expose ``shape`` and support integer
            row indexing (rows are passed to ``DCN`` as the first input).
        Xv_train: Training inputs passed to ``DCN`` as the second input.
        y_train: Training labels, indexable by row.
        Xi_test: Test counterpart of ``Xi_train``; must expose ``shape``.
        Xv_test: Test counterpart of ``Xv_train``.
        ids_test: Test identifiers forwarded to ``_make_submission``.
        cate_cnt: Stored as ``dfm_params["cate_feature_size"]``.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DCN``; must contain ``"epoch"``,
            ``"use_cross"`` and ``"use_deep"``. The three size entries are
            written into it in place.

    Returns:
        ``(y_train_meta, y_test_meta)``: out-of-fold predictions of shape
        ``(n_train, 1)`` and fold-averaged test predictions of shape
        ``(n_test, 1)``.
    """
    dfm_params["cate_feature_size"] = cate_cnt
    dfm_params["cate_field_size"] = len(Xi_train[0])
    dfm_params["num_field_size"] = len(Xv_train[0])

    oof_pred = np.zeros((Xi_train.shape[0], 1), dtype=float)
    test_pred = np.zeros((Xi_test.shape[0], 1), dtype=float)

    def pick(rows, positions):
        # Row-by-row selection; yields a plain list of rows.
        return [rows[p] for p in positions]

    n_folds = len(folds)
    fold_gini = np.zeros(n_folds, dtype=float)
    curve_shape = (n_folds, dfm_params["epoch"])
    train_curve = np.zeros(curve_shape, dtype=float)
    valid_curve = np.zeros(curve_shape, dtype=float)

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        Xi_tr, Xv_tr, y_tr = (
            pick(part, train_idx) for part in (Xi_train, Xv_train, y_train))
        Xi_va, Xv_va, y_va = (
            pick(part, valid_idx) for part in (Xi_train, Xv_train, y_train))

        model = DCN(**dfm_params)
        model.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)

        oof_pred[valid_idx, 0] = model.predict(Xi_va, Xv_va)
        # Summed here, divided by the fold count after the loop.
        test_pred[:, 0] += model.predict(Xi_test, Xv_test)

        fold_gini[fold_no] = gini_norm(y_va, oof_pred[valid_idx])
        train_curve[fold_no] = model.train_result
        valid_curve[fold_no] = model.valid_result

    test_pred /= float(n_folds)

    # save result
    cross_on = dfm_params["use_cross"]
    deep_on = dfm_params["use_deep"]
    if cross_on:
        clf_str = "DeepAndCross" if deep_on else "CROSS"
    elif deep_on:
        clf_str = "DNN"

    gini_mean = fold_gini.mean()
    gini_std = fold_gini.std()
    print("%s: %.5f (%.5f)" % (clf_str, gini_mean, gini_std))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_mean, gini_std)
    _make_submission(ids_test, test_pred, filename)

    _plot_fig(train_curve, valid_curve, clf_str)

    return oof_pred, test_pred
Пример #9
0
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate DeepFM and write averaged test predictions to a submission.

    Args:
        dfTrain: Training frame; must expose ``shape`` and be accepted by
            ``DataParser.parse``.
        dfTest: Test frame with the same requirements.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DeepFM``; must contain ``"epoch"``,
            ``"use_fm"`` and ``"use_deep"``. ``"feature_size"`` and
            ``"field_size"`` are written into it in place.

    Returns:
        ``(y_train_meta, y_test_meta)``: out-of-fold predictions on the
        training rows and fold-averaged predictions on the test rows, each of
        shape ``(n_rows, 1)``.
    """
    # Build the feature dictionary from both frames. Per the original
    # author's note, the ignored columns have NOT been dropped from
    # dfTrain/dfTest beforehand.
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Call parse() to obtain the processed data. (This note used to follow
    # the statement without a '#', which made the file a syntax error.)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    # Per the original author's notes: feat_dim is the feature count after
    # processing (i.e. after one-hot expansion); field_size is the number of
    # fields per row.
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    def _get(rows, indices):
        # Row selection on plain Python lists.
        return [rows[i] for i in indices]

    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):  # one pass per fold
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        # Out-of-fold predictions for this fold's validation rows.
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        # Predict the test set after every fold and accumulate the results.
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    # Turn the accumulated test predictions into a per-fold average.
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    # Validation (out-of-fold) predictions and test-set predictions.
    return y_train_meta, y_test_meta
Пример #10
0
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Run k-fold training of the model and dump verbose debugging output.

    Per the original author's note, the same routine serves FM, Deep and
    DeepFM; the variant is selected through ``use_fm`` / ``use_deep`` in
    ``dfm_params``.

    Args:
        dfTrain: Training frame; must expose ``shape`` and ``dtypes`` and be
            accepted by ``DataParser.parse``.
        dfTest: Test frame; must expose ``shape``.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index collections.
        dfm_params: Keyword arguments for ``DeepFM``; must contain ``'epoch'``,
            ``"use_fm"`` and ``"use_deep"``. ``'feature_size'`` and
            ``'field_size'`` are written into it in place.

    Returns:
        ``(y_train_meta, y_test_meta)``: out-of-fold predictions of shape
        ``(n_train, 1)`` and fold-averaged test predictions of shape
        ``(n_test, 1)``.
    """
    feature_dict = FeatureDictionary(
        dfTrain=dfTrain,
        dfTest=dfTest,
        numeric_cols=config.NUMERIC_COLS,
        ignore_cols=config.IGNORE_COLS,
    )
    # Original author's description of the parsed layout (not verifiable
    # from this function): Xi holds, per row, the index of each feature in
    # the overall input; Xv holds the value - the raw number for numeric
    # features and 1 for categorical ones (a one-hot encoding). Both are
    # two-dimensional lists with one inner list per data row.
    parser = DataParser(feat_dict=feature_dict)

    Xi_all, Xv_all, y_all = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)

    # Dump the parsed structures in full, then only their lengths.
    print('Xi_train:', Xi_all)
    print('Xv_train:', Xv_all)
    print('y_train:', y_all)
    print('Xi_test:', Xi_test)
    print('Xv_test:', Xv_test)

    print('Xi_train shape:', len(Xi_all))
    print('Xv_train shape:', len(Xv_all))
    print('y_train shape:', len(y_all))
    print('Xi_test shape:', len(Xi_test))
    print('Xv_test shape:', len(Xv_test))
    print(dfTrain.dtypes)

    # Per the original author: field_size is the raw feature count, while
    # feature_size counts features after one-hot expansion of the discrete
    # columns.
    dfm_params['feature_size'] = feature_dict.feat_dim
    dfm_params['field_size'] = len(Xi_all[0])

    oof_pred = np.zeros((dfTrain.shape[0], 1), dtype=float)
    test_pred = np.zeros((dfTest.shape[0], 1), dtype=float)

    def select(rows, wanted):
        # Row selection on plain Python lists.
        return [rows[k] for k in wanted]

    fold_count = len(folds)
    cv_gini = np.zeros(fold_count, dtype=float)
    epoch_gini_train = np.zeros((fold_count, dfm_params['epoch']), dtype=float)
    epoch_gini_valid = np.zeros((fold_count, dfm_params['epoch']), dtype=float)

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        # Feature indices, feature values and labels for this fold's split.
        Xi_tr = select(Xi_all, train_idx)
        Xv_tr = select(Xv_all, train_idx)
        y_tr = select(y_all, train_idx)
        Xi_va = select(Xi_all, valid_idx)
        Xv_va = select(Xv_all, valid_idx)
        y_va = select(y_all, valid_idx)

        # Train a fresh model, then predict with it.
        model = DeepFM(**dfm_params)

        print('before fit   Xi_train_:', Xi_tr[0:3])
        print('before fit   Xv_train_:', Xv_tr[0:3])
        print('before fit   y_train_:', y_tr[0:3])
        model.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)

        oof_pred[valid_idx, 0] = model.predict(Xi_va, Xv_va)
        # Summed here, divided by the fold count after the loop.
        test_pred[:, 0] += model.predict(Xi_test, Xv_test)

        cv_gini[fold_no] = gini_norm(y_va, oof_pred[valid_idx])
        epoch_gini_train[fold_no] = model.train_result
        epoch_gini_valid[fold_no] = model.valid_result

    test_pred /= float(fold_count)

    # save result
    fm_on = dfm_params["use_fm"]
    deep_on = dfm_params["use_deep"]
    if fm_on:
        clf_str = "DeepFM" if deep_on else "FM"
    elif deep_on:
        clf_str = "DNN"

    mean_gini = cv_gini.mean()
    std_gini = cv_gini.std()
    print("%s: %.5f (%.5f)" % (clf_str, mean_gini, std_gini))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, mean_gini, std_gini)
    _make_submission(ids_test, test_pred, filename)

    _plot_fig(epoch_gini_train, epoch_gini_valid, clf_str)

    return oof_pred, test_pred
Пример #11
0
    def fit(self,
            Xi_train,
            Xv_train,
            y_train,
            Xi_valid=None,
            Xv_valid=None,
            y_valid=None,
            early_stopping=False,
            refit=False):
        """Train for ``self.epoch`` epochs, tracking gini on train and validation data.

        Each epoch shuffles the training data in place, runs one optimisation
        pass over it, then re-runs the network in inference mode (dropout
        disabled, ``train_phase`` False) to collect predictions for the gini
        scores. Scores are appended to ``self.gini_train`` /
        ``self.gini_valid`` (reset at the start of the call) and the results
        of ``self.evaluate`` to ``self.train_result`` / ``self.valid_result``.

        Args:
            Xi_train: Training inputs fed to ``self.feat_index``.
            Xv_train: Training inputs fed to ``self.feat_value``.
            y_train: Training labels.
            Xi_valid: Optional validation counterpart of ``Xi_train``.
            Xv_valid: Optional validation counterpart of ``Xv_train``;
                validation runs only when this is not ``None``.
            y_valid: Optional validation labels.
            early_stopping: When true (and validation data is given), stop as
                soon as ``self.training_termination`` says so.
            refit: Accepted but not used by this method.
        """
        validate = Xv_valid is not None
        self.gini_train = []
        self.gini_valid = []

        def build_feed(feat_idx, feat_val, labels, training):
            # Inference feeds neutralise dropout by keeping every unit.
            if training:
                keep_fm, keep_deep = self.dropout_fm, self.dropout_dep
            else:
                keep_fm = [1.0] * len(self.dropout_fm)
                keep_deep = [1.0] * len(self.dropout_dep)
            return {
                self.feat_index: np.array(feat_idx),
                self.feat_value: np.array(feat_val),
                self.label: np.array(labels).reshape((-1, 1)),
                self.dropout_keep_fm: keep_fm,
                self.dropout_keep_deep: keep_deep,
                self.train_phase: training,
            }

        for epoch in range(self.epoch):
            started = time()
            self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
            n_batches = int((len(y_train) - 1) / self.batch_size) + 1

            # Optimisation pass over the shuffled training data.
            for step in range(n_batches):
                Xi_batch, Xv_batch, y_batch = self.get_batch(
                    Xi_train, Xv_train, y_train, self.batch_size, step)
                self.sess.run(
                    (self.loss, self.optimizer, self.out),
                    feed_dict=build_feed(Xi_batch, Xv_batch, y_batch, True))

            # Inference pass over the same data. The label placeholder still
            # has to be fed, so constant dummy labels are used; only the
            # network output is kept.
            dummy_labels = [1] * len(Xi_train)
            chunks = []
            for step in range(n_batches):
                Xi_batch, Xv_batch, y_batch = self.get_batch(
                    Xi_train, Xv_train, dummy_labels, self.batch_size, step)
                _, batch_out = self.sess.run(
                    (self.loss, self.out),
                    feed_dict=build_feed(Xi_batch, Xv_batch, y_batch, False))
                chunks.append(np.reshape(batch_out, (len(y_batch), )))
            pre_train = np.concatenate(chunks) if chunks else []

            sig_gini_train = gini_norm(y_train, pre_train)
            self.gini_train.append(sig_gini_train)

            # evaluate training and validation datasets
            train_score = self.evaluate(Xi_train, Xv_train, y_train)
            self.train_result.append(train_score)
            if validate:
                valid_score = self.evaluate(Xi_valid, Xv_valid, y_valid)
                self.valid_result.append(valid_score)
                # The whole validation set goes through in a single run.
                _, valid_out = self.sess.run(
                    (self.loss, self.out),
                    feed_dict=build_feed(Xi_valid, Xv_valid, y_valid, False))
                pre_valid = list(valid_out)
                sig_gini_valid = gini_norm(y_valid, pre_valid)
                self.gini_valid.append(sig_gini_valid)

            if self.verbose > 0 and epoch % self.verbose == 0:
                if validate:
                    print(
                        "[%d] train-result=%.4f, valid-result=%.4f,my-train=%.4f, my_valid=%.4f [%.1f s],"
                        % (epoch + 1, train_score, valid_score,
                           sig_gini_train, sig_gini_valid, time() - started))
                else:
                    print("[%d] train-result=%.4f [%.1f s]" %
                          (epoch + 1, train_score, time() - started))

            if validate and early_stopping and self.training_termination(
                    self.valid_result):
                break
Пример #12
0
    def my_fit(self,
               Xi_train_,
               Xv_train_,
               y_train_,
               Xi_valid_=None,
               Xv_valid_=None,
               y_valid_=None):
        """Train with early stopping and restore the best-scoring parameters.

        Each epoch shuffles the training data in place, runs one optimisation
        pass (collecting the training-mode outputs for the train gini), then
        scores the validation set in a single inference run. Whenever the
        validation gini beats the best seen so far, the values of all global
        variables are snapshotted. Training stops once more than 10
        consecutive epochs bring no improvement. Afterwards the best snapshot,
        if any, is written back through each variable's ``/Assign`` op.

        ``self.gini_train`` and ``self.gini_valid`` are reset and receive one
        entry per completed epoch.

        Args:
            Xi_train_: Training inputs fed to ``self.feat_index``.
            Xv_train_: Training inputs fed to ``self.feat_value``.
            y_train_: Training labels.
            Xi_valid_: Validation counterpart of ``Xi_train_``.
            Xv_valid_: Validation counterpart of ``Xv_train_``.
            y_valid_: Validation labels.

        Note:
            Despite the ``None`` defaults, the validation arguments are used
            unconditionally every epoch.

        Returns:
            ``self``.
        """
        max_checks_without_progress = 10
        checks_without_progress = 0
        best_gini = 0
        # Must exist even if no epoch ever beats ``best_gini`` (e.g. every
        # validation gini is <= 0, or ``self.epoch`` is 0); otherwise the
        # restore step below raises UnboundLocalError.
        best_params = None

        self.gini_train = []
        self.gini_valid = []
        for epoch in range(self.epoch):
            t1 = time()

            pre_train = []

            self.shuffle_in_unison_scary(Xi_train_, Xv_train_, y_train_)
            total_batch = int((len(y_train_) - 1) / self.batch_size) + 1
            for i in range(total_batch):
                Xi_batch, Xv_batch, y_batch = self.get_batch(
                    Xi_train_, Xv_train_, y_train_, self.batch_size, i)
                feed_dict = {
                    self.feat_index: np.array(Xi_batch),
                    self.feat_value: np.array(Xv_batch),
                    self.label: np.array(y_batch).reshape((-1, 1)),
                    self.dropout_keep_fm: self.dropout_fm,
                    self.dropout_keep_deep: self.dropout_dep,
                    self.train_phase: True
                }

                _, _, train_out = self.sess.run(
                    (self.loss, self.optimizer, self.out), feed_dict=feed_dict)
                pre_train.append(train_out)

            # Flatten the per-batch outputs; they line up with ``y_train_``
            # because the shuffle above reorders inputs and labels together.
            pre_train = [y for x in pre_train for y in x]
            sig_gini_train = gini_norm(y_train_, pre_train)
            self.gini_train.append(sig_gini_train)

            # Validation: one inference run over the full set, dropout off.
            feed_dict = {
                self.feat_index: np.array(Xi_valid_),
                self.feat_value: np.array(Xv_valid_),
                self.label: np.array(y_valid_).reshape((-1, 1)),
                self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
                self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
                self.train_phase: False
            }
            _, valid_out = self.sess.run((self.loss, self.out),
                                         feed_dict=feed_dict)
            pre_valid = list(valid_out)
            sig_gini_valid = gini_norm(y_valid_, pre_valid)
            self.gini_valid.append(sig_gini_valid)

            if sig_gini_valid > best_gini:
                # Snapshot every global variable of the graph by op name.
                gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

                best_params = {
                    gvar.op.name: value
                    for gvar, value in zip(gvars, self.sess.run(gvars))
                }
                best_gini = sig_gini_valid
                checks_without_progress = 0
            else:
                checks_without_progress += 1

            print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]" %
                  (epoch + 1, sig_gini_train, sig_gini_valid, time() - t1))

            if checks_without_progress > max_checks_without_progress:
                print('early stopping!')
                break

        # Write the best snapshot back into the model, so the instance ends
        # up holding the best parameters seen rather than the last ones.
        if best_params:
            gvars_names = list(best_params.keys())
            assign_ops = {
                gvar_name:
                tf.get_default_graph().get_operation_by_name(gvar_name +
                                                             '/Assign')
                for gvar_name in gvars_names
            }
            # Input 1 of each Assign op is the tensor holding the initial
            # value; feeding it makes the op write the snapshot instead.
            init_values = {
                gvar_name: assign_op.inputs[1]
                for gvar_name, assign_op in assign_ops.items()
            }
            feed_dict = {
                init_values[gvar_name]: best_params[gvar_name]
                for gvar_name in gvars_names
            }
            self.sess.run(assign_ops, feed_dict=feed_dict)

        return self
Пример #13
0
def run_base_model_dfm(train_df, test_df, data_folds, params):
    """Cross-validate a model over ``data_folds`` and build stacked predictions.

    For every ``(train_idx, valid_idx)`` pair a fresh ``Train`` model is
    fitted on the training rows (early stopping and refit disabled). Its
    predictions on the held-out rows fill the out-of-fold column, while its
    test-set predictions are summed and finally averaged over the folds.
    The averaged test predictions are written through ``_make_submission``
    and the per-epoch score curves are passed to ``_plot_fig``.

    Args:
        train_df: Training frame, parsed with ``has_label=True``.
        test_df: Test frame.
        data_folds: Sized sequence of ``(train_idx, valid_idx)`` index pairs.
        params: Settings object; this function reads ``numeric_cols``,
            ``ignore_cols``, ``epochs``, ``use_fm``, ``use_deep`` and
            ``sub_dir`` from it.

    Returns:
        Tuple ``(train_meta_y, test_meta_y)`` of float arrays shaped
        ``(train_df.shape[0], 1)`` and ``(test_df.shape[0], 1)``.
    """

    def _rows(seq, positions):
        # Gather the entries of ``seq`` found at the given positions.
        return [seq[p] for p in positions]

    # Parse the data: build the feature indices and the feature values.
    feature_dict = FeatureDictionay(df_train=train_df,
                                    df_test=test_df,
                                    numeric_cols=params.numeric_cols,
                                    ignore_cols=params.ignore_cols)
    parser = DataParser(feat_dict=feature_dict)
    train_Xi, train_Xv, train_y = parser.parse(df=train_df, has_label=True)
    test_Xi, test_Xv, test_ids = parser.parse(df=test_df)

    # Model dimensions derived from the parsed data.
    feature_size = feature_dict.feat_size
    field_size = len(train_Xi[0])

    n_folds = len(data_folds)
    train_meta_y = np.zeros((train_df.shape[0], 1), dtype=float)
    test_meta_y = np.zeros((test_df.shape[0], 1), dtype=float)

    # Metric buffers: one CV score per fold, one score per (fold, epoch).
    gini_results_cv = np.zeros(n_folds, dtype=float)
    gini_results_epoch_train = np.zeros((n_folds, params.epochs), dtype=float)
    gini_results_epoch_valid = np.zeros((n_folds, params.epochs), dtype=float)

    for fold_no, (fit_rows, holdout_rows) in enumerate(data_folds):
        fit_Xi = _rows(train_Xi, fit_rows)
        fit_Xv = _rows(train_Xv, fit_rows)
        fit_y = _rows(train_y, fit_rows)

        holdout_Xi = _rows(train_Xi, holdout_rows)
        holdout_Xv = _rows(train_Xv, holdout_rows)
        holdout_y = _rows(train_y, holdout_rows)

        # A new model is constructed for every fold.
        model = Train(params, feature_size, field_size)
        model.training(fit_Xi, fit_Xv, fit_y,
                       holdout_Xi, holdout_Xv, holdout_y,
                       early_stopping=False,
                       refit=False)

        train_meta_y[holdout_rows, 0] = model.predict(holdout_Xi, holdout_Xv)
        test_meta_y[:, 0] += model.predict(test_Xi, test_Xv)

        gini_results_cv[fold_no] = gini_norm(holdout_y,
                                             train_meta_y[holdout_rows])

        # Only as many epochs as the model actually recorded are stored;
        # the remaining columns keep their initial zeros.
        recorded_train = len(model.train_results)
        gini_results_epoch_train[fold_no, :recorded_train] = model.train_results
        recorded_valid = len(model.valid_results)
        gini_results_epoch_valid[fold_no, :recorded_valid] = model.valid_results

    # Turn the summed test predictions into a per-fold average.
    test_meta_y = test_meta_y / float(n_folds)

    # Label the run after the enabled model components.
    if params.use_fm:
        clf_str = "DeepFM" if params.use_deep else "FM"
    else:
        clf_str = "DNN"

    # save result
    cv_mean = gini_results_cv.mean()
    cv_std = gini_results_cv.std()
    print("%s: %.5f (%.5f)" % (clf_str, cv_mean, cv_std))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, cv_mean, cv_std)
    _make_submission(test_ids, test_meta_y, params.sub_dir, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return train_meta_y, test_meta_y
Пример #14
0
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Train DeepFM on every CV fold and return stacked predictions.

    A fresh ``DeepFM`` is fitted per ``(train_idx, valid_idx)`` pair. Its
    predictions on the held-out rows fill the out-of-fold column and its
    test-set predictions are averaged over the folds. The averaged test
    predictions are written through ``_make_submission``.

    Args:
        dfTrain: Training frame, parsed with ``has_label=True``.
        dfTest: Test frame.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index pairs.
        dfm_params: Keyword arguments for ``DeepFM``. Must contain
            ``"epoch"``, ``"use_fm"`` and ``"use_deep"``. The dict is updated
            in place with ``"feature_size"`` and ``"field_size"``.

    Returns:
        Tuple ``(y_train_meta, y_test_meta)`` of float arrays shaped
        ``(dfTrain.shape[0], 1)`` and ``(dfTest.shape[0], 1)``.

    Raises:
        ValueError: If neither ``use_fm`` nor ``use_deep`` is enabled.
    """
    # Resolve the result label up front so that a configuration with both
    # components disabled fails immediately rather than with an unbound
    # ``clf_str`` after every fold has already been trained.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        raise ValueError(
            "dfm_params must enable at least one of 'use_fm' or 'use_deep'")

    def _take(seq, positions):
        # Gather the entries of ``seq`` found at the given positions.
        return [seq[p] for p in positions]

    def _store_history(row, history):
        # Copy only the epochs that were actually recorded: a run that
        # stops before ``epoch`` iterations yields a shorter history, which
        # cannot be assigned to the whole row. Unfilled columns stay 0.
        n = min(len(history), row.shape[0])
        row[:n] = history[:n]

    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)

    # Xi_* is an n_samples x n_features list of indices: each numeric
    # feature is encoded as one fixed index, each categorical feature gets
    # a different index per category.
    # Xv_* is an n_samples x n_features list of values.
    _print("parse data begin")
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    _print("parse data finish")

    dfm_params["feature_size"] = fd.feat_dim  # largest feature index
    dfm_params["field_size"] = len(Xi_train[0])  # number of original features

    n_folds = len(folds)
    n_epochs = dfm_params["epoch"]
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    gini_results_cv = np.zeros(n_folds, dtype=float)
    gini_results_epoch_train = np.zeros((n_folds, n_epochs), dtype=float)
    gini_results_epoch_valid = np.zeros((n_folds, n_epochs), dtype=float)

    # train_idx / valid_idx index into the full training sample, so every
    # fold's rows are extracted from the complete parsed lists.
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_ = _take(Xi_train, train_idx)
        Xv_train_ = _take(Xv_train, train_idx)
        y_train_ = _take(y_train, train_idx)
        Xi_valid_ = _take(Xi_train, valid_idx)
        Xv_valid_ = _take(Xv_train, valid_idx)
        y_valid_ = _take(y_train, valid_idx)

        _print("fit, fold=%d" % i)
        dfm = DeepFM(**dfm_params, n_samples=len(Xi_train_))
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        _store_history(gini_results_epoch_train[i], dfm.train_result)
        _store_history(gini_results_epoch_valid[i], dfm.valid_result)

    # Turn the summed test predictions into a per-fold average.
    y_test_meta /= float(n_folds)

    # save result
    cv_mean = gini_results_cv.mean()
    cv_std = gini_results_cv.std()
    line = "%s: %.5f (%.5f)" % (clf_str, cv_mean, cv_std)
    _print(line)

    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, cv_mean, cv_std)
    _make_submission(ids_test, y_test_meta, filename)

    # Plotting is disabled for now.
    #_plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
Пример #15
0
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Train DeepFM on every CV fold and return stacked predictions.

    A fresh ``DeepFM`` is fitted per ``(train_idx, valid_idx)`` pair. Its
    predictions on the held-out rows fill the out-of-fold column and its
    test-set predictions are averaged over the folds. The averaged test
    predictions are written through ``_make_submission`` and the per-epoch
    score curves are passed to ``_plot_fig``.

    Args:
        dfTrain: Training frame, parsed with ``has_label=True``.
        dfTest: Test frame.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index pairs.
        dfm_params: Keyword arguments for ``DeepFM``. Must contain
            ``"epoch"``, ``"use_fm"`` and ``"use_deep"``. The dict is updated
            in place with ``"feature_size"`` and ``"field_size"``.

    Returns:
        Tuple ``(y_train_meta, y_test_meta)`` of float arrays shaped
        ``(dfTrain.shape[0], 1)`` and ``(dfTest.shape[0], 1)``.

    Raises:
        ValueError: If neither ``use_fm`` nor ``use_deep`` is enabled.
    """
    # Resolve the result label up front so that a configuration with both
    # components disabled fails immediately rather than with an unbound
    # ``clf_str`` after every fold has already been trained.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        raise ValueError(
            "dfm_params must enable at least one of 'use_fm' or 'use_deep'")

    def _take(seq, positions):
        # Gather the entries of ``seq`` found at the given positions.
        return [seq[p] for p in positions]

    def _store_history(row, history):
        # Copy only the epochs that were actually recorded: a run that
        # stops before ``epoch`` iterations yields a shorter history, which
        # cannot be assigned to the whole row. Unfilled columns stay 0.
        n = min(len(history), row.shape[0])
        row[:n] = history[:n]

    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Labelled parse: feature ids, feature values, labels.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # Unlabelled parse: feature ids, feature values, sample ids.
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim  # total number of features
    # Xi_train[0] is the first training sample; its length is the number
    # of fields.
    dfm_params["field_size"] = len(Xi_train[0])

    n_folds = len(folds)  # number of train/validation splits (k-fold)
    n_epochs = dfm_params["epoch"]
    # Intermediate prediction buffers, one row per sample.
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    gini_results_cv = np.zeros(n_folds, dtype=float)
    gini_results_epoch_train = np.zeros((n_folds, n_epochs), dtype=float)
    gini_results_epoch_valid = np.zeros((n_folds, n_epochs), dtype=float)

    # The model is trained once per fold.
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_ = _take(Xi_train, train_idx)
        Xv_train_ = _take(Xv_train, train_idx)
        y_train_ = _take(y_train, train_idx)
        Xi_valid_ = _take(Xi_train, valid_idx)
        Xv_valid_ = _take(Xv_train, valid_idx)
        y_valid_ = _take(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)  # build the network
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_,
                y_valid_)  # fit

        # Predict on the validation rows, then on the test set.
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        _store_history(gini_results_epoch_train[i], dfm.train_result)
        _store_history(gini_results_epoch_valid[i], dfm.valid_result)

    # Turn the summed test predictions into a per-fold average.
    y_test_meta /= float(n_folds)

    # save result
    cv_mean = gini_results_cv.mean()
    cv_std = gini_results_cv.std()
    print("%s: %.5f (%.5f)" % (clf_str, cv_mean, cv_std))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, cv_mean, cv_std)
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
Пример #16
0
        feed_dict = {
            dfm.feat_index: np.array(Xi_batch),
            dfm.feat_value: np.array(Xv_batch),
            dfm.label: np.array(y_batch).reshape((-1, 1)),
            dfm.dropout_keep_fm: [1.0] * len(dfm.dropout_fm),
            dfm.dropout_keep_deep: [1.0] * len(dfm.dropout_deep),
            dfm._training: False
        }
        loss, train_out = sess.run((dfm.loss, dfm.out), feed_dict=feed_dict)
        if i == 0:
            pre_train = np.reshape(train_out, (num_batch, ))
        else:
            pre_train = np.concatenate(
                (pre_train, np.reshape(train_out, (num_batch, ))))

    sig_gini_train = gini_norm(y_train_, pre_train)

    gini_train.append(sig_gini_train)

    feed_dict = {
        dfm.feat_index: np.array(Xi_valid_),
        dfm.feat_value: np.array(Xv_valid_),
        dfm.label: np.array(y_valid_).reshape((-1, 1)),
        dfm.dropout_keep_fm: [1.0] * len(dfm.dropout_fm),
        dfm.dropout_keep_deep: [1.0] * len(dfm.dropout_deep),
        dfm._training: False
    }
    loss_test, valid_out = sess.run((dfm.loss, dfm.out), feed_dict=feed_dict)
    pre_valid.append(valid_out)
    pre_valid = [y for x in pre_valid for y in x]
    sig_gini_valid = gini_norm(y_valid_, pre_valid)
Пример #17
0
def _run_base_model_dfm(dfTrain,
                        dfTest,
                        folds,
                        dfm_params,
                        NUMERIC_COLS,
                        IGNORE_COLS,
                        application='classification'):
    """Train DeepFM on every CV fold and return stacked predictions.

    A fresh ``DeepFM`` is fitted per ``(train_idx, valid_idx)`` pair. Its
    predictions on the held-out rows fill the out-of-fold column and its
    test-set predictions are averaged over the folds. The averaged test
    predictions are written through ``_make_submission`` and the per-epoch
    score curves are passed to ``_plot_fig``.

    Args:
        dfTrain: Training frame, parsed with ``has_label=True``.
        dfTest: Test frame, also parsed with ``has_label=True``.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index pairs.
        dfm_params: Keyword arguments for ``DeepFM``. Must contain
            ``"epoch"``, ``"use_fm"`` and ``"use_deep"``. The dict is updated
            in place with ``"feature_size"`` and ``"field_size"``.
        NUMERIC_COLS: Forwarded to ``FeatureDictionary`` as ``numeric_cols``.
        IGNORE_COLS: Forwarded to ``FeatureDictionary`` as ``ignore_cols``.
        application: Selects the per-fold CV metric: ``'classification'``
            uses ``roc_auc_score``, ``'regression'`` uses the square root of
            ``mean_squared_error``, anything else uses ``gini_norm``.

    Returns:
        Tuple ``(y_train_meta, y_test_meta)`` of float arrays shaped
        ``(dfTrain.shape[0], 1)`` and ``(dfTest.shape[0], 1)``.

    Raises:
        ValueError: If neither ``use_fm`` nor ``use_deep`` is enabled.
    """
    # Resolve the result label up front so that a configuration with both
    # components disabled fails immediately rather than with an unbound
    # ``clf_str`` after every fold has already been trained.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        raise ValueError(
            "dfm_params must enable at least one of 'use_fm' or 'use_deep'")

    def _take(seq, positions):
        # Gather the entries of ``seq`` found at the given positions.
        return [seq[p] for p in positions]

    def _store_history(row, history):
        # Copy only the epochs that were actually recorded: a run that
        # stops before ``epoch`` iterations yields a shorter history, which
        # cannot be assigned to the whole row. Unfilled columns stay 0.
        n = min(len(history), row.shape[0])
        row[:n] = history[:n]

    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=NUMERIC_COLS,
                           ignore_cols=IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # NOTE(review): the test frame is parsed with has_label=True, so the
    # third value is whatever the parser returns for labelled data. Confirm
    # it really holds the ids expected by _make_submission below.
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest, has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    n_folds = len(folds)
    n_epochs = dfm_params["epoch"]
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    results_cv = np.zeros(n_folds, dtype=float)
    results_epoch_train = np.zeros((n_folds, n_epochs), dtype=float)
    results_epoch_valid = np.zeros((n_folds, n_epochs), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_ = _take(Xi_train, train_idx)
        Xv_train_ = _take(Xv_train, train_idx)
        y_train_ = _take(y_train, train_idx)
        Xi_valid_ = _take(Xi_train, valid_idx)
        Xv_valid_ = _take(Xv_train, valid_idx)
        y_valid_ = _take(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        # Score the out-of-fold predictions with the task's metric.
        oof_pred = y_train_meta[valid_idx]
        if application == 'classification':
            results_cv[i] = roc_auc_score(y_valid_, oof_pred)
        elif application == 'regression':
            results_cv[i] = np.sqrt(mean_squared_error(y_valid_, oof_pred))
        else:
            results_cv[i] = gini_norm(y_valid_, oof_pred)
        _store_history(results_epoch_train[i], dfm.train_result)
        _store_history(results_epoch_valid[i], dfm.valid_result)

    # Turn the summed test predictions into a per-fold average.
    y_test_meta /= float(n_folds)

    # save result
    cv_mean = results_cv.mean()
    print("%s: rmse/accuracy/gini is %.4f (std is %.4f)" %
          (clf_str, cv_mean, results_cv.std()))
    filename = "%s_Mean%.5f.csv" % (clf_str, cv_mean)
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(results_epoch_train, results_epoch_valid, clf_str, application)

    return y_train_meta, y_test_meta
Пример #18
0
    def my_fit(self,
               Xi_train_,
               Xv_train_,
               y_train_,
               Xi_valid_=None,
               Xv_valid_=None,
               y_valid_=None):
        """Train for ``self.epoch`` epochs and restore the best snapshot.

        The graph and session are rebuilt from scratch. After every epoch
        the normalized gini is computed on the training and the validation
        data and appended to ``self.gini_train`` / ``self.gini_valid``.
        Whenever the validation gini improves on the best value so far
        (starting from 0), the values of all global variables are
        snapshotted; after the last epoch the best snapshot is assigned
        back to the variables.

        Early stopping is intentionally not applied: every epoch runs.

        Args:
            Xi_train_: Training feature indices (shuffled in unison with
                the other training lists via ``shuffle_in_unison_scary``).
            Xv_train_: Training feature values.
            y_train_: Training labels.
            Xi_valid_: Validation feature indices.
            Xv_valid_: Validation feature values.
            y_valid_: Validation labels.

        Although the validation arguments default to ``None``, they are
        scored unconditionally with ``my_predict_prob`` and ``gini_norm``
        every epoch, so callers are expected to supply them.

        Returns:
            ``self``, to allow call chaining.
        """
        self.close_session()
        self._init_graph()

        self._session = tf.Session()
        self._init = tf.global_variables_initializer()
        self._session.run(self._init)

        # Must exist before the loop: it is read after training even when
        # no epoch ran or no epoch's validation gini ever exceeded 0.
        best_params = None
        best_gini = 0

        self.gini_train = []
        self.gini_valid = []
        for epoch in range(self.epoch):
            t1 = time()

            self.shuffle_in_unison_scary(Xi_train_, Xv_train_, y_train_)
            # Ceiling division that avoids an extra batch when the sample
            # count is an exact multiple of the batch size.
            total_batch = int((len(y_train_) - 1) / self.batch_size) + 1
            for i in range(total_batch):
                Xi_batch, Xv_batch, y_batch = self.get_batch(
                    Xi_train_, Xv_train_, y_train_, self.batch_size, i)
                feed_dict = {
                    self.feat_index: np.array(Xi_batch),
                    self.feat_value: np.array(Xv_batch),
                    self.label: np.array(y_batch).reshape((-1, 1)),
                    self.dropout_keep_fm: self.dropout_fm,
                    self.dropout_keep_deep: self.dropout_deep,
                    self._training: True
                }
                # Only the side effect of the train step matters here; the
                # fetched loss and output are not used.
                self._session.run((self.loss, self.train_step, self.out),
                                  feed_dict=feed_dict)

            # Gini on the training data.
            pre_train = self.my_predict_prob(Xi_train_, Xv_train_)
            sig_gini_train = gini_norm(y_train_, pre_train)
            self.gini_train.append(sig_gini_train)

            # Gini on the validation data.
            pre_valid = self.my_predict_prob(Xi_valid_, Xv_valid_)
            sig_gini_valid = gini_norm(y_valid_, pre_valid)
            self.gini_valid.append(sig_gini_valid)

            if sig_gini_valid > best_gini:
                # New best epoch: snapshot every global variable by name.
                gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                best_params = {
                    gvar.op.name: value
                    for gvar, value in zip(gvars, self._session.run(gvars))
                }
                best_gini = sig_gini_valid

            print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]" %
                  (epoch + 1, sig_gini_train, sig_gini_valid, time() - t1))

        # Put the best parameters saved during training back into the
        # model, so the returned model is the best one seen.
        if best_params:
            gvars_names = list(best_params.keys())
            # Each variable's own '<name>/Assign' op is reused; its second
            # input is fed with the saved value instead of the initializer.
            assign_ops = {
                gvar_name:
                tf.get_default_graph().get_operation_by_name(gvar_name +
                                                             '/Assign')
                for gvar_name in gvars_names
            }
            init_values = {
                gvar_name: assign_op.inputs[1]
                for gvar_name, assign_op in assign_ops.items()
            }
            feed_dict = {
                init_values[gvar_name]: best_params[gvar_name]
                for gvar_name in gvars_names
            }
            self._session.run(assign_ops, feed_dict=feed_dict)

        return self
Пример #19
0
        prob = F.sigmoid(outputs)
        y_pre.append(prob.data.cpu().numpy())

        loss = criterion(outputs, y_batch)
        # _, predicted = torch.max(outputs.data, 1)
        loss_train += loss.item()
        ###########backward and optimize
        torch_optim.zero_grad()
        loss.backward()
        torch_optim.step()
        # if (i + 1) % 10 == 0:
        #     print('steps:[%d],train_loss:[%.3f]' % (i + 1, loss.item()))

    y_true = np.concatenate(y_true, axis=0)
    y_pre = np.concatenate(y_pre, axis=0)
    gini_train = gini_norm(y_true, y_pre)

    loss_train /= steps
    loss_val, gini_val = val(model, valid_dataloader)
    history['loss_train'].append(loss_train)
    history['loss_val'].append(loss_val)
    history['gini_train'].append(gini_train)
    history['gini_val'].append(gini_val)

    if loss_val < best_loss:
        torch.save(model.state_dict(), check_file)
        best_loss = loss_val
        checks_without_progress = 0
    else:
        checks_without_progress += 1
        if checks_without_progress > max_checks_without_progress:
    def training(self, train_Xi, train_Xv, train_y,
                 valid_Xi=None, valid_Xv=None, valid_y=None,
                 early_stopping=False, refit=False):
        """Train ``self.deepfm`` with mini-batches and record per-epoch scores.

        Runs ``self.params.epochs`` epochs. After each epoch the gini of the
        training predictions is appended to ``self.train_results`` and, when
        validation data is given, the gini of the validation predictions is
        appended to ``self.valid_results``.

        Args:
            train_Xi: Training feature indices.
            train_Xv: Training feature values.
            train_y: Training labels.
            valid_Xi: Validation feature indices; validation is enabled
                exactly when this is not ``None``.
            valid_Xv: Validation feature values.
            valid_y: Validation labels.
            early_stopping: When true (and validation is enabled), stop as
                soon as ``self.training_termination(self.valid_results)``
                returns a truthy value.
            refit: When true (and validation is enabled), continue training
                on train + validation data for at most 99 further epochs,
                until the training gini reaches the training score recorded
                at the best validation epoch.

        Raises:
            ValueError: If ``self.params.loss_type`` is neither ``"logloss"``
                nor ``"mse"``.
        """
        # Validation is switched on solely by the presence of valid_Xi.
        need_valid = False
        if valid_Xi is not None:
            need_valid = True

        ## construct optimizer
        # "sgd" selects plain SGD; any other value falls back to Adam.
        optimizer_type = self.params.optimizer
        if optimizer_type == "sgd":
            optimizer = optim.SGD(self.deepfm.parameters(), lr=self.params.learning_rate)
        else:
            optimizer = optim.Adam(self.deepfm.parameters(),
                                   lr=self.params.learning_rate, betas=(0.9, 0.99),
                                   eps=1e-8, amsgrad=True)


        ## main training loop
        loss_type = self.params.loss_type
        for epoch in range(1, self.params.epochs + 1):
            self.deepfm.train()
            t1 = time.time()
            self.shuffle_in_unison_scary(train_Xi, train_Xv, train_y)
            # NOTE(review): when len(train_y) is an exact multiple of the
            # batch size this yields one extra batch index; confirm how
            # get_batch behaves for it (an empty batch would make the mean
            # loss undefined).
            total_batch = int(len(train_y) / self.params.batch_size) + 1
            for i in range(total_batch):
                batch_Xi, batch_Xv, batch_y = self.get_batch(train_Xi, train_Xv, train_y,
                                                             self.params.batch_size, i)
                batch_Xi = torch.tensor(batch_Xi, dtype=torch.long)
                batch_Xv = torch.tensor(batch_Xv, dtype=torch.float)
                # NOTE(review): labels become an integer tensor; confirm this
                # is intended for the "mse" branch, where it is the target.
                batch_y = torch.tensor(batch_y, dtype=torch.long)

                optimizer.zero_grad()

                # Only the first of the four model outputs is used here.
                output, _, _, _ = self.deepfm(batch_Xi, batch_Xv)
                if loss_type == "logloss":
                    # for classification
                    # Binary cross-entropy written out by hand on the
                    # sigmoid of the model output, averaged over the batch.
                    output = F.sigmoid(output)
                    loss = -torch.mul(batch_y, torch.log(output)) \
                           - torch.mul((1-batch_y), torch.log(1-output))
                    loss = torch.mean(loss)
                elif loss_type == "mse":
                    # for regression
                    loss = F.mse_loss(input=output, target=batch_y)
                else:
                    raise ValueError("Unknown loss type, should be one of 'logloss/mes'")

                # l2 regularization on weights for preventing over-fitting
                if self.params.l2_reg > 0:
                    loss += self.params.l2_reg * torch.norm(self.deepfm.final_W, 2)
                    if self.params.use_deep:
                        for weight in self.deepfm.deep_layers:
                            loss += self.params.l2_reg * torch.norm(weight.W, 2)
                # Printed once per batch, not once per epoch.
                print("epoch: %d, loss: %.4f" % (epoch, loss.item()))

                # backward
                loss.backward()
                # Clip the gradient norm before the optimizer step.
                torch.nn.utils.clip_grad_norm_(self.deepfm.parameters(), self.params.grad_clip)

                optimizer.step()

            # each epoch with evaluate training and validation datasets
            train_pred_y = self.predict(train_Xi, train_Xv)
            self.train_results.append(gini_norm(train_y, train_pred_y))
            if need_valid:
                valid_pred_y = self.predict(valid_Xi, valid_Xv)
                self.valid_results.append(gini_norm(valid_y, valid_pred_y))
                print("epoch: %d, train-result: %.4f, valid-result: %.4f, "
                      "cost-time: %.2f s" % (epoch, self.train_results[-1],
                                             self.valid_results[-1], time.time() - t1))
            else:
                print("epoch: %d, train-result: %.4f, cost-time: %.2f s"
                      % (epoch, self.train_results[-1], time.time() - t1))

            if need_valid and early_stopping and self.training_termination(self.valid_results):
                print("Early Stopping!!")
                break

        # fit a few more epochs on train+valid until result reaches the best_train_score
        if need_valid and refit:
            # Pick the best validation epoch in the direction given by
            # params.greater_is_better, then use that epoch's training
            # score as the target for the refit.
            greater_is_better = self.params.greater_is_better
            if greater_is_better:
                best_valid_score = max(self.valid_results)
            else:
                best_valid_score = min(self.valid_results)

            best_epoch = self.valid_results.index(best_valid_score)
            best_train_score = self.train_results[best_epoch]
            # Merge validation data into the training data.
            # Presumably these are Python lists, so "+" concatenates into
            # new local objects - verify against the caller.
            train_Xi = train_Xi + valid_Xi
            train_Xv = train_Xv + valid_Xv
            train_y = train_y + valid_y
            # At most 99 refit epochs; same batch loop as the main training.
            for epoch in range(1, 100):
                self.shuffle_in_unison_scary(train_Xi, train_Xv, train_y)
                total_batch = int(len(train_y) / self.params.batch_size) + 1
                for i in range(total_batch):
                    batch_Xi, batch_Xv, batch_y = self.get_batch(train_Xi, train_Xv, train_y,
                                                                 self.params.batch_size, i)

                    batch_Xi = torch.tensor(batch_Xi, dtype=torch.long)
                    batch_Xv = torch.tensor(batch_Xv, dtype=torch.float)
                    batch_y = torch.tensor(batch_y, dtype=torch.long)

                    optimizer.zero_grad()

                    output, _, _, _ = self.deepfm(batch_Xi, batch_Xv)
                    if loss_type == "logloss":
                        # for classification
                        output = F.sigmoid(output)
                        loss = -torch.mul(batch_y, torch.log(output)) \
                               - torch.mul((1 - batch_y), torch.log(1 - output))
                        loss = torch.mean(loss)
                    elif loss_type == "mse":
                        # for regression
                        loss = F.mse_loss(input=output, target=batch_y)
                    else:
                        raise ValueError("Unknown loss type, should be one of 'logloss/mes'")

                    # l2 regularization on weights for preventing over-fitting
                    if self.params.l2_reg > 0:
                        loss += self.params.l2_reg * torch.norm(self.deepfm.final_W, 2)
                        if self.params.use_deep:
                            for weight in self.deepfm.deep_layers:
                                loss += self.params.l2_reg * torch.norm(weight.W, 2)
                    print("epoch: %d, loss: %.4f" % (epoch, loss.item()))

                    # backward
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.deepfm.parameters(), self.params.grad_clip)

                    optimizer.step()

                # check
                # Stop once the training score is within 0.001 of the target
                # or has passed it in the "better" direction.
                train_pred_y = self.predict(train_Xi, train_Xv)
                train_result = gini_norm(train_y, train_pred_y)
                if abs(train_result - best_train_score) < 0.001 \
                    or (greater_is_better and train_result > best_train_score) \
                    or ((not greater_is_better) and train_result < best_train_score):
                    print("Find best train score!!")
                    break