Example #1
File: base.py Project: Datafruit/sugo-tag
# assumed module-level imports for this snippet; mse and auc are project
# helpers wrapping sklearn metrics (see the sketch after Example #5)
import copy
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, f1_score


def kfold_cross_validation(alg, X, y, k, valid_threshold=0.9):
    """
    :param alg: algorithm model instance
    :param X: feature DataFrame
    :param y: label DataFrame
    :param k: number of folds for KFold
    :param valid_threshold: threshold for passing validation (cosine
        similarity between the per-fold train and test metrics)
    :return: mean test mse, mean test auc, mean optimal threshold
    """
    mse_kfold = {"train": [], "test": []}
    auc_kfold = {"train": [], "test": []}
    threshold_kfold = []
    kf = KFold(n_splits=k, shuffle=True)
    for idx, (train_idx, test_idx) in enumerate(kf.split(y)):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx, :]
        X_test = X.iloc[test_idx, :]
        y_test = y.iloc[test_idx, :]
        alg_cp = copy.deepcopy(alg)
        alg_cp = alg_cp.fit(X_train, y_train)
        y_train_pred = alg_cp.predict(X_train)
        y_test_pred = alg_cp.predict(X_test)
        mse_train_value = mse(y_train, y_train_pred)
        auc_train_value = auc(y_train, y_train_pred)
        mse_test_value = mse(y_test, y_test_pred)
        auc_test_value = auc(y_test, y_test_pred)
        fpr, tpr, threshold = roc_curve(y_test, y_test_pred)
        # pick the classification threshold that maximizes F1 on the test fold
        f1score = []
        for thr in threshold:
            y_test_pred_class = (y_test_pred >= thr).astype(int)
            f1score.append(f1_score(y_test, y_test_pred_class))
        threshold_optimal = threshold[np.argmax(f1score)]
        mse_kfold["train"].append(mse_train_value)
        mse_kfold["test"].append(mse_test_value)
        auc_kfold["train"].append(auc_train_value)
        auc_kfold["test"].append(auc_test_value)
        threshold_kfold.append(threshold_optimal)
        alg.logger.info(
            "cross validation fold[%d]: mse_train[%.6f], mse_test[%.6f], auc_train[%.6f], auc_test[%.6f], threshold[%.6f]"
            % (idx, mse_train_value, mse_test_value, auc_train_value,
               auc_test_value, threshold_optimal))
    cos_sim_mse = \
        np.dot(mse_kfold["train"], mse_kfold["test"]) / \
        (np.linalg.norm(mse_kfold["train"]) * np.linalg.norm(mse_kfold["test"]))
    cos_sim_auc = \
        np.dot(auc_kfold["train"], auc_kfold["test"]) / \
        (np.linalg.norm(auc_kfold["train"]) * np.linalg.norm(auc_kfold["test"]))
    alg.logger.info("cross validation: cosine similarity of mse: %.6f" %
                    cos_sim_mse)
    alg.logger.info("cross validation: cosine similarity of auc: %.6f" %
                    cos_sim_auc)
    if cos_sim_mse < valid_threshold or cos_sim_auc < valid_threshold:
        alg.logger.error("cross validation: algorithm is overfit")
        raise Exception("%s cross validation: algorithm is overfit" %
                        alg.__class__.__name__)
    return np.mean(mse_kfold["test"]), np.mean(
        auc_kfold["test"]), np.mean(threshold_kfold)
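A minimal usage sketch, assuming the imports above and the mse/auc helpers sketched after Example #5 are in scope; LoggedGBR and the toy data are made up for illustration:

import logging
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

logging.basicConfig(level=logging.INFO)

class LoggedGBR(GradientBoostingRegressor):
    # kfold_cross_validation logs through alg.logger; exposing it as a class
    # property keeps it out of the instance dict, so copy.deepcopy(alg) is safe
    @property
    def logger(self):
        return logging.getLogger("cv")

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 5)))
y = pd.DataFrame({"__label__": (X.iloc[:, 0] > 0).astype(int)})

mse_cv, auc_cv, thr_cv = kfold_cross_validation(LoggedGBR(), X, y, k=5)
print("cv: mse[%.6f], auc[%.6f], threshold[%.6f]" % (mse_cv, auc_cv, thr_cv))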
Example #2
File: lr.py Project: Datafruit/sugo-tag
 def fit(self, X, y, *args, **kwargs):
     n_row, n_col = X.shape
     ## convert to scipy.sparse_matrix
     X = csr_matrix(X, dtype=np.float64)
     y = csr_matrix(y, dtype=np.float64)
     x_sts = np.matrix(X.transpose().sum(axis=1))
     x_ratio = csr_matrix(
         np.divide(1.0, x_sts, out=np.zeros_like(x_sts), where=x_sts != 0))
     ## init factors
     w0 = self._w0_default
     w = csr_matrix(np.full((n_col, 1), self._w_default))
     ## train model
     mse_value = 1.0
     logloss_value = sys.maxsize
     auc_value = 0.0
     for i in range(self._max_iter):
         ## delta
         delta = self.sig_mod(X, w0, w) - y
         ## descent
         descent_w0 = np.sum(delta)
         descent_w = X.transpose().dot(delta)
         ## penalty
         if self._penalty == "L2":
             penalty_w0 = self._C * w0
             penalty_w = self._C * w
         elif self._penalty == "L1":
             penalty_w0 = self._C
             penalty_w = csr_matrix(np.full((n_col, 1), self._C))
         else:
             raise Exception("LR must have penalty with one of [L1, L2]")
         ## gradient descent
         w0 = w0 - self._step * (descent_w0 / float(n_row) + penalty_w0)
         w = w - self._step * (descent_w.multiply(x_ratio) + penalty_w)
         ## evaluate
         if (i + 1) % 10 == 0 or (i + 1) == self._max_iter:
             y_true = y.toarray()
             y_pred = self.sig_mod(X, w0, w).toarray()
             ## MSE
             mse_value_new = mse(y_true, y_pred)
             mse_ratio = (mse_value - mse_value_new) / mse_value
             mse_value = mse_value_new
             ## LogLoss
             logloss_value_new = logloss(y_true, y_pred)
             logloss_ratio = (logloss_value -
                              logloss_value_new) / logloss_value
             logloss_value = logloss_value_new
             ## AUC
             auc_value = auc(y_true, y_pred)
             if mse_ratio <= self._tol or logloss_ratio <= self._tol or (
                     i + 1) == self._max_iter:
                 self.logger.info(
                     "Iterator[%05d]: MSE[%.6f], LogLoss[%.6f], AUC[%.6f]" %
                     (i + 1, mse_value, logloss_value, auc_value))
                 break
     ## save model
     self._w0 = w0
     self._w = w
     return self
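The method above calls self.sig_mod, which is not shown in this example. A hypothetical sketch of what it presumably computes, written as a free function: the logistic link applied to the linear term w0 + X·w, returned sparse to match the .toarray() calls above:

import numpy as np
from scipy.sparse import csr_matrix

def sig_mod(X, w0, w):
    # linear term w0 + X.w as a dense column vector
    z = X.dot(w).toarray() + w0
    # logistic link, wrapped sparse so callers can keep using .toarray()
    return csr_matrix(1.0 / (1.0 + np.exp(-z)))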
Example #3
File: fm_tf.py Project: Datafruit/sugo-tag
 def fit(self, X, y, *args, **kwargs):
     n_row, n_col = X.shape
     w0 = tf.Variable([self._w0_default], dtype="float32")
     w = tf.Variable(np.full((n_col, 1), self._w_default), dtype="float32")
     v = tf.Variable(np.random.randn(n_col, self._v_length), dtype="float32")
     if self._penalty == "L2":
         penalty_w0 = self._C * tf.pow(w0, 2)
         penalty_w = self._C * tf.pow(w, 2)
         penalty_v = self._C * tf.pow(v, 2)
     elif self._penalty == "L1":
         penalty_w0 = tf.Variable([self._C], dtype="float32")
         penalty_w = tf.Variable(np.full((n_col, 1), self._C), dtype="float32")
         penalty_v = tf.Variable(np.full((n_col, self._v_length), self._C), dtype="float32")
     else:
         raise Exception("FM must have penalty with one of [L1, L2]")
     xs = tf.constant(np.matrix(X), dtype="float32")
     ys = tf.constant(np.matrix(y), dtype="float32")
     linear = tf.add(w0, tf.matmul(xs, w))
     nonlinear = tf.reduce_sum(
         tf.pow(tf.matmul(xs, v), 2) - tf.matmul(tf.pow(xs, 2), tf.pow(v, 2)),
         axis=1, keepdims=True)
     y_pred = 1. / (1. + tf.exp(-tf.add(linear, 0.5 * nonlinear)))
     # log_loss = -(ys * tf.log(y_pred) + (1. - ys) * tf.log(1. - y_pred))
     square_loss = tf.pow(tf.subtract(ys, y_pred), 2)
     loss = tf.reduce_mean(square_loss) + tf.reduce_mean(penalty_w0) + tf.reduce_mean(penalty_w) + tf.reduce_mean(penalty_v)
     train_step = tf.train.GradientDescentOptimizer(self._step).minimize(loss)
     init = tf.global_variables_initializer()
     with tf.Session() as sess:
         sess.run(init)
         loss_value = sys.maxsize
         mse_value = 1.0
         auc_value = 0.0
         for i in range(self._max_iter):
             ys_eval = ys.eval()
             y_pred_eval = y_pred.eval()
             ## LOSS
             loss_value_new = loss.eval()
             loss_ratio = (loss_value - loss_value_new) / loss_value
             loss_value = loss_value_new
             ## MSE
             mse_value_new = mse(ys_eval, y_pred_eval)
             mse_ratio = (mse_value - mse_value_new) / mse_value
             mse_value = mse_value_new
             ## AUC
             auc_value = auc(ys_eval, y_pred_eval)
             if (i+1) % 100 == 0:
                 self.logger.info("Iterator[%05d]: LOSS[%.6f], MSE[%.6f], AUC[%.6f]" % (i+1, loss_value, mse_value, auc_value))
             if loss_ratio <= self._tol or mse_ratio <= self._tol or (i+1) == self._max_iter:
                 self.logger.info("Iterator[%05d]: LOSS[%.6f], MSE[%.6f], AUC[%.6f]" % (i+1, loss_value, mse_value, auc_value))
                 break
             sess.run(train_step)
         ## save model
         self._w0 = np.matrix(w0.eval())
         self._w = np.matrix(w.eval())
         self._v = np.matrix(v.eval())
     return self
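The nonlinear term above relies on the standard factorization-machine identity: sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2 ]. A small numpy check of that equivalence (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=8)        # one sample's features
v = rng.normal(size=(8, 3))   # factor matrix (v_length = 3)

# naive pairwise interactions: sum_{i<j} <v_i, v_j> * x_i * x_j
naive = sum(v[i].dot(v[j]) * x[i] * x[j]
            for i in range(8) for j in range(i + 1, 8))

# fast form used in fm_tf.py: 0.5 * sum_f ((x.v)_f^2 - (x^2).(v^2)_f)
fast = 0.5 * np.sum(x.dot(v) ** 2 - (x ** 2).dot(v ** 2))

assert np.isclose(naive, fast)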
Example #4
def run():
    data_home = TASK_DATA_HOME + "/algorithm_test/kaggle/hr_analytics"
    df = pd.read_csv(data_home + "/hr.csv")
    df.reset_index(inplace=True)
    id = "index"
    features_continuous = [
        "satisfaction_level",
        "last_evaluation",
        "number_project",
        "average_montly_hours",
        "time_spend_company",
    ]
    features_discrete = [
        "Work_accident", "promotion_last_5years", "sales", "salary"
    ]
    df["__label__"] = df["left"].apply(lambda x: 1 if int(x) == 1 else 0)
    label = "__label__"
    df_coding = df[[id, label]]
    for feature in features_continuous:
        df_coding_f = discretization_coding(df, id, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id)
    for feature in features_discrete:
        df_coding_f = one_hot_coding(df, id, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id)

    x_columns = [c for c in df_coding.columns if c not in (id, label)]
    y_columns = [label]
    df_train, df_test = train_test_split(df_coding, test_size=0.2)
    X_train = df_train[x_columns].reset_index(drop=True)
    X_test = df_test[x_columns].reset_index(drop=True)
    y_train = df_train[[label]].reset_index(drop=True)
    y_test = df_test[[label]].reset_index(drop=True)

    ## compare lr, gbdt, fm and deeplearning
    print "############ lr ############"
    algo_lr = LogisticRegressionAlgorithm()
    mse_lr, auc_lr, threshold_lr = kfold_cross_validation(
        algo_lr, X_train, y_train, 5)
    algo_lr = algo_lr.fit(X_train, y_train)
    y_predict_lr = pd.DataFrame(algo_lr.predict(X_test), columns=["__label__"])
    y_predict_lr["__class__"] = y_predict_lr["__label__"].apply(
        lambda x: 1 if x >= threshold_lr else 0)
    print "lr  : mse[%.6f], auc[%.6f]" % (mse(
        y_test,
        y_predict_lr["__label__"]), auc(y_test, y_predict_lr["__label__"]))
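discretization_coding and one_hot_coding are project helpers that are not shown in these examples. A hypothetical sketch of what they plausibly do, returning the id column plus 0/1 indicator columns (names and signatures are guesses):

import pandas as pd

def one_hot_coding(df, id_col, feature):
    # one 0/1 indicator column per category, keyed by the id column
    dummies = pd.get_dummies(df[feature], prefix=feature).astype(int)
    return pd.concat([df[[id_col]], dummies], axis=1)

def discretization_coding(df, id_col, feature, bins=10):
    # bin a continuous feature, then one-hot the bins
    binned = pd.cut(df[feature], bins=bins).astype(str)
    dummies = pd.get_dummies(binned, prefix=feature).astype(int)
    return pd.concat([df[[id_col]], dummies], axis=1)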
Example #5
    y_columns = [c for c in df_item_coding.columns if c != "item"]

    df_train, df_test = train_test_split(df_data, test_size=0.2)

    label = "Action"
    X_train = df_train[x_columns].reset_index(drop=True)
    X_test = df_test[x_columns].reset_index(drop=True)
    y_train = df_train[[label]].reset_index(drop=True)
    y_test = df_test[[label]].reset_index(drop=True)

    ## compare lr, gbdt, fm and deeplearning
    print "############ lr ############"
    algo_lr = LogisticRegressionAlgorithm()
    algo_lr = algo_lr.fit(X_train, y_train)
    y_predict_lr = pd.DataFrame(algo_lr.predict(X_test), columns=["__label__"])
    print "lr  : mse[%.6f], auc[%.6f]" % (mse(
        y_test, y_predict_lr), auc(y_test, y_predict_lr))

    print "############ gbdt ############"
    algo_gbdt = GradientBoostingRegressor()
    algo_gbdt = algo_gbdt.fit(X_train, y_train)
    y_predict_gbdt = pd.DataFrame(algo_gbdt.predict(X_test),
                                  columns=["__label__"])
    print "gbdt: mse[%.6f], auc[%.6f]" % (mse(
        y_test, y_predict_gbdt), auc(y_test, y_predict_gbdt))

    print "############ fm ############"
    algo_fm = FactorizationMachineAlgorithm()
    algo_fm = algo_fm.fit(X_train, y_train)
    y_predict_fm = pd.DataFrame(algo_fm.predict(X_test), columns=["__label__"])
    print "fm  : mse[%.6f], auc[%.6f]" % (mse(
        y_test, y_predict_fm), auc(y_test, y_predict_fm))
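The mse, auc, and logloss helpers called throughout these examples are not defined in the snippets; judging by how they are called (y_true first, then continuous scores), they are presumably thin wrappers over sklearn metrics, e.g.:

from sklearn.metrics import log_loss, mean_squared_error, roc_auc_score

def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

def auc(y_true, y_score):
    # ranking AUC from continuous scores, not hard class labels
    return roc_auc_score(y_true, y_score)

def logloss(y_true, y_pred):
    return log_loss(y_true, y_pred)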
Example #6
def run():
    data_home = TASK_DATA_HOME + "/algorithm_test/kaggle/creditcardfraud"
    df = pd.read_csv(data_home + "/creditcard.sub.csv")
    df.reset_index(inplace=True)
    id = "index"
    features_continuous = [c for c in df.columns.tolist()
                           if c not in ("Class", id)]
    features_discrete = []
    df["__label__"] = df["Class"].apply(lambda x: 1 if int(x) == 1 else 0)
    label = "__label__"
    df_coding = df[[id, label]]
    for feature in features_continuous:
        df_coding_f = discretization_coding(df, id, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id)
    for feature in features_discrete:
        df_coding_f = one_hot_coding(df, id, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id)
    df_coding = df_coding.sample(frac=1).reset_index(drop=True)

    x_columns = [c for c in df_coding.columns if c not in (id, label)]
    y_columns = [label]
    X = df_coding[x_columns]
    y = df_coding[y_columns]

    index_pos = y[y["__label__"] == 1].index.tolist()
    index_neg = y[y["__label__"] == 0].index.tolist()
    index_pos_train, index_pos_test = train_test_split(index_pos,
                                                       test_size=0.2)
    index_neg_train, index_neg_test = train_test_split(index_neg,
                                                       test_size=0.2)
    X_train = X.loc[index_pos_train + index_neg_train, :]
    X_test = X.loc[index_pos_test + index_neg_test, :]
    y_train = y.loc[index_pos_train + index_neg_train, :]
    y_test = y.loc[index_pos_test + index_neg_test, :]

    # print "############ lr ############"
    # algo_lr = LogisticRegressionAlgorithm()
    # algo_lr = algo_lr.fit(X_train, y_train)
    # algo_lr.predict(X_test)
    # y_predict_lr = pd.DataFrame(algo_lr.predict(X_test), columns=["__label__"])
    # print "lr  : mse[%.6f], auc[%.6f]" % (mse(y_test, y_predict_lr), auc(y_test, y_predict_lr))
    #
    # print "############ gbdt ############"
    # algo_gbdt = GradientBoostingRegressor()
    # algo_gbdt = algo_gbdt.fit(X_train, y_train)
    # y_predict_gbdt = pd.DataFrame(algo_gbdt.predict(X_test), columns=["__label__"])
    # print "gbdt: mse[%.6f], auc[%.6f]" % (mse(y_test, y_predict_gbdt), auc(y_test, y_predict_gbdt))
    #
    # print "############ fm ############"
    # algo_fm = FactorizationMachineAlgorithm()
    # algo_fm = algo_fm.fit(X_train, y_train)
    # y_predict_fm = pd.DataFrame(algo_fm.predict(X_test), columns=["__label__"])
    # print "fm  : mse[%.6f], auc[%.6f]" % (mse(y_test, y_predict_fm), auc(y_test, y_predict_fm))

    print "############ lp ############"
    algo_lp = LittleProbabilityModel(n_jobs=2)
    kfold_cross_validation(algo_lp, X_train, y_train, 5)
    algo_lp = algo_lp.fit(X_train, y_train)
    y_predict_lp = pd.DataFrame(algo_lp.predict(X_test), columns=["__label__"])
    print "lp  : mse[%.6f], auc[%.6f]" % (mse(
        y_test, y_predict_lp), auc(y_test, y_predict_lp))
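The manual positive/negative index split above is a stratified train/test split; with scikit-learn's train_test_split it can be expressed directly via the stratify argument (a sketch, not the project's code):

from sklearn.model_selection import train_test_split

# same effect as splitting index_pos and index_neg separately:
# the positive/negative ratio is preserved in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y["__label__"])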