import copy

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, f1_score
# assuming the project's mse/auc helpers match sklearn's metrics
from sklearn.metrics import mean_squared_error as mse, roc_auc_score as auc


def kfold_cross_validation(alg, X, y, k, valid_threshold=0.9):
    """Run k-fold cross validation and check the model for overfitting.

    :param alg: algorithm/model instance (must expose fit/predict and a logger)
    :param X: feature DataFrame
    :param y: label DataFrame (single column)
    :param k: number of folds for KFold
    :param valid_threshold: validation threshold (cosine similarity between
        the per-fold train and test metric vectors)
    :return: mean test mse, mean test auc, mean optimal threshold
    """
    mse_kfold = {"train": [], "test": []}
    auc_kfold = {"train": [], "test": []}
    threshold_kfold = []
    kf = KFold(n_splits=k, shuffle=True)
    for idx, (train_idx, test_idx) in enumerate(kf.split(y)):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx, :]
        X_test = X.iloc[test_idx, :]
        y_test = y.iloc[test_idx, :]
        # fit a fresh copy so folds do not share fitted state
        alg_cp = copy.deepcopy(alg)
        alg_cp = alg_cp.fit(X_train, y_train)
        y_train_pred = alg_cp.predict(X_train)
        y_test_pred = alg_cp.predict(X_test)
        mse_train_value = mse(y_train, y_train_pred)
        auc_train_value = auc(y_train, y_train_pred)
        mse_test_value = mse(y_test, y_test_pred)
        auc_test_value = auc(y_test, y_test_pred)
        # pick the classification threshold that maximizes F1 on this fold
        fpr, tpr, threshold = roc_curve(y_test, y_test_pred)
        f1score = []
        for thr in threshold:
            # vectorized thresholding (apply_along_axis on 1-element rows was fragile)
            y_test_pred_class = np.where(
                np.asarray(y_test_pred).ravel() >= thr, 1, 0)
            f1score.append(f1_score(y_test, y_test_pred_class))
        threshold_optimal = threshold[np.argmax(f1score)]
        mse_kfold["train"].append(mse_train_value)
        mse_kfold["test"].append(mse_test_value)
        auc_kfold["train"].append(auc_train_value)
        auc_kfold["test"].append(auc_test_value)
        threshold_kfold.append(threshold_optimal)
        alg.logger.info(
            "cross validation fold[%d]: mse_train[%.6f], mse_test[%.6f], "
            "auc_train[%.6f], auc_test[%.6f], threshold[%.6f]"
            % (idx, mse_train_value, mse_test_value,
               auc_train_value, auc_test_value, threshold_optimal))
    # cosine similarity between the per-fold train and test metric vectors:
    # values close to 1 mean train and test behave consistently across folds
    cos_sim_mse = \
        np.dot(mse_kfold["train"], mse_kfold["test"]) / \
        (np.linalg.norm(mse_kfold["train"]) * np.linalg.norm(mse_kfold["test"]))
    cos_sim_auc = \
        np.dot(auc_kfold["train"], auc_kfold["test"]) / \
        (np.linalg.norm(auc_kfold["train"]) * np.linalg.norm(auc_kfold["test"]))
    alg.logger.info("cross validation: cosine similarity of mse: %.6f" % cos_sim_mse)
    alg.logger.info("cross validation: cosine similarity of auc: %.6f" % cos_sim_auc)
    if cos_sim_mse < valid_threshold or cos_sim_auc < valid_threshold:
        alg.logger.error("cross validation: algorithm is overfit")
        raise Exception("%s cross validation: algorithm is overfit"
                        % alg.__class__.__name__)
    return (np.mean(mse_kfold["test"]), np.mean(auc_kfold["test"]),
            np.mean(threshold_kfold))
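# --- Usage sketch (illustrative, not part of the original code) ---
# Assumes an sklearn-style estimator wrapped to expose the `logger`
# attribute and probability-style predictions the helper expects; the
# `LoggingLR` wrapper and the synthetic dataset below are hypothetical.
import logging

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression


class LoggingLR(LogisticRegression):
    # hypothetical wrapper: adds the logger the CV helper requires
    logger = logging.getLogger("LoggingLR")

    def predict(self, X):
        # return positive-class probabilities so AUC and the
        # F1-optimal threshold search are meaningful
        return super().predict_proba(X)[:, 1]


logging.basicConfig(level=logging.INFO)
X_arr, y_arr = make_classification(n_samples=500, n_features=10, random_state=0)
X_df = pd.DataFrame(X_arr)
y_df = pd.DataFrame({"__label__": y_arr})
mse_cv, auc_cv, thr_cv = kfold_cross_validation(LoggingLR(), X_df, y_df, k=5)
print("cv mse=%.4f auc=%.4f threshold=%.4f" % (mse_cv, auc_cv, thr_cv))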
def fit(self, X, y, *args, **kwargs):
    n_row, n_col = X.shape
    ## convert inputs to scipy sparse matrices
    X = csr_matrix(X, dtype=np.float64)
    y = csr_matrix(y, dtype=np.float64)
    # per-feature scaling: 1 / column-sum, guarding against division by zero
    x_sts = np.matrix(X.transpose().sum(axis=1))
    x_ratio = csr_matrix(
        np.divide(1.0, x_sts, out=np.zeros_like(x_sts), where=x_sts != 0))
    ## init factors
    w0 = self._w0_default
    w = csr_matrix(np.full((n_col, 1), self._w_default))
    ## train model
    mse_value = 1.0
    logloss_value = sys.float_info.max  # sys.maxint no longer exists in Python 3
    auc_value = 0.0
    for i in range(self._max_iter):
        ## prediction error
        delta = self.sig_mod(X, w0, w) - y
        ## gradient of the data term
        descent_w0 = np.sum(delta)
        descent_w = X.transpose().dot(delta)
        ## penalty gradient
        if self._penalty == "L2":
            penalty_w0 = self._C * w0
            penalty_w = self._C * w
        elif self._penalty == "L1":
            # the L1 subgradient is C * sign(w), not a constant C
            penalty_w0 = self._C * np.sign(w0)
            penalty_w = self._C * w.sign()
        else:
            raise Exception("FM must have penalty with one of [L1, L2]")
        ## gradient descent step
        w0 = w0 - self._step * (descent_w0 / float(n_row) + penalty_w0)
        w = w - self._step * (descent_w.multiply(x_ratio) + penalty_w)
        ## evaluate every 10 iterations and on the last one
        if (i + 1) % 10 == 0 or (i + 1) == self._max_iter:
            y_true = y.toarray()
            y_pred = self.sig_mod(X, w0, w).toarray()
            ## MSE
            mse_value_new = mse(y_true, y_pred)
            mse_ratio = (mse_value - mse_value_new) / mse_value
            mse_value = mse_value_new
            ## LogLoss
            logloss_value_new = logloss(y_true, y_pred)
            logloss_ratio = (logloss_value - logloss_value_new) / logloss_value
            logloss_value = logloss_value_new
            ## AUC
            auc_value = auc(y_true, y_pred)
            # self.logger.info("Iterator[%05d]: MSE[%.6f], LogLoss[%.6f], AUC[%.6f]"
            #                  % (i + 1, mse_value, logloss_value, auc_value))
            ## stop when the relative improvement falls below the tolerance
            if mse_ratio <= self._tol or logloss_ratio <= self._tol or (
                    i + 1) == self._max_iter:
                self.logger.info(
                    "Iterator[%05d]: MSE[%.6f], LogLoss[%.6f], AUC[%.6f]"
                    % (i + 1, mse_value, logloss_value, auc_value))
                break
    ## save model
    self._w0 = w0
    self._w = w
    return self
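# `sig_mod` is not defined in this section; from its call sites it
# presumably evaluates sigmoid(w0 + X.w) on the sparse inputs. A minimal
# sketch under that assumption (hypothetical helper, shown standalone):
from scipy.sparse import csr_matrix
import numpy as np


def sig_mod(X, w0, w):
    # X: csr_matrix (n_row, n_col); w0: scalar bias; w: csr_matrix (n_col, 1).
    # Returns per-row probabilities as a csr_matrix, matching how `fit`
    # subtracts the result from the sparse label matrix y.
    linear = X.dot(w).toarray() + w0  # dense (n_row, 1) scores
    return csr_matrix(1.0 / (1.0 + np.exp(-linear)))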
def fit(self, X, y, *args, **kwargs):
    n_row, n_col = X.shape
    ## model parameters: bias, linear weights and factor matrix
    w0 = tf.Variable([self._w0_default], dtype="float32")
    w = tf.Variable(np.full((n_col, 1), self._w_default), dtype="float32")
    v = tf.Variable(np.random.randn(n_col, self._v_length), dtype="float32")
    if self._penalty == "L2":
        penalty_w0 = self._C * tf.pow(w0, 2)
        penalty_w = self._C * tf.pow(w, 2)
        penalty_v = self._C * tf.pow(v, 2)
    elif self._penalty == "L1":
        # the L1 penalty must depend on the weights (C * |w|); a constant
        # tensor would add nothing to the gradient
        penalty_w0 = self._C * tf.abs(w0)
        penalty_w = self._C * tf.abs(w)
        penalty_v = self._C * tf.abs(v)
    else:
        raise Exception("FM must have penalty with one of [L1, L2]")
    xs = tf.constant(np.matrix(X), dtype="float32")
    ys = tf.constant(np.matrix(y), dtype="float32")
    ## FM prediction: sigmoid(w0 + X.w + 0.5 * sum_f[(X.v)^2 - X^2.v^2])
    linear = tf.add(w0, tf.matmul(xs, w))
    nonlinear = tf.reduce_sum(
        tf.pow(tf.matmul(xs, v), 2) - tf.matmul(tf.pow(xs, 2), tf.pow(v, 2)),
        axis=1, keepdims=True)
    y_pred = 1. / (1. + tf.exp(-tf.add(linear, 0.5 * nonlinear)))
    # log_loss = -(ys * tf.log(y_pred) + (1. - ys) * tf.log(1. - y_pred))
    square_loss = tf.pow(tf.subtract(ys, y_pred), 2)
    loss = tf.reduce_mean(square_loss) + tf.reduce_mean(penalty_w0) \
        + tf.reduce_mean(penalty_w) + tf.reduce_mean(penalty_v)
    train_step = tf.train.GradientDescentOptimizer(self._step).minimize(loss)
    init = tf.global_variables_initializer()  # initialize_all_variables is deprecated
    with tf.Session() as sess:
        sess.run(init)
        loss_value = sys.float_info.max  # sys.maxint no longer exists in Python 3
        mse_value = 1.0
        auc_value = 0.0
        for i in range(self._max_iter):
            ys_eval = ys.eval()
            y_pred_eval = y_pred.eval()
            ## LOSS
            loss_value_new = loss.eval()
            loss_ratio = (loss_value - loss_value_new) / loss_value
            loss_value = loss_value_new
            ## MSE
            mse_value_new = mse(ys_eval, y_pred_eval)
            mse_ratio = (mse_value - mse_value_new) / mse_value
            mse_value = mse_value_new
            ## AUC
            auc_value = auc(ys_eval, y_pred_eval)
            if (i + 1) % 100 == 0:
                self.logger.info("Iterator[%05d]: LOSS[%.6f], MSE[%.6f], AUC[%.6f]"
                                 % (i + 1, loss_value, mse_value, auc_value))
            ## stop when the relative improvement falls below the tolerance
            if loss_ratio <= self._tol or mse_ratio <= self._tol \
                    or (i + 1) == self._max_iter:
                self.logger.info("Iterator[%05d]: LOSS[%.6f], MSE[%.6f], AUC[%.6f]"
                                 % (i + 1, loss_value, mse_value, auc_value))
                break
            sess.run(train_step)
        ## save model (eval needs the live session, so stay inside the with-block)
        self._w0 = np.matrix(w0.eval())
        self._w = np.matrix(w.eval())
        self._v = np.matrix(v.eval())
    return self
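# The `nonlinear` term above uses the standard FM identity
#   sum_{i<j} <v_i, v_j> x_i x_j
#     = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2 ],
# which cuts the pairwise-interaction cost from O(n^2 k) to O(n k).
# A quick NumPy sanity check of the identity:
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=5)          # one sample with 5 features
v = rng.normal(size=(5, 3))     # factor matrix, 3 latent dimensions

# brute force: sum over feature pairs i < j of <v_i, v_j> * x_i * x_j
brute = sum(v[i].dot(v[j]) * x[i] * x[j]
            for i in range(5) for j in range(i + 1, 5))

# rewritten form used in the TensorFlow graph above
fast = 0.5 * np.sum(x.dot(v) ** 2 - (x ** 2).dot(v ** 2))

assert np.isclose(brute, fast)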
def run():
    data_home = TASK_DATA_HOME + "/algorithm_test/kaggle/hr_analytics"
    df = pd.read_csv(data_home + "/hr.csv")
    df.reset_index(inplace=True)
    id_col = "index"  # renamed from `id` to avoid shadowing the builtin
    features_continuous = [
        "satisfaction_level",
        "last_evaluation",
        "number_project",
        "average_montly_hours",
        "time_spend_company",
    ]
    features_discrete = [
        "Work_accident", "promotion_last_5years", "sales", "salary"
    ]
    df["__label__"] = df["left"].apply(lambda x: 1 if int(x) == 1 else 0)
    label = "__label__"
    ## encode features: discretize the continuous ones, one-hot the discrete ones
    df_coding = df[[id_col, label]]
    for feature in features_continuous:
        df_coding_f = discretization_coding(df, id_col, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id_col)
    for feature in features_discrete:
        df_coding_f = one_hot_coding(df, id_col, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id_col)
    # materialize the column list: under Python 3 a lazy filter object
    # would be exhausted after its first use
    x_columns = [c for c in df_coding.columns if c not in (id_col, label)]
    y_columns = [label]
    df_train, df_test = train_test_split(df_coding, test_size=0.2)
    X_train = df_train[x_columns].reset_index(drop=True)
    X_test = df_test[x_columns].reset_index(drop=True)
    y_train = df_train[[label]].reset_index(drop=True)
    y_test = df_test[[label]].reset_index(drop=True)
    ## compare lr, gbdt, fm and deep learning
    print("############ lr ############")
    algo_lr = LogisticRegressionAlgorithm()
    mse_lr, auc_lr, threshold_lr = kfold_cross_validation(
        algo_lr, X_train, y_train, 5)
    algo_lr = algo_lr.fit(X_train, y_train)
    y_predict_lr = pd.DataFrame(algo_lr.predict(X_test), columns=["__label__"])
    y_predict_lr["__class__"] = y_predict_lr["__label__"].apply(
        lambda x: 1 if x >= threshold_lr else 0)
    print("lr : mse[%.6f], auc[%.6f]" % (
        mse(y_test, y_predict_lr["__label__"]),
        auc(y_test, y_predict_lr["__label__"])))
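# `discretization_coding` and `one_hot_coding` are project helpers not
# shown in this section; judging by how their outputs are merged back on
# the id column, each presumably returns indicator columns keyed by that
# id. A minimal sketch of the one-hot case under that assumption (the
# body below is hypothetical, not the project's implementation):
import pandas as pd


def one_hot_coding(df, id_col, feature):
    # one 0/1 indicator column per category of `feature`, keyed by
    # `id_col` so the result can be merged onto the coded frame
    dummies = pd.get_dummies(df[feature], prefix=feature).astype(int)
    return pd.concat([df[[id_col]], dummies], axis=1)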
# ("item") is just the string "item"; a one-element tuple needs a trailing comma
y_columns = [c for c in df_item_coding.columns if c not in ("item",)]
df_train, df_test = train_test_split(df_data, test_size=0.2)
label = "Action"
X_train = df_train[x_columns].reset_index(drop=True)
X_test = df_test[x_columns].reset_index(drop=True)
y_train = df_train[[label]].reset_index(drop=True)
y_test = df_test[[label]].reset_index(drop=True)

## compare lr, gbdt, fm and deep learning
print("############ lr ############")
algo_lr = LogisticRegressionAlgorithm()
algo_lr = algo_lr.fit(X_train, y_train)
y_predict_lr = pd.DataFrame(algo_lr.predict(X_test), columns=["__label__"])
print("lr : mse[%.6f], auc[%.6f]" % (
    mse(y_test, y_predict_lr), auc(y_test, y_predict_lr)))

print("############ gbdt ############")
algo_gbdt = GradientBoostingRegressor()
algo_gbdt = algo_gbdt.fit(X_train, y_train)
y_predict_gbdt = pd.DataFrame(algo_gbdt.predict(X_test), columns=["__label__"])
print("gbdt: mse[%.6f], auc[%.6f]" % (
    mse(y_test, y_predict_gbdt), auc(y_test, y_predict_gbdt)))

print("############ fm ############")
algo_fm = FactorizationMachineAlgorithm()
algo_fm = algo_fm.fit(X_train, y_train)
y_predict_fm = pd.DataFrame(algo_fm.predict(X_test), columns=["__label__"])
print("fm : mse[%.6f], auc[%.6f]" % (
    mse(y_test, y_predict_fm), auc(y_test, y_predict_fm)))
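# Caveat when reusing this comparison: sklearn regressors expect a 1-d
# target, so fitting GradientBoostingRegressor on the single-column
# y_train DataFrame triggers a DataConversionWarning ("A column-vector y
# was passed..."). Passing a flat array avoids it; a small variant
# (variable names here are illustrative):
algo_gbdt_flat = GradientBoostingRegressor()
algo_gbdt_flat = algo_gbdt_flat.fit(X_train, y_train[label].values)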
def run():
    data_home = TASK_DATA_HOME + "/algorithm_test/kaggle/creditcardfraud"
    df = pd.read_csv(data_home + "/creditcard.sub.csv")
    df.reset_index(inplace=True)
    id_col = "index"  # renamed from `id` to avoid shadowing the builtin
    features_continuous = [c for c in df.columns.tolist()
                           if c not in ("Class", id_col)]
    features_discrete = []
    df["__label__"] = df["Class"].apply(lambda x: 1 if int(x) == 1 else 0)
    label = "__label__"
    ## encode features
    df_coding = df[[id_col, label]]
    for feature in features_continuous:
        df_coding_f = discretization_coding(df, id_col, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id_col)
    for feature in features_discrete:
        df_coding_f = one_hot_coding(df, id_col, feature)
        df_coding = df_coding.merge(df_coding_f, how="inner", on=id_col)
    # shuffle the rows once before splitting
    df_coding = df_coding.sample(frac=1).reset_index(drop=True)
    x_columns = [c for c in df_coding.columns if c not in (id_col, label)]
    y_columns = [label]
    X = df_coding[x_columns]
    y = df_coding[y_columns]
    ## stratified split by hand: the fraud class is rare, so split the
    ## positive and negative indices separately to preserve the class ratio
    index_pos = y[y["__label__"] == 1].index.tolist()
    index_neg = y[y["__label__"] == 0].index.tolist()
    index_pos_train, index_pos_test = train_test_split(index_pos, test_size=0.2)
    index_neg_train, index_neg_test = train_test_split(index_neg, test_size=0.2)
    X_train = X.loc[index_pos_train + index_neg_train, :]
    X_test = X.loc[index_pos_test + index_neg_test, :]
    y_train = y.loc[index_pos_train + index_neg_train, :]
    y_test = y.loc[index_pos_test + index_neg_test, :]

    # print("############ lr ############")
    # algo_lr = LogisticRegressionAlgorithm()
    # algo_lr = algo_lr.fit(X_train, y_train)
    # y_predict_lr = pd.DataFrame(algo_lr.predict(X_test), columns=["__label__"])
    # print("lr : mse[%.6f], auc[%.6f]" % (mse(y_test, y_predict_lr), auc(y_test, y_predict_lr)))
    #
    # print("############ gbdt ############")
    # algo_gbdt = GradientBoostingRegressor()
    # algo_gbdt = algo_gbdt.fit(X_train, y_train)
    # y_predict_gbdt = pd.DataFrame(algo_gbdt.predict(X_test), columns=["__label__"])
    # print("gbdt: mse[%.6f], auc[%.6f]" % (mse(y_test, y_predict_gbdt), auc(y_test, y_predict_gbdt)))
    #
    # print("############ fm ############")
    # algo_fm = FactorizationMachineAlgorithm()
    # algo_fm = algo_fm.fit(X_train, y_train)
    # y_predict_fm = pd.DataFrame(algo_fm.predict(X_test), columns=["__label__"])
    # print("fm : mse[%.6f], auc[%.6f]" % (mse(y_test, y_predict_fm), auc(y_test, y_predict_fm)))

    print("############ lp ############")
    algo_lp = LittleProbabilityModel(n_jobs=2)
    kfold_cross_validation(algo_lp, X_train, y_train, 5)
    algo_lp = algo_lp.fit(X_train, y_train)
    y_predict_lp = pd.DataFrame(algo_lp.predict(X_test), columns=["__label__"])
    print("lp : mse[%.6f], auc[%.6f]" % (
        mse(y_test, y_predict_lp), auc(y_test, y_predict_lp)))
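# For reference, the hand-rolled positive/negative split in run() can be
# reproduced in one call with train_test_split's `stratify` parameter,
# which keeps the rare positive class at the same ratio in both splits.
# Minimal self-contained demonstration on synthetic data:
def _stratify_demo():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    y_demo = pd.DataFrame({"__label__": np.r_[np.ones(10), np.zeros(990)]})
    X_demo = pd.DataFrame({"f": np.arange(1000)})
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_demo, y_demo, test_size=0.2, stratify=y_demo["__label__"])
    # both splits keep roughly 1% positives
    print(y_tr["__label__"].mean(), y_te["__label__"].mean())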