# Compare three classifiers (LogisticRegression, RandomForest, XGBoost)
# on the MNIST CSV data and report train/test accuracy for each.
from Data import DataExtract

X_train, X_test, Y_train, Y_test = DataExtract.load_minist_csv()

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

models = [
    # Cs: number of inverse-regularization strengths tried (smaller C means
    # stronger regularization); cv=5: 5-fold cross-validation.
    ('LogisticRegression', LogisticRegressionCV(Cs=10, cv=5)),
    # criterion defaults to 'gini'
    ('RandomForest', RandomForestClassifier(n_estimators=50)),
    # FIX: the `silent` parameter was deprecated and later removed from
    # xgboost; `verbosity=0` is the supported way to suppress logging.
    ('XGBoost', XGBClassifier(max_depth=3, n_estimators=100, verbosity=0,
                              objective='multi:softmax')),
]

for name, model in models:
    model.fit(X_train, Y_train)
    print(name, '训练集正确率:', model.score(X_train, Y_train))
    print(name, '测试集正确率:', model.score(X_test, Y_test))

# Observed results:
# LogisticRegression 训练集正确率: 0.9294345238095238
# LogisticRegression 测试集正确率: 0.9158333333333334
# RandomForest 训练集正确率: 1.0
# RandomForest 测试集正确率: 0.8848809523809524
# XGBoost 训练集正确率: 0.9670238095238095
# XGBoost 测试集正确率: 0.9277380952380953
# Build a one-hidden-layer MLP (keras Sequential) for MNIST classification.
# NOTE(review): this chunk only constructs the model; compile/fit appear to
# follow outside this view.
import numpy as np
from datetime import datetime
from Data import DataExtract, DataTransform
from Minist.Common.Util import error_rate
from sklearn.utils import shuffle

# Benchmark notes from previous runs:
# keras test error rate 0.036071.
# Elapsed time for keras rmsprop: 0:00:25.107859
# tensorflow test error rate 0.025238.
# Elapsed time for tensorflow rmsprop: 0:08:09.270659

# Load raw pixels without PCA so the input dimension matches the Dense layer.
X_train, X_test, Y_train, Y_test = DataExtract.load_minist_csv(pca=False)

class_num = 10  # 10 digit classes
# One-hot encode the labels for a softmax/cross-entropy style output.
Y_train_onehot = DataTransform.y2one_hot(Y_train, class_num=class_num)
Y_test_onehot = DataTransform.y2one_hot(Y_test, class_num=class_num)

N, D = X_train.shape  # N samples, D input features
M = 512               # hidden-layer width
batch_size = 300
epochs = 50

##### keras #####
from keras.models import Sequential
from keras.layers import Dense

# Shapes: input (N, D); W1: (D, M); W2: (M, class_num)
model = Sequential()
# input_dim = D; units is the number of outputs of the layer
model.add(Dense(units=M, input_shape=(D, ), activation='relu'))
# NOTE(review): no activation on the output layer — presumably the loss is
# configured with logits (or softmax added later); confirm downstream.
model.add(Dense(units=class_num))
# Cluster MNIST digits with a 10-component Gaussian mixture and evaluate
# the soft assignments with purity and the Davies-Bouldin index.
from Data import DataExtract
from sklearn.mixture import GaussianMixture
from Data.DataTransform import purity, DBI

X_train, _, Y_train, _ = DataExtract.load_minist_csv()

gmm = GaussianMixture(n_components=10)
gmm.fit(X_train)

# Fitted component parameters: means are (components x features).
cluster_means = gmm.means_
cluster_covs = gmm.covariances_

# Soft responsibilities: posterior probability of each component per sample,
# shape (samples x components).
responsibilities = gmm.predict_proba(X_train)

# Purity: agreement between soft cluster assignments and true labels.
print("Purity:", purity(Y_train, responsibilities))
# DBI: ratio of within-cluster spread to between-cluster separation —
# lower is better.
print("DBI:", DBI(X_train, cluster_means, responsibilities))