def write_result(c, c_type):
    """Predict on the Titanic test file and write the result as a CSV.

    Parameters
    ----------
    c
        Model object; its ``predict(x)`` method is called on the test
        features.
    c_type : int
        Model kind. It becomes part of the output file name
        (``Prediction_<c_type>.csv``); the value 3 selects wrapping the
        features in an ``xgb.DMatrix`` before predicting.

    The output has a ``PassengerId,Survived`` header followed by one row
    per test passenger, written to the current working directory.
    """
    file_name = wkzutils.get_path_sources("14.Titanic.test.csv")
    x, passenger_id = load_data(file_name, False)

    # Bug fix: the original tested the builtin ``type`` against 3, which is
    # always False, so the DMatrix conversion never happened.
    if c_type == 3:
        x = xgb.DMatrix(x)

    y = c.predict(x)
    # Binarise the predictions at 0.5.
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    # "wb" is kept from the original (the mode the Python 2 csv module
    # expects); the context manager closes the file even if a write fails.
    with open("Prediction_%d.csv" % c_type, "wb") as predictions_file:
        writer = csv.writer(predictions_file)
        writer.writerow(["PassengerId", "Survived"])
        writer.writerows(zip(passenger_id, y))
p = 1.0 / (1.0 + np.exp(-y_hat)) # 目标函数J(f)的一阶导【这里导函数是自己设置】 g = p - y.get_label() # 目标函数J(f)的二阶导【这里导函数是自己设置(h实际是p的一阶导函数)】 h = p * (1.0 - p) return g, h def error_rate(y_hat, y): return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat) if __name__ == "__main__": # 读取数据 data_train = xgb.DMatrix( wkzutils.get_path_sources("14.agaricus_train.txt")) data_test = xgb.DMatrix(wkzutils.get_path_sources("14.agaricus_test.txt")) print data_train print type(data_train) # 设置参数max_depth树深度,eta防止过拟合(衰减因子里面学习率v,推荐v<0.1,可以防止过拟合,但会造成计算次数增多), # silent是否输出其它多余信息,objective目标函数(二分类问题) param = { 'max_depth': 3, 'eta': 0.1, 'silent': 0, 'objective': 'binary:logistic' } # logitraw # param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'} # 训练数据和测试数据一起封闭成元组 watchlist = [(data_test, 'eval'), (data_train, 'train')]
print tip + '正确率:%.2f%%' % (100 * np.mean(acc)) def save_image(im, i): im *= 15.9375 im = 255 - im a = im.astype(np.uint8) output_path = wkzutils.get_path_target("HandWritten") if not os.path.exists(output_path): os.mkdir(output_path) Image.fromarray(a).save(output_path + ('\\%d.png' % i)) if __name__ == "__main__": print 'Load Training File Start...' data = np.loadtxt(wkzutils.get_path_sources("16.optdigits.tra"), dtype=np.float, delimiter=',') x, y = np.split(data, (-1, ), axis=1) images = x.reshape(-1, 8, 8) y = y.ravel().astype(np.int) print 'Load Test Data Start...' data = np.loadtxt(wkzutils.get_path_sources("16.optdigits.tes"), dtype=np.float, delimiter=',') x_test, y_test = np.split(data, (-1, ), axis=1) images_test = x_test.reshape(-1, 8, 8) y_test = y_test.ravel().astype(np.int) print 'Load Data OK...'
from utils import wkzutils


def iris_type(s):
    """Map an iris class name to its integer code.

    'Iris-setosa' -> 0, 'Iris-versicolor' -> 1, 'Iris-virginica' -> 2.
    Any other value raises ``KeyError``.
    """
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


# 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'

if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = [u'SimHei']  # SimHei typeface; alternatives: FangSong/KaiTi
    mpl.rcParams['axes.unicode_minus'] = False

    path = wkzutils.get_path_sources("10.iris.data")  # path of the data file
    # Column 4 holds the class name; iris_type converts it to a number.
    data = np.loadtxt(path,
                      dtype=float,
                      delimiter=',',
                      converters={4: iris_type})
    # First four columns are the features, the remainder is the label.
    x_prime, y = np.split(data, (4, ), axis=1)

    # Every pair of distinct feature columns.
    feature_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
    plt.figure(figsize=(10, 9), facecolor='#FFFFFF')
    for i, pair in enumerate(feature_pairs):
        # Prepare the data: keep only the two columns of this pair.
        x = x_prime[:, pair]

        # Fit a decision tree.
        clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
        dt_clf = clf.fit(x, y)
# NOTE(review): the statements up to ``return x, y`` are the tail of a function
# whose header and loops lie above this chunk; the nesting shown for the first
# two statements is inferred -- confirm against the full file.
            values.append(float(value))
        r += 1
    # scipy.sparse.csr_matrix((values, (row, col))) is the sparse data;
    # .toarray() turns it into dense data.
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()
    y = np.array(y)
    return x, y


def show_accuracy(a, b, tip):
    """Print the element-wise match array of ``a`` and ``b`` and its ratio.

    Both inputs are flattened with ``ravel()`` before comparison. The second
    print shows ``tip`` followed by the number of matches divided by
    ``a.size``. Nothing is returned.
    """
    acc = a.ravel() == b.ravel()
    print acc
    print tip + '正确率:\t', float(acc.sum()) / a.size


if __name__ == '__main__':
    x, y = read_data(wkzutils.get_path_sources("14.agaricus_train.txt"))
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=1,
                                                        train_size=0.6)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic回归 ')

    # XGBoost
    # Relabel class 3 as 0 in both splits before building the DMatrix.
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
if type == 3: x = xgb.DMatrix(x) y = c.predict(x) y[y > 0.5] = 1 y[~(y > 0.5)] = 0 predictions_file = open("Prediction_%d.csv" % c_type, "wb") open_file_object = csv.writer(predictions_file) open_file_object.writerow(["PassengerId", "Survived"]) open_file_object.writerows(zip(passenger_id, y)) predictions_file.close() if __name__ == "__main__": x, y = load_data(wkzutils.get_path_sources("14.Titanic.train.csv"), True) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1) # # lr = LogisticRegression(penalty='l2') # lr.fit(x_train, y_train) # y_hat = lr.predict(x_test) # lr_rate = show_accuracy(y_hat, y_test, 'Logistic回归 ') # # write_result(lr, 1) rfc = RandomForestClassifier(n_estimators=100) rfc.fit(x_train, y_train) y_hat = rfc.predict(x_test) rfc_rate = show_accuracy(y_hat, y_test, '随机森林 ')
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from utils import wkzutils

# The bare string below is the script's title: "wine recognition / classification".
"葡萄酒识别分类"


def show_accuracy(a, b, tip):
    """Print the element-wise match array of ``a`` and ``b`` and its ratio.

    Both inputs are flattened with ``ravel()`` before comparison. The second
    print shows ``tip`` followed by the number of matches divided by
    ``a.size``. Nothing is returned.
    """
    acc = a.ravel() == b.ravel()
    print acc
    print tip + '正确率:\t', float(acc.sum()) / a.size


if __name__ == "__main__":
    data = np.loadtxt(wkzutils.get_path_sources("14.wine.data"),
                      dtype=float,
                      delimiter=',')
    # The first column is the label, the remaining columns are the features.
    y, x = np.split(data, (1, ), axis=1)
    # x = StandardScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=1,
                                                        test_size=0.5)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic回归 ')
def save_result(model): data_test_hat = model.predict(data_test) with open('Prediction.csv', 'wb') as f: writer = csv.writer(f) writer.writerow(['ImageId', 'Label']) for i, d in enumerate(data_test_hat): writer.writerow([i, d]) # writer.writerows(zip(np.arange(1, len(data_test_hat) + 1), data_test_hat)) if __name__ == "__main__": classifier_type = 'SVM' print '载入训练数据...' t = time() data = pd.read_csv(wkzutils.get_path_sources("16.MNIST.train.csv"), header=0, dtype=np.int) print '载入完成,耗时%f秒' % (time() - t) y = data['label'].values x = data.values[:, 1:] print '图片个数:%d,图片像素数目:%d' % x.shape images = x.reshape(-1, 28, 28) y = y.ravel() print '载入测试数据...' t = time() data_test = pd.read_csv(wkzutils.get_path_sources("16.MNIST.test.csv"), header=0, dtype=np.int) data_test = data_test.values
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from utils import wkzutils


def iris_type(s):
    """Map an iris class name to its integer code.

    'Iris-setosa' -> 0, 'Iris-versicolor' -> 1, 'Iris-virginica' -> 2.
    Any other value raises ``KeyError``.
    """
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    # Column 4 holds the class name; iris_type converts it to a number.
    data = np.loadtxt(wkzutils.get_path_sources("10.iris.data"),
                      dtype=float,
                      delimiter=',',
                      converters={4: iris_type})
    print data
    # First four columns are the features, the remainder is the label.
    x, y = np.split(data, (4, ), axis=1)
    # Keep only the first two feature columns.
    x = x[:, :2]
    print x
    print y
    # NOTE(review): the return value of this call is discarded, so x and y
    # are not actually split here -- confirm whether the result was meant to
    # be assigned.
    train_test_split(x, y, train_size=0.8)

    gnb = Pipeline([('sc', StandardScaler()), ('clf', GaussianNB())])
    # gnb = Pipeline([
    #     ('sc', MinMaxScaler()),
    #     ('clf', MultinomialNB())])