Example #1
def write_result(c, c_type):
    file_name = wkzutils.get_path_sources("14.Titanic.test.csv")
    x, passenger_id = load_data(file_name, False)

    if c_type == 3:  # classifier id 3 is the xgboost model, which expects a DMatrix
        x = xgb.DMatrix(x)
    y = c.predict(x)
    # threshold the predicted probabilities at 0.5 into hard 0/1 labels
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    predictions_file = open("Prediction_%d.csv" % c_type, "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()
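
A Python 2 detail worth noting: the csv output file is opened in 'wb' mode, which is the correct idiom for the csv module under Python 2; under Python 3 the equivalent would be open(..., 'w', newline='').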
Example #2
def log_reg(y_hat, y):  # function name assumed; the excerpt starts mid-definition
    p = 1.0 / (1.0 + np.exp(-y_hat))
    # first derivative (gradient) of the objective J(f); the derivative is user-supplied here
    g = p - y.get_label()
    # second derivative (Hessian) of J(f); h is also the first derivative of p
    h = p * (1.0 - p)
    return g, h
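
# Note: g and h are the standard gradient statistics of the binary log-loss.
# With p = sigmoid(y_hat) and L = -[y*ln(p) + (1-y)*ln(1-p)], differentiating
# with respect to the raw score gives dL/dy_hat = p - y and
# d^2L/dy_hat^2 = p*(1-p), which are exactly the two lines above.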


def error_rate(y_hat, y):
    # fraction of samples whose 0.5-thresholded prediction disagrees with the true label
    return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat)


if __name__ == "__main__":
    # load the data
    data_train = xgb.DMatrix(
        wkzutils.get_path_sources("14.agaricus_train.txt"))
    data_test = xgb.DMatrix(wkzutils.get_path_sources("14.agaricus_test.txt"))
    print data_train
    print type(data_train)

    # Parameters: max_depth is the tree depth; eta is the learning rate / shrinkage v
    # (v < 0.1 is recommended: it guards against overfitting at the cost of more boosting rounds);
    # silent controls whether extra log output is suppressed; objective is the loss
    # function (binary classification here)
    param = {
        'max_depth': 3,
        'eta': 0.1,
        'silent': 0,
        'objective': 'binary:logistic'
    }  # alternative objective: binary:logitraw (raw, pre-sigmoid scores)
    # param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'}
    # 训练数据和测试数据一起封闭成元组
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
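
    # The excerpt stops at the watchlist. A minimal sketch of the training call
    # these objects typically feed into; num_boost_round is an assumed value,
    # and log_reg/error_rate are the custom functions defined above.
    n_round = 7
    bst = xgb.train(param, data_train, num_boost_round=n_round,
                    evals=watchlist, obj=log_reg, feval=error_rate)
    y_hat = bst.predict(data_test)
    print error_rate(y_hat, data_test)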
Example #3
    print tip + 'accuracy: %.2f%%' % (100 * np.mean(acc))


def save_image(im, i):
    # optdigits pixels lie in [0, 16]; 15.9375 = 255/16 rescales them to [0, 255]
    im *= 15.9375
    im = 255 - im  # invert so the digit is dark on a light background
    a = im.astype(np.uint8)
    output_path = wkzutils.get_path_target("HandWritten")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    Image.fromarray(a).save(os.path.join(output_path, '%d.png' % i))


if __name__ == "__main__":
    print 'Load Training File Start...'
    data = np.loadtxt(wkzutils.get_path_sources("16.optdigits.tra"),
                      dtype=np.float,
                      delimiter=',')
    x, y = np.split(data, (-1, ), axis=1)
    images = x.reshape(-1, 8, 8)
    y = y.ravel().astype(np.int)

    print 'Load Test Data Start...'
    data = np.loadtxt(wkzutils.get_path_sources("16.optdigits.tes"),
                      dtype=np.float,
                      delimiter=',')
    x_test, y_test = np.split(data, (-1, ), axis=1)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(np.int)
    print 'Load Data OK...'
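
    # The excerpt ends once the data is loaded. A minimal sketch of a classifier
    # fit on these arrays; the import and the SVC hyperparameters are assumptions,
    # not taken from the excerpt.
    from sklearn import svm

    model = svm.SVC(C=10, kernel='rbf', gamma=0.001)
    model.fit(x, y)
    y_hat = model.predict(x_test)
    print 'test accuracy: %.2f%%' % (100 * np.mean(y_hat == y_test))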
Example #4
from utils import wkzutils


def iris_type(s):
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


iris_feature = u'sepal length', u'sepal width', u'petal length', u'petal width'

if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = [u'SimHei']  # SimHei; FangSong/KaiTi also work
    mpl.rcParams['axes.unicode_minus'] = False

    path = wkzutils.get_path_sources("10.iris.data")  # path to the data file
    data = np.loadtxt(path,
                      dtype=float,
                      delimiter=',',
                      converters={4: iris_type})
    x_prime, y = np.split(data, (4, ), axis=1)

    feature_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
    plt.figure(figsize=(10, 9), facecolor='#FFFFFF')
    for i, pair in enumerate(feature_pairs):
        # select the two features of the current pair
        x = x_prime[:, pair]

        # fit a decision tree on the selected feature pair
        clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
        dt_clf = clf.fit(x, y)
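
        # The loop body is cut off after the fit. A plausible continuation that
        # scores each feature pair; the excerpt never holds out a test split,
        # so this is accuracy on the training data.
        y_hat = dt_clf.predict(x)
        acc = np.mean(y_hat == y.ravel())
        print '%s + %s: accuracy %.2f%%' % (iris_feature[pair[0]],
                                            iris_feature[pair[1]], 100 * acc)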
Example #5
            values.append(float(value))
        r += 1
    # scipy.sparse.csr_matrix((values, (row, col))) builds a sparse matrix;
    # .toarray() converts it to a dense ndarray
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()
    y = np.array(y)
    return x, y


def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    print acc
    print tip + 'accuracy:\t', float(acc.sum()) / a.size


if __name__ == '__main__':
    x, y = read_data(wkzutils.get_path_sources("14.agaricus_train.txt"))
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=1,
                                                        train_size=0.6)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic Regression ')

    # XGBoost
    # remap class label 3 to 0 so the targets are the {0, 1} values binary classification expects
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
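
    # The excerpt ends at the training matrix. A minimal sketch of the matching
    # training and evaluation step; the parameter values and round count are
    # assumptions, not taken from the excerpt.
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic'}
    bst = xgb.train(param, data_train, num_boost_round=4)
    y_hat = bst.predict(xgb.DMatrix(x_test))
    show_accuracy((y_hat > 0.5).astype(int), y_test, 'XGBoost ')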
Example #6
    if c_type == 3:  # classifier id 3 is the xgboost model, which expects a DMatrix
        x = xgb.DMatrix(x)
    y = c.predict(x)
    # threshold the predicted probabilities at 0.5 into hard 0/1 labels
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    predictions_file = open("Prediction_%d.csv" % c_type, "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()


if __name__ == "__main__":
    x, y = load_data(wkzutils.get_path_sources("14.Titanic.train.csv"), True)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=1)
    #
    # lr = LogisticRegression(penalty='l2')
    # lr.fit(x_train, y_train)
    # y_hat = lr.predict(x_test)
    # lr_rate = show_accuracy(y_hat, y_test, 'Logistic Regression ')
    # # write_result(lr, 1)

    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_rate = show_accuracy(y_hat, y_test, 'Random Forest ')
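
    # Mirroring the commented-out write_result(lr, 1) call above, a submission
    # file for the random forest could then be written; the classifier id 2 is
    # an arbitrary choice.
    write_result(rfc, 2)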
Example #7
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from utils import wkzutils

"葡萄酒识别分类"


def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    print acc
    print tip + 'accuracy:\t', float(acc.sum()) / a.size


if __name__ == "__main__":
    data = np.loadtxt(wkzutils.get_path_sources("14.wine.data"),
                      dtype=float,
                      delimiter=',')
    y, x = np.split(data, (1, ), axis=1)  # the first column is the class label
    # x = StandardScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=1,
                                                        test_size=0.5)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic Regression ')
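
    # Logistic regression converges more reliably on standardized features; the
    # commented-out StandardScaler line above can be folded into a pipeline.
    # A minimal sketch; the Pipeline import is an addition, not from the excerpt.
    from sklearn.pipeline import Pipeline

    lr_scaled = Pipeline([('sc', StandardScaler()),
                          ('clf', LogisticRegression(penalty='l2'))])
    lr_scaled.fit(x_train, y_train.ravel())
    show_accuracy(lr_scaled.predict(x_test), y_test, 'Logistic Regression (scaled) ')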
Example #8
def save_result(model):
    data_test_hat = model.predict(data_test)
    with open('Prediction.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['ImageId', 'Label'])
        # ImageId is 1-based, matching the commented-out writerows variant below
        for i, d in enumerate(data_test_hat, 1):
            writer.writerow([i, d])
        # writer.writerows(zip(np.arange(1, len(data_test_hat) + 1), data_test_hat))


if __name__ == "__main__":
    classifier_type = 'SVM'

    print 'Loading training data...'
    t = time()
    data = pd.read_csv(wkzutils.get_path_sources("16.MNIST.train.csv"),
                       header=0,
                       dtype=np.int)
    print 'Loading finished in %f seconds' % (time() - t)
    y = data['label'].values
    x = data.values[:, 1:]
    print 'number of images: %d, pixels per image: %d' % x.shape
    images = x.reshape(-1, 28, 28)
    y = y.ravel()

    print 'Loading test data...'
    t = time()
    data_test = pd.read_csv(wkzutils.get_path_sources("16.MNIST.test.csv"),
                            header=0,
                            dtype=np.int)
    data_test = data_test.values
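
    # The excerpt ends after loading the test matrix. A minimal sketch of the
    # branch that classifier_type = 'SVM' selects; the import and the
    # hyperparameters are assumptions, not taken from the excerpt.
    from sklearn import svm

    if classifier_type == 'SVM':
        model = svm.SVC(C=10, gamma=0.001)  # hypothetical hyperparameters
        model.fit(x, y)  # training on the full MNIST train set is slow
        save_result(model)  # writes Prediction.csv via the helper above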
Example #9
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from utils import wkzutils


def iris_type(s):
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    data = np.loadtxt(wkzutils.get_path_sources("10.iris.data"),
                      dtype=float,
                      delimiter=',',
                      converters={4: iris_type})
    print data
    x, y = np.split(data, (4, ), axis=1)
    x = x[:, :2]
    print x
    print y

    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

    gnb = Pipeline([('sc', StandardScaler()), ('clf', GaussianNB())])
    # gnb = Pipeline([
    #     ('sc', MinMaxScaler()),    # MultinomialNB requires non-negative features
    #     ('clf', MultinomialNB())])
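
    # The excerpt stops at the pipeline definition. A minimal sketch of the
    # fit-and-score step that would normally follow, using the split above.
    gnb.fit(x_train, y_train.ravel())
    y_hat = gnb.predict(x_test)
    print 'GaussianNB accuracy: %.3f' % np.mean(y_hat == y_test.ravel())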