from sklearn.tree import DecisionTreeClassifier

import utils
import numpy as np
import matplotlib.pyplot as plot

data = x_train, x_test, y_train, y_test = utils.import_wine(n_samples=5000,
                                                            y_transform=None)

# WITHOUT TUNING

base_clf = DecisionTreeClassifier(random_state=0,
                                  criterion='entropy',
                                  max_leaf_nodes=20000,
                                  max_depth=50)
utils.boosting(*data, base_clf, verbose=True)
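
# For reference, a hedged sketch of what a helper like utils.boosting could
# wrap (assumption: it boosts the supplied base estimator with AdaBoost and
# reports train/test accuracy; utils is project-specific and not shown here).
from sklearn.ensemble import AdaBoostClassifier


def boosting_sketch(x_train, x_test, y_train, y_test, base, **kwargs):
    # base_estimator was renamed to estimator in scikit-learn 1.2.
    clf = AdaBoostClassifier(base_estimator=base, **kwargs)
    clf.fit(x_train, y_train.values.ravel())
    print('train accuracy:', clf.score(x_train, y_train.values.ravel()))
    print('test accuracy:', clf.score(x_test, y_test.values.ravel()))
    return clf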

# 10-FOLD CROSS-VALIDATION ON THE LEARNING RATE

scores = []
learning_rates = []

for learning_rate in np.linspace(1.3, 1.45, 5):
    print(learning_rate)
    learning_rates.append(learning_rate)
    scores.append(
        utils.boosting_crossval(*data, base_clf,
                                learning_rate=learning_rate)[1])
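
# Sketch of the assumed utils.boosting_crossval: 10-fold cross-validation of
# an AdaBoost ensemble at the given learning rate. The (fold scores, mean)
# return shape is an assumption based on the [1] indexing above.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score


def boosting_crossval_sketch(x_train, x_test, y_train, y_test, base,
                             learning_rate=1.0):
    clf = AdaBoostClassifier(base_estimator=base, learning_rate=learning_rate)
    fold_scores = cross_val_score(clf, x_train, y_train.values.ravel(), cv=10)
    return fold_scores, fold_scores.mean()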

plot.style.use('seaborn-darkgrid')
plot.title('Influence of the learning rate on boosting')
plot.plot(learning_rates, scores)  # plotting tail reconstructed (assumption)
plot.xlabel('Learning rate')
plot.ylabel('Mean 10-fold cross-validation score')
plot.show()

Example #2

from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plot
import utils

# The start of this example was lost in extraction; the imports, data load
# and k range here are reconstructed assumptions.
data = x_train, x_test, y_train, y_test = utils.import_wine(n_samples=5000,
                                                            y_transform=None)

# INFLUENCE OF N_NEIGHBORS

for k in range(1, 30):
    utils.knn(*data, n_neighbors=k)
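
# Sketch of the assumed utils.knn (utils is project-specific and not shown):
# fit a KNeighborsClassifier with the given options and report train/test
# accuracy.
def knn_sketch(x_train, x_test, y_train, y_test, **kwargs):
    clf = KNeighborsClassifier(**kwargs)
    clf.fit(x_train, y_train.values.ravel())
    print('train accuracy:', clf.score(x_train, y_train.values.ravel()))
    print('test accuracy:', clf.score(x_test, y_test.values.ravel()))
    return clf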

# INFLUENCE OF THE WEIGHTS

utils.knn(*data, n_neighbors=20, weights='distance')

# INFLUENCE OF THE METRICS

metrics = ['manhattan', 'chebyshev']

for metric in metrics:
    utils.knn(*data, n_neighbors=20, metric=metric)

# BEST MODEL

data = x_train, x_test, y_train, y_test = utils.import_wine(y_transform=None)

utils.knn(*data, n_neighbors=20)

# LEARNING CURVE

clf = KNeighborsClassifier(n_neighbors=20)
x = []
train = []
test = []
for i in [0.02, 0.1, 1, 2, 3, 5, 10, 15, 25, 32]:
    index = int(1000 * i)
    x.append(index)
    clf.fit(x_train[:index], y_train[:index].values.ravel())
    train.append(clf.score(x_train[:index], y_train[:index].values.ravel()))
    test.append(clf.score(x_test, y_test.values.ravel()))
plot.plot(x, train, label='train')  # plotting tail reconstructed (assumption)
plot.plot(x, test, label='test')
plot.xlabel('Number of training samples')
plot.ylabel('Mean accuracy')
plot.legend()
plot.show()

Example #3

from sklearn.cluster import KMeans

import matplotlib.pyplot as plot
import numpy as np
import utils


def plot_cataccs(cataccs, color, offset=1.0):
    # Head reconstructed (assumption, hypothetical name): the original start
    # of this helper was lost in extraction. It bar-charts categorical
    # accuracy per feature-transformation method.
    x = []
    count = offset
    for _ in range(len(cataccs)):
        x.append(count)
        count += 1
    plot.bar(x, cataccs, color=color, width=0.75)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Categorical accuracy (%)')
    plot.show()


x_adult, y_adult, x_adult_test, y_adult_test = utils.import_adult()
x_wine, y_wine, x_wine_test, y_wine_test = utils.import_wine()

# K-MEANS
# (The silhouette, cluster_breakdown and cataccs helpers called below were
# defined in the truncated part of this file; hedged sketches of the first
# and last follow the wine runs.)

silhouette('adult', range(2, 15), x_adult)  # 1025s
cluster_breakdown('adult_kmeans', x_adult,
                  KMeans(n_clusters=5,
                         random_state=0).fit_predict(x_adult))  # 10s
cataccs('adult', range(2, 15), 2, x_adult, y_adult)  # 91s

silhouette('wine', range(2, 15), x_wine)  # 2154s
cluster_breakdown('wine_kmeans', x_wine,
                  KMeans(n_clusters=3,
                         random_state=0).fit_predict(x_wine))  # 24s
cataccs('wine reviews', range(2, 15), 5, x_wine, y_wine)  # 577s
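
# Hedged sketches of two of the assumed helpers used above (their definitions
# were lost in truncation): silhouette presumably sweeps k and plots the
# silhouette score; cataccs presumably maps each cluster to its majority class
# and reports the resulting categorical accuracy, roughly as below.
from sklearn.metrics import silhouette_score


def silhouette_sketch(name, k_range, x):
    scores = []
    for k in k_range:
        labels = KMeans(n_clusters=k, random_state=0).fit_predict(x)
        scores.append(silhouette_score(x, labels))
    plot.plot(list(k_range), scores)
    plot.title('Silhouette scores ({})'.format(name))
    plot.xlabel('Number of clusters')
    plot.ylabel('Silhouette score')
    plot.show()


def cluster_accuracy_sketch(x, y, n_clusters):
    # Give every point the majority label of its cluster, then measure how
    # often that matches the true label (assumes integer class labels).
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(x)
    y = np.asarray(y).ravel().astype(int)
    correct = 0
    for cluster in range(n_clusters):
        members = y[labels == cluster]
        if len(members):
            correct += np.bincount(members).max()
    return correct / len(y)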

Example #4

from keras import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plot
import utils

data = x_train, x_test, y_train, y_test = utils.import_wine(
    y_transform='to_categorical')
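
# (assumption) y_transform='to_categorical' presumably one-hot encodes the
# target with keras.utils.to_categorical, which is what the softmax output
# layer and categorical_crossentropy loss below expect, e.g.:
#
#     from keras.utils import to_categorical
#     to_categorical([0, 2], num_classes=3)  # -> [[1., 0., 0.], [0., 0., 1.]]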

model = Sequential()

model.add(Dense(10, input_dim=len(x_train.keys()), activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(len(y_train[0]), activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
history = model.fit(x_train,
                    y_train,
                    validation_data=(x_test, y_test),
                    epochs=20,
                    batch_size=50)

utils.plot_ann_history(history)

plot.show()
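
# Sketch of the assumed utils.plot_ann_history: draw the loss and accuracy
# curves that Keras records in history.history.
def plot_ann_history_sketch(history):
    for key in ('loss', 'val_loss',
                'categorical_accuracy', 'val_categorical_accuracy'):
        if key in history.history:
            plot.plot(history.history[key], label=key)
    plot.xlabel('Epoch')
    plot.legend()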

y_predict = model.predict(x_test[:5])
for i in range(5):
    plot.bar([80, 84, 88, 92, 96, 100], [*y_test[i], 0], width=4, align='edge')
    plot.bar([80, 84, 88, 92, 96, 100], [*y_predict[i], 0],
             width=4, align='edge', alpha=0.5)
    plot.show()  # call completion and show reconstructed (assumption)

Example #5

import utils

data = x_train, x_test, y_train, y_test = utils.import_wine()

# WITHOUT PRUNING

utils.dt_pruning(*data)

# PRUNING MANUALLY

utils.dt_pruning(*data, 1000, 40)
utils.dt_pruning(*data, 10000, 70)
utils.dt_pruning(*data, 40000, 90)
utils.dt_pruning(*data, 45000, 95)
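
# Sketch of the assumed utils.dt_pruning: pre-prune a decision tree by capping
# its leaf count and depth, then report train/test accuracy.
from sklearn.tree import DecisionTreeClassifier


def dt_pruning_sketch(x_train, x_test, y_train, y_test,
                      max_leaf_nodes=None, max_depth=None):
    clf = DecisionTreeClassifier(random_state=0,
                                 max_leaf_nodes=max_leaf_nodes,
                                 max_depth=max_depth)
    clf.fit(x_train, y_train.values.ravel())
    print('train accuracy:', clf.score(x_train, y_train.values.ravel()))
    print('test accuracy:', clf.score(x_test, y_test.values.ravel()))
    return clf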

# PRUNING WITH 10-FOLD CROSS-VALIDATION

clf = utils.dt_crossval(*data,
                        n_leaf_range=range(41250, 41751, 100),
                        n_depth_range=range(100, 101))
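
# Sketch of the assumed utils.dt_crossval: grid-search the two pruning caps
# with 10-fold cross-validation and return the best estimator.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier


def dt_crossval_sketch(x_train, x_test, y_train, y_test,
                       n_leaf_range, n_depth_range):
    grid = GridSearchCV(DecisionTreeClassifier(random_state=0),
                        {'max_leaf_nodes': list(n_leaf_range),
                         'max_depth': list(n_depth_range)},
                        cv=10)
    grid.fit(x_train, y_train.values.ravel())
    print('best parameters:', grid.best_params_)
    print('test accuracy:', grid.score(x_test, y_test.values.ravel()))
    return grid.best_estimator_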