-
Notifications
You must be signed in to change notification settings - Fork 1
/
scipy-kmeans.py
109 lines (100 loc) · 4.13 KB
/
scipy-kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import numpy as np
import k_means_clustering
from sklearn.preprocessing import Normalizer
from db_interactor import DBInteractor
from preprocess import Preprocessor
from sklearn import mixture, metrics
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.datasets import make_blobs
from mpl_toolkits.mplot3d import Axes3D
from itertools import cycle
#import matplotlib.pyplot as plt
#fig = plt.figure(figsize=(10,10), dpi = 1000)
def fit_samples_gmm(samples, n_components):
    """Fit a diagonal-covariance Gaussian mixture model to ``samples``.

    Args:
        samples: Data passed straight to the estimator's ``fit`` method.
        n_components: Number of mixture components to fit.

    Returns:
        The fitted mixture estimator.
    """
    # ``mixture.GMM`` was deprecated in scikit-learn 0.18 and removed in 0.20.
    # Prefer its replacement and fall back to the legacy class only on
    # installs that predate ``GaussianMixture``.
    model_cls = getattr(mixture, 'GaussianMixture', None)
    if model_cls is None:
        model_cls = mixture.GMM

    gmix = model_cls(n_components=n_components, covariance_type='diag')
    gmix.fit(samples)
    return gmix
def fit_samples_kmeans(samples, n_features, n_clusters):
    """Cluster synthetic blob data with k-means and print the result.

    NOTE(review): ``samples`` is accepted but never clustered. The data that
    is fitted comes from ``make_blobs`` with a fixed 300 points. This looks
    like demo scaffolding; confirm whether the real samples should be used.

    Args:
        samples: Currently unused (see note above).
        n_features: Number of features of the generated blobs. Must be at
            least 2 because the second column is printed.
        n_clusters: Number of blob centers to generate and of k-means
            clusters to fit.

    Returns:
        The cluster label assigned by k-means to each generated point.
    """
    random_state = 150
    X, _ = make_blobs(n_samples=300, n_features=n_features,
                      centers=n_clusters, random_state=random_state)
    y_pred = KMeans(n_clusters=n_clusters,
                    random_state=random_state).fit_predict(X)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(X[:, 0])
    print(X[:, 1])
    print(len(y_pred))
    return y_pred
def fit_affinity_propagation(samples):
    """Fit affinity propagation to synthetic blobs and print quality metrics.

    Only the length of ``samples`` is used: one tenth of it (rounded down)
    sets how many synthetic points ``make_blobs`` generates. The estimator is
    fitted on those generated points, not on ``samples`` itself.

    Args:
        samples: Sized collection; ``len(samples) // 10`` points are generated.

    Returns:
        The fitted ``AffinityPropagation`` estimator.
    """
    # Floor division keeps the count an integer on Python 3 too; plain ``/``
    # would give a float there, which is not a valid sample count.
    n_samples = len(samples) // 10
    random_state = 0
    X, labels_true = make_blobs(n_samples=n_samples, cluster_std=0.5,
                                random_state=random_state)

    af = AffinityPropagation(preference=-50).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))

    # The silhouette coefficient is only defined for 2 .. n_samples - 1
    # clusters; scikit-learn raises ValueError outside that range (for
    # example when the fit finds fewer than two exemplars).
    if 2 <= n_clusters_ <= len(X) - 1:
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    else:
        print("Silhouette Coefficient: undefined for %d cluster(s)"
              % n_clusters_)

    return af
if __name__ == '__main__':
    # Print arrays in full instead of eliding the middle with "...".
    np.set_printoptions(threshold=np.inf)

    # Load the batting table and turn it into a float matrix.
    interactor = DBInteractor("season_batting")
    try:
        df = interactor.get_current_data_frame()
        # Matrix captured before the identifier columns are dropped.
        arr_with_ids = interactor.df_to_numpy_matrix()
        cols = ['playerID', 'yearID']
        df = interactor.drop_useless_stuff(cols)
        arr = interactor.df_to_numpy_matrix()
        arr = arr.astype(float)
    finally:
        # Always release the connection, even if a step above fails.
        interactor.disconnect()

    # Preprocess the data (per the original author's note, this step does
    # not do anything very useful yet).
    p = Preprocessor(arr, df)
    arr = p.preprocess(arr)
    sample = arr
    print(arr.shape[1])

    fit_samples_gmm(sample, 1)
    fit_samples_kmeans(sample, sample.shape[1], 10)

    # Shuffle the rows: draw every row index exactly once, in random order.
    rand_indices = np.random.choice(len(sample), size=len(sample),
                                    replace=False)
    rand_samples = sample[rand_indices]
    af = fit_affinity_propagation(samples=rand_samples)