forked from trevor-smith/FDA_OpenData
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fda_clustering.py
124 lines (105 loc) · 3.81 KB
/
fda_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
### IMPORTS ###
import json
import time
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.mlab import PCA as mlabPCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances
# Read the topic-model output from disk.
with open('/users/trevorsmith/Desktop/fda_topic_modeling/warning_labels_lda_15_dict.json') as data_file:
    data = json.load(data_file)

# `data` maps each key to a list of two-element entries. Turn every list into
# its own dictionary (entry[0] -> entry[1]) so pandas can spread the entries
# across columns.
# NOTE(review): presumably each entry is a (topic id, topic weight) pair
# produced by the LDA model -- verify against the code that wrote the file.
data_test = {
    key: {entry[0]: entry[1] for entry in value}
    for key, value in data.items()
}

# One row per key of `data`, one column per distinct entry[0] value.
df = pd.DataFrame.from_dict(data_test, orient='index')

# Quick look at the result. The single-argument call form prints the same
# text under Python 2 and Python 3.
print(df.head())

# Each column is expected to represent a topic.
# NOTE(review): the original comment said 25 columns, while the file name
# suggests a 15-topic model -- TODO confirm the actual column count.

# Entries missing from a row come through as NaN; treat them as 0.
df.fillna(0, inplace=True)
### CLUSTERING ###
# Convert to a NumPy array for faster processing. `.values` is available on
# every pandas version, whereas `as_matrix()` was removed in pandas 1.0.
all_samples = df.values

# Work with a subset first so the sweep below stays tractable.
small_sample = all_samples[:10000]

# Sweep over the number of clusters, recording the inertia and the silhouette
# score of each fit so the optimal number of clusters can be read off the
# curves plotted afterwards.
start_time = time.time()  # recorded at the start of the sweep
max_clusters = 50
inertia_curve = []
silhouette_curve = []

# One random RGB colour per potential cluster; `colors` is indexed by cluster
# number when the clusters are plotted later in the script.
colours = [np.random.rand(3,) for _ in range(max_clusters)]
colors = list(colours)

# The sweep starts at 2 clusters. Keeping the exact k values here makes the
# x axis of both curves line up with the k that produced each point.
cluster_range = list(range(2, max_clusters))

for cluster in cluster_range:
    num_clusters = cluster
    km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters)
    km.fit(small_sample)
    labels = km.labels_
    clusters = labels.tolist()

    inertia = km.inertia_
    inertia_curve.append(round(inertia, 4))
    silhouette_curve.append(
        metrics.silhouette_score(small_sample, labels, metric='euclidean'))

    # Report the score of the fit that just finished ([-1]), not the history
    # of every earlier one. Single-argument print calls behave the same on
    # Python 2 and Python 3.
    print("Silhouette score: %s" % silhouette_curve[-1])
    print("Number of clusters: %s" % cluster)
    print(Counter(clusters))

# Plot both curves against the number of clusters.
plt.plot(cluster_range, inertia_curve, label='Inertia Curve')
plt.legend()
plt.show()

plt.plot(cluster_range, silhouette_curve, label='Silhouette Curve')
plt.legend()
plt.show()
# Fit the final model with the chosen number of clusters, then visualise the
# clusters in PCA space.
num_clusters = 17
km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters)
km.fit(small_sample)
labels = km.labels_
clusters = labels.tolist()
inertia = km.inertia_

# NOTE(review): matplotlib.mlab.PCA was removed from newer matplotlib
# releases; this section needs a matplotlib version that still ships it --
# confirm the target environment.
mlab_pca = mlabPCA(small_sample)

# Append the cluster label as one extra trailing column of the PCA output
# (`mlab_pca.Y`) so rows can be selected per cluster. reshape(-1, 1) keeps
# this working for any number of samples, and the label column is always
# addressed as the last one rather than by a fixed index.
clusters_array = np.array(clusters)
clusters_array_2 = clusters_array.reshape(-1, 1)
d = np.concatenate((mlab_pca.Y, clusters_array_2), axis=1)
label_column = d[:, -1]

fig = plt.figure(figsize=(15, 5))

# Left panel: first three columns of the PCA output in 3D.
ax1 = fig.add_subplot(131, projection='3d')  # row-col-num
for num in range(num_clusters):
    members = d[label_column == num]
    ax1.plot(members[:, 0], members[:, 1], members[:, 2], 'o',
             markersize=7, color=colors[num], alpha=0.5)
ax1.set_title('PCA and k-means clustering, n=10,000 drugs')
ax1.set_xlim([-4, 4])
ax1.set_ylim([-4, 4])

# Middle panel: first two columns of the PCA output in 2D.
ax2 = fig.add_subplot(132)  # row-col-num
for num in range(num_clusters):
    members = d[label_column == num]
    ax2.plot(members[:, 0], members[:, 1], 'o',
             markersize=7, color=colors[num], alpha=0.5)
ax2.set_xlabel('x_values')
ax2.set_ylabel('y_values')
ax2.set_title('PCA and k-means clustering, n=10,000 drugs')
ax2.set_xlim([-4, 4])
ax2.set_ylim([-4, 4])

# Display the figure; without this nothing is shown when run as a script.
plt.show()