-
Notifications
You must be signed in to change notification settings - Fork 1
/
semiKMeans.py
207 lines (166 loc) · 7.54 KB
/
semiKMeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28
@author: Anil Sener
"""
import collections
import numpy as np
from scipy import linalg, mat, dot
from sklearn.externals import six
from sklearn.base import BaseEstimator, ClassifierMixin,ClusterMixin, TransformerMixin
from abc import ABCMeta
from sklearn.linear_model import LogisticRegression as LR
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
from sklearn.utils.extmath import row_norms, squared_norm
import scipy.sparse as sp
from sklearn.cluster import KMeans
from sklearn.cluster.k_means_ import _labels_inertia_precompute_dense
from sklearn.cluster import _k_means
class semiKMeans(six.with_metaclass(ABCMeta, BaseEstimator,ClusterMixin, TransformerMixin)):
def __init__(self,maxiter=100,fixedprec=1e-9,verbose=False):
self.maxiter=maxiter
self.verbose=verbose
self.fixedprec=fixedprec
self.labels=None
self.plattlr = None
self.cluster_centers_=None
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to.
In the vector quantization literature, `cluster_centers_` is called
the code book and each value returned by `predict` is the index of
the closest code in the code book.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
New data to predict.
Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
"""
#check_is_fitted(self, 'cluster_centers_')
X = self._check_test_data(X)
x_squared_norms = row_norms(X, squared=True)
return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
def _check_test_data(self, X):
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
warn_on_dtype=True)
n_samples, n_features = X.shape
expected_n_features = self.cluster_centers_.shape[1]
if not n_features == expected_n_features:
raise ValueError("Incorrect number of features. "
"Got %d features, expected %d" % (
n_features, expected_n_features))
return X
def fit_transform(self,texts,labels):
# Initialize clusters with labeled data
clust_names = [x for x, y in collections.Counter(labels).items() if y > 0]
clust_names = sorted(clust_names)[1:]
#print(clust_names)
#centroids = np.zeros((len(clust_names),len(texts[1,:])))
centroids = np.zeros((len(clust_names),texts.shape[1]))
for unique_name in clust_names:
indices = [i for i, x in enumerate(labels) if x == unique_name]
aux = texts[indices,:]
#print(np.mean(aux,axis=0).shape)
#print(centroids[clust_names.index(unique_name),:].shape)
centroids[clust_names.index(unique_name),:] = np.mean(aux,axis=0)
texts = mat(texts)
centroids = mat(centroids)
new_labels = labels
cnt = 0
# Main loop
while cnt<self.maxiter:
cnt +=1
if self.verbose:
print('Iter: '+str(cnt))
# Assign data to nearest centroid (cosine distance)
dist = dot(texts,centroids.T)/linalg.norm(texts)/linalg.norm(centroids)
n_lab = dist.argmax(axis=1)
for ii in range(len(n_lab)):
new_labels[ii] = clust_names[n_lab[ii,0]]
# print [y for x, y in collections.Counter(new_labels).items() if y > 0]
# Recalculate clusters
new_centroids = np.zeros((len(clust_names),len(texts.T)))
for unique_name in clust_names:
indices = [i for i, x in enumerate(new_labels) if x == unique_name]
if len(indices)>0:
aux = texts[indices,:]
new_centroids[clust_names.index(unique_name),:] = aux.mean(0)
else:
new_centroids[clust_names.index(unique_name),:] = centroids[clust_names.index(unique_name),:]
# Check exit condition
difference = np.power((centroids-new_centroids),2)
if difference.sum()<self.fixedprec:
break;
else:
self.labels = new_labels
centroids = new_centroids
self.cluster_centers_=new_centroids
self.plattlr = LR()
preds = self.predict(texts[labels!=-1,:])
self.plattlr.fit( preds.reshape( -1, 1 ), labels[labels!=-1])
return (labels)
def predict_proba(self, X):
"""Compute probabilities of possible outcomes for samples in X.
The model need to have probability information computed at training
time: fit with attribute `probability` set to True.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Returns
-------
T : array-like, shape = [n_samples, n_classes]
Returns the probability of the sample for each class in
the model. The columns correspond to the classes in sorted
order, as they appear in the attribute `classes_`.
"""
preds = self.predict(X)
#################Should be Added by me################
#########################
return self.plattlr.predict_proba(preds.reshape( -1, 1 ))
def _labels_inertia(X, x_squared_norms, centers,
precompute_distances=True, distances=None):
"""E step of the K-means EM algorithm.
Compute the labels and the inertia of the given samples and centers.
This will compute the distances in-place.
Parameters
----------
X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
The input samples to assign to the labels.
x_squared_norms: array, shape (n_samples,)
Precomputed squared euclidean norm of each data point, to speed up
computations.
centers: float64 array, shape (k, n_features)
The cluster centers.
precompute_distances : boolean, default: True
Precompute distances (faster but takes more memory).
distances: float64 array, shape (n_samples,)
Pre-allocated array to be filled in with each sample's distance
to the closest center.
Returns
-------
labels: int array of shape(n)
The resulting assignment
inertia : float
Sum of distances of samples to their closest cluster center.
"""
n_samples = X.shape[0]
# set the default value of centers to -1 to be able to detect any anomaly
# easily
labels = -np.ones(n_samples, np.int32)
if distances is None:
distances = np.zeros(shape=(0,), dtype=np.float64)
# distances will be changed in-place
if sp.issparse(X):
inertia = _k_means._assign_labels_csr(
X, x_squared_norms, centers, labels, distances=distances)
else:
if precompute_distances:
return _labels_inertia_precompute_dense(X, x_squared_norms,
centers, distances)
inertia = _k_means._assign_labels_array(
X, x_squared_norms, centers, labels, distances=distances)
return labels, inertia