'''
bicluster_newsgroups.py

Biclustering documents with the Spectral Co-clustering algorithm.

This example demonstrates the Spectral Co-clustering algorithm on the
twenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is
excluded because it contains many posts containing nothing but data.

The TF-IDF vectorized posts form a word frequency matrix, which is then
biclustered using Dhillon's Spectral Co-Clustering algorithm. The
resulting document-word biclusters indicate subsets of words that are
used more often in those subsets of documents.

For a few of the best biclusters, their most common document categories
and their ten most important words are printed. The best biclusters are
determined by their normalized cut. The best words are determined by
comparing their sums inside and outside the bicluster.

For comparison, the documents are also clustered using MiniBatchKMeans.
The document clusters derived from the biclusters achieve a better
V-measure than clusters found by MiniBatchKMeans.
'''
from collections import defaultdict
import operator
import re
import sys
from time import time

import numpy as np

from sklearn.cluster import MiniBatchKMeans, SpectralCoclustering
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import v_measure_score
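
# Orientation for the co-clustering API used below: after fitting,
# SpectralCoclustering exposes `row_labels_` (one bicluster id per document),
# `column_labels_` (one bicluster id per word), and `get_indices(i)`, which
# returns the row and column indices belonging to bicluster i.
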
def number_aware_tokenizer(doc):
    '''
    Tokenizer that maps all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not
    directly useful, but the fact that such a token exists can be
    relevant. By applying this form of dimensionality reduction,
    some methods may perform better.
    '''
    token_pattern = re.compile(r'(?u)\b\w\w+\b')
    tokens = token_pattern.findall(doc)
    tokens = ['#NUMBER' if token[0] in '0123456789_' else token
              for token in tokens]
    return tokens
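
# Illustrative behaviour on a hypothetical input (not from the dataset):
#   number_aware_tokenizer('released in 1993') -> ['released', 'in', '#NUMBER']
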
# Exclude 'comp.os.ms-windows.misc'.
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    tokenizer=number_aware_tokenizer,
)
cocluster = SpectralCoclustering(
    n_clusters=len(categories),
    svd_method='arpack',
    random_state=0,
)
kmeans = MiniBatchKMeans(
    n_clusters=len(categories),
    batch_size=20000,
    random_state=0,
)
print('Vectorizing ...')
X = vectorizer.fit_transform(newsgroups.data)

print('Coclustering ...')
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print('Done in {:.2f}s. V-measure: {:.4f}'.format(
    time() - start_time,
    v_measure_score(y_true, y_cocluster)))

print('MiniBatchKMeans ...')
start_time = time()
y_kmeans = kmeans.fit_predict(X)
print('Done in {:.2f}s. V-measure: {:.4f}'.format(
    time() - start_time,
    v_measure_score(y_true, y_kmeans)))

feature_names = vectorizer.get_feature_names_out()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)
def bicluster_ncut(i):
    rows, cols = cocluster.get_indices(i)
    if not (len(rows) and len(cols)):
        # An empty bicluster cannot be cut; rank it last.
        return sys.float_info.max
    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
    weight = X[rows[:, np.newaxis], cols].sum()
    cut = (X[row_complement[:, np.newaxis], cols].sum() +
           X[rows[:, np.newaxis], col_complement].sum())
    return cut / weight
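
# The quantity computed above is the normalized cut of bicluster i:
#   ncut(i) = cut(rows_i, cols_i) / weight(rows_i, cols_i)
# where `weight` sums the TF-IDF mass inside the bicluster and `cut` sums
# the mass connecting the bicluster's rows and columns to everything outside
# it. Lower values indicate tighter, better-separated biclusters.
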
def most_common(d):
    '''
    Items of a defaultdict(int) with the highest values,
    like collections.Counter.most_common.
    '''
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
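
# Illustrative behaviour on hypothetical counts:
#   most_common({'sci.med': 3, 'sci.space': 1})
#   -> [('sci.med', 3), ('sci.space', 1)]
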
bicluster_ncuts = list(bicluster_ncut(i)
                       for i in range(len(newsgroups.target_names)))
best_idx = np.argsort(bicluster_ncuts)[:5]
print()
print('Best biclusters:')
print('----------------')
for idx, cluster in enumerate(best_idx):
    n_rows, n_cols = cocluster.get_shape(cluster)
    cluster_docs, cluster_words = cocluster.get_indices(cluster)
    if not len(cluster_docs) or not len(cluster_words):
        continue

    # categories
    counter = defaultdict(int)
    for i in cluster_docs:
        counter[document_names[i]] += 1
    cat_string = ', '.join('{:.0f}% {}'.format(float(c) / n_rows * 100, name)
                           for name, c in most_common(counter)[:3])

    # words
    out_of_cluster_docs = cocluster.row_labels_ != cluster
    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
    word_col = X[:, cluster_words]
    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
                           word_col[out_of_cluster_docs, :].sum(axis=0))
    word_scores = word_scores.ravel()
    important_words = list(feature_names[cluster_words[i]]
                           for i in word_scores.argsort()[:-11:-1])

    print('bicluster {} : {} documents, {} words'.format(
        idx, n_rows, n_cols))
    print('categories : {}'.format(cat_string))
    print('words : {}\n'.format(', '.join(important_words)))
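
# Each reported bicluster takes the form (actual values depend on the run):
#   bicluster <idx> : <n_rows> documents, <n_cols> words
#   categories : <top three categories with their percentage shares>
#   words : <ten most discriminative terms>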