'''
bicluster_newsgroups.py

Biclustering documents with the Spectral Co-clustering algorithm.

This example demonstrates the Spectral Co-clustering algorithm on the
twenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is
excluded because it contains many posts containing nothing but data.

The TF-IDF vectorized posts form a word frequency matrix, which is then
biclustered using Dhillon's Spectral Co-Clustering algorithm. The
resulting document-word biclusters indicate subsets of words that are
used more often in those subsets of documents.

For a few of the best biclusters, their most common document categories
and their ten most important words are printed. The best biclusters are
determined by their normalized cut. The best words are determined by
comparing their sums inside and outside the bicluster.

For comparison, the documents are also clustered using MiniBatchKMeans.
The document clusters derived from the biclusters achieve a better
V-measure than clusters found by MiniBatchKMeans.
'''
from collections import defaultdict
import operator
import re
import sys
from time import time

import numpy as np

from sklearn.cluster import MiniBatchKMeans, SpectralCoclustering
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import v_measure_score
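
# Orientation for the co-clustering API used below: after fitting,
# SpectralCoclustering exposes `row_labels_` (one bicluster id per document),
# `column_labels_` (one bicluster id per word), and `get_indices(i)`, which
# returns the row and column indices belonging to bicluster i.
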
def number_aware_tokenizer(doc):
    '''
    Tokenizer that maps all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not
    directly useful, but the fact that such a token exists can be
    relevant. By applying this form of dimensionality reduction,
    some methods may perform better.
    '''
    token_pattern = re.compile(r'(?u)\b\w\w+\b')
    tokens = token_pattern.findall(doc)
    tokens = ['#NUMBER' if token[0] in '0123456789_' else token
              for token in tokens]
    return tokens
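
# Illustrative behaviour on a hypothetical input (not from the dataset):
#   number_aware_tokenizer('released in 1993') -> ['released', 'in', '#NUMBER']
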
# Exclude 'comp.os.ms-windows.misc'.
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    tokenizer=number_aware_tokenizer,
)
cocluster = SpectralCoclustering(
    n_clusters=len(categories),
    svd_method='arpack',
    random_state=0,
)
kmeans = MiniBatchKMeans(
    n_clusters=len(categories),
    batch_size=20000,
    random_state=0,
)
print('Vectorizing ...')
X = vectorizer.fit_transform(newsgroups.data)

print('Coclustering ...')
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print('Done in {:.2f}s. V-measure: {:.4f}'.format(
    time() - start_time,
    v_measure_score(y_true, y_cocluster)))

print('MiniBatchKMeans ...')
start_time = time()
y_kmeans = kmeans.fit_predict(X)
print('Done in {:.2f}s. V-measure: {:.4f}'.format(
    time() - start_time,
    v_measure_score(y_true, y_kmeans)))

feature_names = vectorizer.get_feature_names_out()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)
def bicluster_ncut(i):
    rows, cols = cocluster.get_indices(i)
    if not (len(rows) and len(cols)):
        # An empty bicluster cannot be cut; rank it last.
        return sys.float_info.max
    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
    weight = X[rows[:, np.newaxis], cols].sum()
    cut = (X[row_complement[:, np.newaxis], cols].sum() +
           X[rows[:, np.newaxis], col_complement].sum())
    return cut / weight
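
# The quantity computed above is the normalized cut of bicluster i:
#   ncut(i) = cut(rows_i, cols_i) / weight(rows_i, cols_i)
# where `weight` sums the TF-IDF mass inside the bicluster and `cut` sums
# the mass connecting the bicluster's rows and columns to everything outside
# it. Lower values indicate tighter, better-separated biclusters.
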
def most_common(d):
    '''
    Items of a defaultdict(int) with the highest values,
    like collections.Counter.most_common.
    '''
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
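
# Illustrative behaviour on hypothetical counts:
#   most_common({'sci.med': 3, 'sci.space': 1})
#   -> [('sci.med', 3), ('sci.space', 1)]
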
bicluster_ncuts = list(bicluster_ncut(i)
                       for i in range(len(newsgroups.target_names)))
best_idx = np.argsort(bicluster_ncuts)[:5]
print()
print('Best biclusters:')
print('----------------')
for idx, cluster in enumerate(best_idx):
    n_rows, n_cols = cocluster.get_shape(cluster)
    cluster_docs, cluster_words = cocluster.get_indices(cluster)
    if not len(cluster_docs) or not len(cluster_words):
        continue

    # categories
    counter = defaultdict(int)
    for i in cluster_docs:
        counter[document_names[i]] += 1
    cat_string = ', '.join('{:.0f}% {}'.format(float(c) / n_rows * 100, name)
                           for name, c in most_common(counter)[:3])

    # words
    out_of_cluster_docs = cocluster.row_labels_ != cluster
    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
    word_col = X[:, cluster_words]
    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
                           word_col[out_of_cluster_docs, :].sum(axis=0))
    word_scores = word_scores.ravel()
    important_words = list(feature_names[cluster_words[i]]
                           for i in word_scores.argsort()[:-11:-1])

    print('bicluster {} : {} documents, {} words'.format(
        idx, n_rows, n_cols))
    print('categories : {}'.format(cat_string))
    print('words : {}\n'.format(', '.join(important_words)))
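
# Each reported bicluster takes the form (actual values depend on the run):
#   bicluster <idx> : <n_rows> documents, <n_cols> words
#   categories : <top three categories with their percentage shares>
#   words : <ten most discriminative terms>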