def cluster_centroids(x, k=32, max_iter=300, km_kwargs=None):
    '''Return centroids ordered by their L2 norm.'''
    if km_kwargs is None:
        km_kwargs = {}
    km = KMeans(k, init='k-means++', max_iter=max_iter, **km_kwargs)
    trained = km.fit(x)
    centroids = trained.cluster_centers_
    # sort the centroids by the norm of each centroid vector
    ind = np.argsort(np.linalg.norm(centroids, axis=1))
    return centroids[ind]
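# Hedged usage sketch (not part of the original snippet): cluster 500 random
# 2-D points into 4 groups and print the norm-ordered centroids. Assumes numpy
# and the KMeans import used by cluster_centroids are already in scope.
example_points = np.random.randn(500, 2)
print cluster_centroids(example_points, k=4)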
def cluster_objects(objects, optimize_within_clusters=False,
                    round_trip=False, initial=None):
    """
    Return a list of objects clustered by geographical position.

    :param objects: The list of objects or a queryset. The objects must be an
        instance of PointGeoTag or implement
        `get_point_coordinates(self, as_string=False, inverted=False)`
        to obtain the coordinates.
    :param optimize_within_clusters: a boolean specifying whether the clusters
        should be ordered based on the (near-)optimal route.
    :returns: A list of clusters. Example: [[<p1>, <p2>], [<p3>, <p4>, <p5>]]
    """
    X = np.array([list(i.get_point_coordinates(as_string=False, inverted=True))
                  for i in objects
                  if i.get_point_coordinates(as_string=False, inverted=True)])

    # Affinity propagation.
    # This way we could determine the number of clusters automatically:
    # X_norms = np.sum(X*X, axis=1)
    # S = - X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
    # p = 10 * np.median(S)
    # af = AffinityPropagation()
    # af.fit(S, p)
    # n_clusters_ = len(af.cluster_centers_indices_)

    n_items = len(X)
    max_items = getattr(settings, 'ITEMS_PER_BUCKET', 10) - 1
    # integer division; keep at least one cluster so KMeans does not fail on
    # small inputs
    n_clusters = max(1, n_items / max_items)
    #n_clusters += n_items % max_items == 0 and 0 or 1

    # KMeans.
    # If we want a pre-specified number of clusters this is the way to go.
    km = KMeans(k=n_clusters, init='k-means++')
    km.fit(X)

    cluster_dict = defaultdict(list)
    for i, cluster_id in enumerate(km.labels_):
        cluster_dict[cluster_id].append(objects[i])
    clusters = cluster_dict.values()

    if optimize_within_clusters:
        if initial:
            result = []
            for cluster in clusters:
                if initial in cluster:
                    cluster.remove(initial)
                    cluster.insert(0, initial)
                    result.insert(0, google_TSP(cluster, round_trip=round_trip))
                else:
                    result.append(google_TSP(cluster, round_trip=round_trip))
            return result
        else:
            return [google_TSP(cluster, round_trip=round_trip)
                    for cluster in clusters]
    return clusters
def __init__(self):
    self.output_path = OUTPUT_PATH
    self._processor = None
    self._usernames = None
    self._rankings = None
    self._default_processor = lambda: TextProcessor(
        store_docs=True, clusters={"kmeans": lambda: KMeans(5)})
def text_profiles_similarity(self):
    """Compute and return similarity scores between profiles, based on text
    features and KMeans clustering.
    """
    # Text (TF-IDF)
    processor = TextProcessor(store_docs=True,
                              clusters={'kmeans': lambda: KMeans(5)})
    processor.run()

    # the documents for each profile will be collected here
    docs = []
    for username, cluster in processor.clusters["kmeans"].items():
        # for each cluster, build up a new dataset; we will then use it to
        # compare the profiles
        for label in np.unique(cluster.labels_):
            # get only the documents with this label
            docs.append(" ".join([
                processor.stored_docs[username][i]
                for i, val in enumerate(cluster.labels_ == label) if val
            ]))

    features = processor.get_features(docs)
    self._processor = processor
    return euclidean_distances(features, features)
def k_clusters(k, infinitives):
    data, _ = extract_features(infinitives, 3, False)
    kmeans = KMeans(k=k).fit(data)
    print kmeans.inertia_
    # find the infinitive closest to each centroid with a 1-nearest-neighbour
    # lookup on the training data
    nn = NeighborsClassifier(1).fit(data, np.zeros(data.shape[0]))
    _, idx = nn.kneighbors(kmeans.cluster_centers_)
    for inf in infinitives[idx.flatten()]:
        print inf
def _kmeans(*ks):
    """Utility function returning KMeans instances with predefined numbers of
    clusters. The passed values are the K values of the clusterers to return.
    """
    if not ks:
        ks = [5, 10, 20, 50]
    instances = []
    for k in ks:
        instances.append(KMeans(k))
    return instances
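# Hedged usage sketch (not part of the original module): fit the KMeans
# instances for a few candidate K values on the same random data and compare
# their inertia. Assumes numpy is imported as np alongside the KMeans import.
X = np.random.randn(200, 4)
for km_instance in _kmeans(3, 5, 8):
    km_instance.fit(X)
    print km_instance.inertia_  # inertia shrinks as K grows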
np.random.seed(0)

n_points_per_cluster = 250
n_clusters = 3
n_points = n_points_per_cluster * n_clusters
means = np.array([[1, 1], [-1, -1], [1, -1]])
std = .6
clustMed = []

X = np.empty((0, 2))
for i in range(n_clusters):
    X = np.r_[X, means[i] + std * np.random.randn(n_points_per_cluster, 2)]

################################################################################
# Compute clustering with KMeans
km = KMeans(init='k-means++', k=3, n_init=1)
km.fit(X)
labels = km.labels_
cluster_centers = km.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print "number of estimated clusters : %d" % n_clusters_

################################################################################
# Plot result
import pylab as pl
from itertools import cycle
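# Hedged continuation (assumed, the plotting code is not shown in the original
# snippet): plot each cluster and its centre with pylab, cycling through a few
# colours.
colors = cycle('bgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    members = labels == k
    center = cluster_centers[k]
    pl.plot(X[members, 0], X[members, 1], col + '.')
    pl.plot(center[0], center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=14)
pl.title('Estimated number of clusters: %d' % n_clusters_)
pl.show()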
def get_profiles_similarity(usernames, N):
    """
    Return a matrix of similarity between the users.

    :usernames: The list of usernames in the system
    :N: the number of profiles to find for each user
    """
    # all the documents per profile will be stored in this variable
    doc_profiles = []
    # all the urls for each profile will be put in this array
    urls = []

    # For each user, get his views
    for username in usernames:
        print "processing %s" % username
        # don't use generators as we want to access the views multiple times,
        # so we actually need to store them in memory
        views = list(db.views.find({
            "user.username": username,
            'url': {'$nin': list(db.resources.find(
                {'blacklisted': True}).distinct('url'))}
        }))
        features = get_views_features(views)

        # Run a clustering algorithm on the views
        np_features = np.array(features)
        #bandwidth = estimate_bandwidth(np_features, quantile=0.3)
        #algo = MeanShift(bandwidth=bandwidth).fit(np_features)
        # The distribution from the KMeans algorithm is better because we get
        # more balanced clusters. MeanShift comes up with a lot of clusters
        # containing less than 2 elements.
        with mesure("clustering the context to find %s profiles" % N, indent=1):
            algo = KMeans(N).fit(np_features)

        # for each cluster, get the matching views;
        # this means iterating N times (where N is the number of clusters found)
        for label in np.unique(algo.labels_):
            profile_urls = []
            for i, matches in enumerate(algo.labels_ == label):
                view = views[i]
                if matches and view['url'] not in profile_urls:
                    profile_urls.append(view['url'])

            # save the urls of this profile for later use
            urls.append(profile_urls)

            # get the resources for those urls
            resources = db.resources.find({
                'url': {'$in': profile_urls},
                'blacklisted': False,
                'processed': True})

            # Append the contents for this profile together
            doc_profiles.append(" ".join([r['content'] for r in resources]))

    # train the vectorizer on a big and sparse set of documents;
    # the vectorizer is loaded from disk to avoid recomputing it each time
    with open(os.path.join(OUTPUT_PATH, "pickle", "vecnewsgroup.pickle")) as f:
        vec = pickle.load(f)

    # Same for the principal component analysis (PCA)
    with open(os.path.join(OUTPUT_PATH, "pickle", "pca100-newsgroup.pickle")) as f:
        pca = pickle.load(f)

    # At this stage, all the documents are stored in memory, sometimes more
    # than once per resource. We want to vectorize them all, so this can take
    # some time.
    with mesure("vectorizing %s profiles" % len(doc_profiles)):
        vec_profiles = pca.transform(vec.transform(doc_profiles))

    # Compute their similarity score
    return euclidean_distances(vec_profiles, vec_profiles), urls
print

################################################################################
# Digits dataset clustering using Self-Organizing Map
print "Self-Organizing Map "
t0 = time()
grid_width = 4
som = SelfOrganizingMap(size=grid_width, n_iterations=n_samples * 5,
                        learning_rate=1)
som.fit(data)
print "done in %0.3fs" % (time() - t0)
print

F = pseudo_F(data, som.labels_, som.neurons_)
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
print

################################################################################
# Digits dataset clustering using KMeans
print "KMeans "
t0 = time()
km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
km.fit(data)
print "done in %0.3fs" % (time() - t0)
print

F = pseudo_F(data, km.labels_, km.cluster_centers_)
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
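# The pseudo_F helper is not shown in this excerpt. Below is a hedged sketch of
# a pseudo-F (Calinski-Harabasz-style) statistic with the same call signature,
# assuming `centers` is an (n_clusters, n_features) array and `labels` indexes
# rows of `centers`; the original helper may differ.
def pseudo_F_sketch(data, labels, centers):
    n, k = data.shape[0], centers.shape[0]
    overall_mean = data.mean(axis=0)
    # between-cluster and within-cluster sums of squares
    ssb = sum(np.sum(labels == i) * np.sum((centers[i] - overall_mean) ** 2)
              for i in range(k))
    ssw = sum(np.sum((data[labels == i] - centers[i]) ** 2) for i in range(k))
    return (ssb / (k - 1)) / (ssw / (n - k))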
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import division

import numpy as np
import pickle
from scikits.learn.cluster import KMeans
import logging
import time

logging.basicConfig(level=logging.DEBUG)

LEARN_SIZE = 100
K = 8
ITER = 10

moto, plane = [pickle.load(open(name, 'rb')) for name in ['moto', 'plane']]
logging.info('Data loaded')

m = np.vstack([v for f, v in moto.items()[0:LEARN_SIZE]])
p = np.vstack([v for f, v in plane.items()[0:LEARN_SIZE]])
all = np.vstack([m, p])

km = KMeans(k=K, max_iter=ITER)
km.fit(all)

filename = 'centroids_%d_%d_%d' % (LEARN_SIZE, K, ITER)
pickle.dump(km.cluster_centers_, open(filename, 'wb'))
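# Hedged follow-up sketch (not part of the original script): reload the saved
# centroids and assign new feature vectors to their nearest centroid, the
# usual next step in a bag-of-visual-words pipeline. The slice of `moto` used
# here is only an illustration.
centroids = pickle.load(open(filename, 'rb'))
new_vectors = np.vstack([v for f, v in moto.items()[LEARN_SIZE:LEARN_SIZE + 10]])
# squared Euclidean distance from every vector to every centroid
distances = ((new_vectors[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
assignments = distances.argmin(axis=1)
logging.info('Assigned %d vectors to %d centroids', len(assignments), K)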
def cluster_users(features=None):
    """Cluster the users, without using information about profiles.

    Different features can be used to do so, at least text features and
    context features.
    """
    training_set = "newsgroup"
    docs = None
    vec_filename = os.path.join(OUTPUT_PATH,
                                "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH,
                                "pickle/pca-%s.pickle" % training_set)

    # get the training set, transform it to N dimensions
    with mesure(" loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            # equivalent to CountVectorizer + TfIdf
            vec = Vectorizer().fit(docs)
            _save_obj(vec, vec_filename)

    with mesure(" loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 100 components"
            # whiten=True ensures that the variance of each dimension of the
            # data in the transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100,
                                whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # for each user, get the contents related to him
    users_content = []
    users_labels = []
    for username in list(db.users.find().distinct('username')):
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!

        resources = list(db.resources.find({
            'url': {'$in': urls},
            'blacklisted': False,
            'processed': True}))
        if not resources:
            continue

        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        users_labels.append(username)
        users_content.append(" ".join([res['content'] for res in resources]))

    with mesure(" vectorise and reduce the dataset dimensions to 100"):
        transformed_content = pca.transform(vec.transform(users_content))

    # at the end, compute the similarity between users using different metrics
    # kmeans, 3 clusters
    cluster = KMeans(3).fit(transformed_content)
    plot_pie(cluster, "all", "kmeans", "text")
    plot_2d(cluster, transformed_content, "all", "kmeans", "text")
    user_list = [[users_labels[idx]
                  for idx, flag in enumerate(cluster.labels_ == cluster_id)
                  if flag]
                 for cluster_id in np.unique(cluster.labels_)]

    # compute similarity scores
    from ipdb import set_trace
    set_trace()
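# Hedged sketch (assumed, not in the original function): the similarity scores
# hinted at above could be computed as pairwise Euclidean distances between
# the users' reduced text vectors, mirroring get_profiles_similarity.
def users_similarity_sketch(transformed_content):
    return euclidean_distances(transformed_content, transformed_content)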
def find_profiles_context(algo=None, user=None):
    """Find profiles based on:

    * location of the views
    * time of the day of the views
    * time of the day
    * day of the week
    """
    if not algo:
        algo = "all"

    # get all users
    for username in db.users.distinct("username"):
        if user and user != username:
            continue

        urls = db.views.find({"user.username": username}).distinct("url")
        resources = []
        if not urls:
            continue

        print "processing %s (%s docs)" % (username, len(urls))

        t0 = time.time()
        progress = ProgressBar(widgets=[
            " building the matrix for %s" % username, Percentage(), Bar()])

        for url in progress(urls):
            # get the views related to this user and this url
            views = list(db.views.find({"user.username": username, "url": url}))

            indicators = ['average', 'mean', 'median', 'var', 'std']
            row = [len(views), sum([int(v['duration']) for v in views])]
            # TODO add location

            daytimes = []
            weekdays = []
            for view in views:
                daytimes.append(view['daytime'])
                weekdays.append(view['weekday'])

            for indicator in indicators:
                row.append(getattr(np, indicator)(daytimes))
                row.append(getattr(np, indicator)(weekdays))

            resources.append(row)

        resources = np.array(resources)
        print "matrix generation took %s" % (time.time() - t0)

        # project X on 2D
        # print " project the dataset into 2d"
        # pca_2d = RandomizedPCA(n_components=2, whiten=True).fit(resources)
        # docs_2d = pca_2d.transform(resources)

        # run the clustering algorithms
        if algo in ["kmeans", "all"]:
            with mesure(" kmeans(5)"):
                cluster = KMeans(k=5).fit(resources)
            plot_2d(cluster, resources, username, "kmeans", "Context")
            plot_pie(cluster, username, "kmeans", "Context")

        if algo in ["meanshift", "all"]:
            with mesure(" meanshift"):
                cluster = MeanShift().fit(resources)
            plot_2d(cluster, resources, username, "meanshift", "Context")
            plot_pie(cluster, username, "meanshift", "Context")

        if algo in ["affinity", "all"]:
            with mesure(" affinity propagation"):
                cluster = AffinityPropagation().fit(
                    euclidean_distances(resources, resources))
            # plot_2d(cluster, resources, username, "affinity", "Context")
            plot_pie(cluster, username, "affinity", "Context")
np.random.seed(0)

n_points_per_cluster = 250
n_clusters = 3
n_points = n_points_per_cluster * n_clusters
means = np.array([[1, 1], [-1, -1], [1, -1]])
std = .6
clustMed = []

X = np.empty((0, 2))
for i in range(n_clusters):
    X = np.r_[X, means[i] + std * np.random.randn(n_points_per_cluster, 2)]

################################################################################
# Compute clustering with KMeans
km = KMeans(init='k-means++', k=3, n_init=1)
km.fit(X)
labels = km.labels_
cluster_centers = km.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print "number of estimated clusters : %d" % n_clusters_

################################################################################
# Plot result
import pylab as pl
from itertools import cycle
from scikits.learn.metrics.pairwise import euclidean_distances
from scikits.learn.datasets.samples_generator import make_blobs

##############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1200, centers=centers, cluster_std=0.7)

##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans
mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
def cluster_objects(objects, optimize_within_clusters=False,
                    round_trip=False, initial=None):
    """
    Return a list of objects clustered by geographical position.

    :param objects: The list of objects or a queryset. The objects must be an
        instance of PointGeoTag or implement
        `get_point_coordinates(self, as_string=False, inverted=False)`
        to obtain the coordinates.
    :param optimize_within_clusters: a boolean specifying whether the clusters
        should be ordered based on the (near-)optimal route.
    :returns: A list of clusters. Example: [[<p1>, <p2>], [<p3>, <p4>, <p5>]]
    """
    X = np.array([list(i.get_point_coordinates(as_string=False, inverted=True))
                  for i in objects
                  if i.get_point_coordinates(as_string=False, inverted=True)])

    # Affinity propagation.
    # This way we could determine the number of clusters automatically:
    # X_norms = np.sum(X*X, axis=1)
    # S = - X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
    # p = 10 * np.median(S)
    # af = AffinityPropagation()
    # af.fit(S, p)
    # n_clusters_ = len(af.cluster_centers_indices_)

    n_items = len(X)
    max_items = getattr(settings, 'ITEMS_PER_BUCKET', 10) - 1
    # integer division; keep at least one cluster so KMeans does not fail on
    # small inputs
    n_clusters = max(1, n_items / max_items)
    #n_clusters += n_items % max_items == 0 and 0 or 1

    # KMeans.
    # If we want a pre-specified number of clusters this is the way to go.
    km = KMeans(k=n_clusters, init='k-means++')
    km.fit(X)

    cluster_dict = defaultdict(list)
    for i, cluster_id in enumerate(km.labels_):
        cluster_dict[cluster_id].append(objects[i])
    clusters = cluster_dict.values()

    if optimize_within_clusters:
        if initial:
            result = []
            for cluster in clusters:
                if initial in cluster:
                    cluster.remove(initial)
                    cluster.insert(0, initial)
                    result.insert(0, google_TSP(cluster, round_trip=round_trip))
                else:
                    result.append(google_TSP(cluster, round_trip=round_trip))
            return result
        else:
            return [google_TSP(cluster, round_trip=round_trip)
                    for cluster in clusters]
    return clusters
from scikits.learn.metrics.pairwise import euclidean_distances
from scikits.learn.datasets.samples_generator import make_blobs

##############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1200, centers=centers, cluster_std=0.7)

##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans
mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
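# Hedged continuation (assumed, not part of the original snippet): finish the
# comparison by collecting the MiniBatchKMeans centres and reporting both
# timings side by side.
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)
print "KMeans fit in %.3fs, MiniBatchKMeans fit in %.3fs" % (t_batch, t_mini_batch)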
np.random.seed(42)

digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))

print "n_digits: %d" % n_digits
print "n_features: %d" % n_features
print "n_samples: %d" % n_samples
print

print "Raw k-means with k-means++ init..."
t0 = time()
km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Raw k-means with random centroid init..."
t0 = time()
km = KMeans(init='random', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Raw k-means with PCA-based centroid init..."
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
t0 = time()
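# Hedged continuation (assumed, the rest of the script is not shown in this
# excerpt): seed the centroids with the first principal components, as the
# comment above announces. The PCA import path is an assumption; it varies
# across early scikits.learn releases.
from scikits.learn.decomposition import PCA  # assumed import path
pca = PCA(n_components=n_digits).fit(data)
km = KMeans(init=pca.components_, k=n_digits, n_init=1).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_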
def find_profiles_text(algo=None, training_set=None, user=None):
    """Find different user profiles using the TF-IDF metric (Term Frequency /
    Inverse Document Frequency).

    The stages of the pipeline are:
    1. Vectorizer => 2. RandomizedPCA => 3. KMeans

    The randomized PCA is useful here to reduce the dimensionality of the
    vector space. As we lack some data, the dimensionality reduction is made
    using an already existing dataset, the 20 newsgroups dataset.

    :param algo: the algorithm to choose. Can be kmeans, meanshift or both
        (specified by "all")
    :param training_set: the training set to use for the word vectorisation.
        The default is the 20 newsgroups dataset; it is possible to use the
        documents themselves by specifying "docs"
    """
    # init some vars
    if not algo:
        algo = "all"
    if not training_set:
        training_set = "newsgroup"

    print "Computing clusters using the TF-IDF scores,"\
          " using %s algo and the %s training dataset" % (algo, training_set)

    # We first train the PCA with the whole dataset to have the most
    # representative model. Download the dataset and train the PCA and the
    # vectorizer only if a pickled version is not available (i.e. only during
    # the first run).
    wide_dataset = docs = None

    vec_filename = os.path.join(OUTPUT_PATH,
                                "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH,
                                "pickle/pca-%s.pickle" % training_set)
    pca2d_filename = os.path.join(OUTPUT_PATH,
                                  "pickle/pca2d-%s.pickle" % training_set)

    with mesure(" loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            # equivalent to CountVectorizer + TfIdf
            vec = Vectorizer().fit(docs)
            _save_obj(vec, vec_filename)

    with mesure(" loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 100 components"
            # whiten=True ensures that the variance of each dimension of the
            # data in the transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100,
                                whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # To visualize the data, we will project it on 2 dimensions. To do so, we
    # will use a Principal Component Analysis (as in the first steps), but
    # projecting on 2 dimensions.
    with mesure(" loading PCA 2D"):
        if os.path.isfile(pca2d_filename):
            pca_2d = _load_obj(pca2d_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 2 components"
            pca_2d = RandomizedPCA(n_components=2,
                                   whiten=True).fit(vec.transform(docs))
            _save_obj(pca_2d, pca2d_filename)

    # Now, go through all the resources for each user and try to find user
    # profiles with respect to TF-IDF.
    # As the process can take some time, there is a progress bar to keep the
    # user updated about the status of the operation.
    for username in list(db.users.find().distinct('username')):
        if user and user != username:
            continue

        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!
        resources = list(db.resources.find({
            'url': {'$in': urls},
            'blacklisted': False,
            'processed': True}))
        if not resources:
            continue

        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        docs = [res['content'] for res in resources]
        urls = [res['url'] for res in resources]

        # fit the contents to the new set of features the PCA determined
        with mesure(" reduce dataset dimensions to 100"):
            docs_transformed = pca.transform(vec.transform(docs))

        # What we have now is a matrix with 100 dimensions, which is not really
        # useful for representation. Keeping it for later analysis is a good
        # thing, so let's save this model for comparing profiles against
        # resources later.
        # TODO pickle the kmeans into mongodb?

        # project X onto 2D
        with mesure(" reduce dataset dimensions to 2"):
            docs_2d = pca_2d.transform(vec.transform(docs))

        # run the clustering algorithms
        if algo in ["kmeans", "all"]:
            with mesure(" kmeans(5)"):
                cluster = KMeans(k=5).fit(docs_transformed)
            # get_words_from_clusters(cluster, 10, docs, vec)
            # print "ngrams for km on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 5)
            plot_2d(cluster, docs_2d, username, "kmeans",
                    "Text-%s" % training_set)
            plot_pie(cluster, username, "kmeans", "Text-%s" % training_set)

        if algo in ["meanshift", "all"]:
            with mesure(" meanshift"):
                cluster = MeanShift().fit(docs_transformed)
            # print "ngrams for ms on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 3)
            plot_2d(cluster, docs_2d, username, "meanshift",
                    "Text-%s" % training_set)
            plot_pie(cluster, username, "meanshift", "Text-%s" % training_set)

        if algo in ["affinity", "all"]:
            with mesure(" affinity propagation"):
                cluster = AffinityPropagation().fit(
                    euclidean_distances(docs_transformed, docs_transformed))
            plot_pie(cluster, username, "affinity", "Text-%s" % training_set)