Example #1
def test_pickle():
    for obj in (CountVectorizer(), SparseCountVectorizer(),
                TfidfTransformer(), SparseTfidfTransformer(),
                Vectorizer(), SparseVectorizer()):

        s = pickle.dumps(obj)
        assert_equal(type(pickle.loads(s)), obj.__class__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)
Example #3
def _load_vec(self, *args):
    # equivalent to CountVectorizer + TfIdf
    return Vectorizer().fit(self.docs)
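
The comment above states that the old Vectorizer is equivalent to CountVectorizer followed by TfidfTransformer. A minimal sketch of that equivalence, written against the current scikit-learn API where TfidfVectorizer plays the role of Vectorizer; the toy corpus below is made up purely for illustration.

from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                              TfidfVectorizer)
from sklearn.pipeline import make_pipeline
import numpy as np

toy_docs = ["the pizza is good", "the salad is fresh", "water with the pizza"]

# route 1: term counts, then tf-idf weighting
counts_then_tfidf = make_pipeline(CountVectorizer(), TfidfTransformer())
X_two_steps = counts_then_tfidf.fit_transform(toy_docs)

# route 2: the combined tf-idf vectorizer
X_direct = TfidfVectorizer().fit_transform(toy_docs)

# with default parameters both routes produce the same tf-idf matrix
assert np.allclose(X_two_steps.toarray(), X_direct.toarray())
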
# (snippet truncated in the source: this is the tail of the call that loads the
#  train/test data used below)
                               shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # in case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
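
The snippet is cut off right after announcing the chi-squared selection. A hedged sketch of the step that print statement describes, continuing inside the if block and reusing X_train, y_train, X_test and opts.select_chi2 from above: keep only the select_chi2 terms whose occurrence is most dependent on the class labels.

    # assumes: from sklearn.feature_selection import SelectKBest, chi2
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)  # term scores computed on the training set only
    X_test = ch2.transform(X_test)                 # the same feature mask is applied to the test set
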
Example #5
def test_vectorizer():
    # results to be compared
    res = []

    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer's DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    res.append(tfidf)
    res.append(t1.idf_)

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    return res
Example #6
data_test = fetch_20newsgroups(subset='test', categories=categories,
                              shuffle=True, random_state=42)

print "%d documents (training set)" % len(data_train.filenames)
print "%d documents (testing set)" % len(data_test.filenames)
print "%d categories" % len(data_train.target_names)
print

# split a training set and a test set
filenames_train, filenames_test = data_train.filenames, data_test.filenames
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read() for f in filenames_train))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in filenames_test))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print


################################################################################
# Benchmark classifiers
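
The snippet ends at the "Benchmark classifiers" banner. A hedged sketch of the kind of benchmarking loop that typically follows in this 20 newsgroups example: fit a classifier on X_train/y_train, predict on X_test, and report timings and a score. The particular classifiers and the macro-F1 metric are illustrative choices, not taken from the truncated code.

from time import time
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

def benchmark(clf):
    # train on the sparse tf-idf matrix
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    # predict the labels of the held-out documents
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0

    score = metrics.f1_score(y_test, pred, average='macro')
    print("%s: train %.2fs, test %.2fs, macro-F1 %.3f"
          % (clf.__class__.__name__, train_time, test_time, score))

for clf in (MultinomialNB(alpha=.01), SGDClassifier(alpha=.0001, penalty='l2')):
    benchmark(clf)
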
Example #7
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer's DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
Example #8
File: profiles.py  Project: almet/infuse
def find_profiles_text(algo=None, training_set=None, user=None):
    """Find different user profiles using the TF/IDF metric (Term Frequency / 
    Inverse Document Frequency).

    The stages of the pipeline are: 1. Vectorizer => 2. RandomizedPCA => 3. KMeans
    The randomized PCA is used here to reduce the dimensionality of the
    vector space.

    As we lack data, the dimensionality reduction is done using an already
    existing dataset, the 20 newsgroups dataset.

    :param algo: the algorithm to choose. Can be kmeans, meanshift or both
                 (specified by "all")
    :param training_set: the training set to use for the word vectorisation.
                         The default is the 20 newsgroups dataset; it is also
                         possible to use the documents by specifying "docs"
    """
    # init some vars
    if not algo:
        algo = "all"
    if not training_set:
        training_set = "newsgroup"

    print "Computing clusters using the TF-IDF scores,"\
          " using %s algo and the %s training dataset" % (algo, training_set)

    # we first train the PCA on the whole dataset to get the most representative
    # model. Download the dataset and train the PCA and the vectorizer only if a
    # pickled version is not available (i.e. only during the first run).
    wide_dataset = docs = None

    vec_filename = os.path.join(OUTPUT_PATH, "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH, "pickle/pca-%s.pickle" % training_set)
    pca2d_filename = os.path.join(OUTPUT_PATH, "pickle/pca2d-%s.pickle" % training_set)

    with mesure("  loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(docs) # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure("  loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)

            print "  reduce the dimentionality of the dataset to 100 components"
            # whiten=True ensure that the variance of each dim of the data in the 
            # transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100, whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # To visualize the data, we will project it onto 2 dimensions. To do so, we
    # will use a Principal Component Analysis (as we did in the first steps),
    # but projecting onto 2 dimensions.
    with mesure("  loading PCA 2D"):
        if os.path.isfile(pca2d_filename):
            pca_2d = _load_obj(pca2d_filename)
        else:
            docs = _load_docs(docs, training_set)
            print "  reduce the dimensionality of the dataset to 2 components"
            pca_2d = RandomizedPCA(n_components=2, whiten=True).fit(vec.transform(docs))
            _save_obj(pca_2d, pca2d_filename)

    # Now, go through all the resources for each user and try to find user
    # profiles with regard to TF-IDF.
    # As the process can take some time, there is a progress bar to keep the user
    # updated about the status of the operation.
    for username in list(db.users.find().distinct('username')):
        if user and user != username:
            continue
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue # if we don't have any url for this user, go to the next one!

        resources = list(db.resources.find({'url': {'$in': urls }, 
            'blacklisted': False, 'processed': True}))
        if not resources:
            continue
        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        docs = [res['content'] for res in resources]
        urls = [res['url'] for res in resources]

        # fit the contents to the new set of features the PCA determined
        with mesure("  reduce dataset dimensions to 100"):
            docs_transformed = pca.transform(vec.transform(docs))

        # what we have now is a matrix with 100 dimensions, which is not really
        # useful for representation. Keeping it for later analysis is a good
        # thing, so let's save this model for comparing profiles against
        # resources later.
        # TODO pickle the kmeans into mongodb ?

        # project X onto 2D
        with mesure("  reduce dataset dimensions to 2"):
            docs_2d = pca_2d.transform(vec.transform(docs))

        # run the clustering algorithm
        if algo in ["kmeans", "all"]:
            with mesure("  kmeans(5)"):
                cluster = KMeans(k=5).fit(docs_transformed)

            # get_words_from_clusters(cluster, 10, docs, vec)
            # print "ngrams for km on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 5)
            plot_2d(cluster, docs_2d, username, "kmeans", "Text-%s" % training_set)
            plot_pie(cluster, username, "kmeans", "Text-%s" % training_set)

        if algo in ["meanshift", "all"]:
            with mesure("  meanshift"):
                cluster = MeanShift().fit(docs_transformed) 
            # print "ngrams for ms on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 3)
            plot_2d(cluster, docs_2d, username, "meanshift", "Text-%s" % training_set)
            plot_pie(cluster, username, "meanshift", "Text-%s" % training_set)

        if algo in ["affinity", "all"]:
            with mesure("  affinity propagation"):
                cluster = AffinityPropagation().fit(euclidean_distances(docs_transformed, docs_transformed))
            plot_pie(cluster, username, "affinity", "Text-%s" % training_set)
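
The docstring of find_profiles_text describes a three-stage pipeline: vectorize the text, reduce it with a randomized PCA, then cluster with k-means. A minimal standalone sketch of that shape against the current scikit-learn API; TfidfVectorizer stands in for Vectorizer and TruncatedSVD provides the randomized low-rank projection (RandomizedPCA has since been removed), and the corpus and parameters are illustrative only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

toy_docs = ["free pizza tonight", "new kernel patch released", "salad and water",
            "linux kernel update", "pizza with extra cheese"]

pipeline = make_pipeline(
    TfidfVectorizer(),                               # 1. text -> sparse tf-idf vectors
    TruncatedSVD(n_components=3, random_state=0),    # 2. randomized low-rank projection
    KMeans(n_clusters=2, n_init=10, random_state=0)  # 3. cluster the reduced vectors
)
cluster_ids = pipeline.fit_predict(toy_docs)
print(cluster_ids)
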
Example #9
File: profiles.py  Project: almet/infuse
def cluster_users(features=None):
    """Cluster the users, without using information about profiles.

    Different features can be used to do so, at least text features and context 
    features.
    """
    training_set="newsgroup"
    docs = None

    vec_filename = os.path.join(OUTPUT_PATH, "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH, "pickle/pca-%s.pickle" % training_set)

    # get the training set, transform it to N dimensions
    with mesure("  loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(docs) # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure("  loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)

            print "  reduce the dimentionality of the dataset to 100 components"
            # whiten=True ensure that the variance of each dim of the data in the 
            # transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100, whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # for each user, get the contents related to him.
    users_content = []
    users_labels = []
    for username in list(db.users.find().distinct('username')):
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue # if we don't have any url for this user, go to the next one!

        resources = list(db.resources.find({'url': {'$in': urls }, 
            'blacklisted': False, 'processed': True}))
        if not resources:
            continue
        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        users_labels.append(username)
        users_content.append(" ".join([res['content'] for res in resources]))
    
    with mesure("  vectorise and reduce the dataset dimensions to 100"):
        transformed_content = pca.transform(vec.transform(users_content))

    # at the end, compute the similarity between users using different metrics
    # kmeans 3 clusters
    cluster = KMeans(3).fit(transformed_content)
    plot_pie(cluster, "all", "kmeans", "text")
    plot_2d(cluster, transformed_content, "all", "kmeans", "text")
    user_list = [[users_labels[idx] for idx, _ in enumerate(cluster.labels_ == cluster_id) if _] for cluster_id in np.unique(cluster.labels_)]

    # compute similarity scores
    from ipdb import set_trace; set_trace()
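
The function stops at a debugger breakpoint right where the "compute similarity scores" comment sits, so the actual metric is not shown. A hedged sketch of one plausible choice, cosine similarity between the users' reduced vectors, reusing transformed_content and users_labels from above:

    from sklearn.metrics.pairwise import cosine_similarity

    # pairwise cosine similarity between users in the reduced 100-dimensional space
    similarity = cosine_similarity(transformed_content)
    for i, name_a in enumerate(users_labels):
        for j, name_b in enumerate(users_labels):
            if j > i:
                print("%s / %s: %.3f" % (name_a, name_b, similarity[i, j]))
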
Example #10
# (snippet truncated in the source: this is the tail of the call that loads
#  data_train and data_test)
                               shuffle=True, random_state=42)

filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print


###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
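
The MiniBatchKMeans call above is cut off in the middle of its argument list. A hedged sketch of how such a fit typically continues, written against the current API (n_clusters rather than the old k parameter) and reusing X, labels and true_k from the snippet; the batch size and the evaluation metrics are illustrative.

from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans

mbkm = MiniBatchKMeans(init="random", n_clusters=true_k, max_iter=10,
                       random_state=13, batch_size=1000)

print("Clustering sparse data with %s" % mbkm)
t0 = time()
mbkm.fit(X)
print("done in %0.3fs" % (time() - t0))

# compare the clusters found with the known newsgroup labels
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_))
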
Example #11
def find_profiles_text(algo=None, training_set=None, user=None):
    """Find different user profiles using the TF/IDF metric (Term Frequency / 
    Inverse Document Frequency).

    The stages of the pipeline are: 1. Vectorizer => 2. RandomizedPCA => 3. KMeans
    The randomized PCA is used here to reduce the dimensionality of the
    vector space.

    As we lack data, the dimensionality reduction is done using an already
    existing dataset, the 20 newsgroups dataset.

    :param algo: the algorithm to choose. Can be kmeans, meanshift or both
                 (specified by "all")
    :param training_set: the training set to use for the word vectorisation.
                         The default is the 20 newsgroups dataset; it is also
                         possible to use the documents by specifying "docs"
    """
    # init some vars
    if not algo:
        algo = "all"
    if not training_set:
        training_set = "newsgroup"

    print "Computing clusters using the TF-IDF scores,"\
          " using %s algo and the %s training dataset" % (algo, training_set)

    # we first train the PCA on the whole dataset to get the most representative
    # model. Download the dataset and train the PCA and the vectorizer only if a
    # pickled version is not available (i.e. only during the first run).
    wide_dataset = docs = None

    vec_filename = os.path.join(OUTPUT_PATH,
                                "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH,
                                "pickle/pca-%s.pickle" % training_set)
    pca2d_filename = os.path.join(OUTPUT_PATH,
                                  "pickle/pca2d-%s.pickle" % training_set)

    with mesure("  loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(
                docs)  # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure("  loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)

            print "  reduce the dimentionality of the dataset to 100 components"
            # whiten=True ensure that the variance of each dim of the data in the
            # transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100,
                                whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # To visualize the data, we will project it onto 2 dimensions. To do so, we
    # will use a Principal Component Analysis (as we did in the first steps),
    # but projecting onto 2 dimensions.
    with mesure("  loading PCA 2D"):
        if os.path.isfile(pca2d_filename):
            pca_2d = _load_obj(pca2d_filename)
        else:
            docs = _load_docs(docs, training_set)
            print "  reduce the dimensionality of the dataset to 2 components"
            pca_2d = RandomizedPCA(n_components=2,
                                   whiten=True).fit(vec.transform(docs))
            _save_obj(pca_2d, pca2d_filename)

    # Now, go through all the resources for each user and try to find user
    # profiles with regard to TF-IDF.
    # As the process can take some time, there is a progress bar to keep the user
    # updated about the status of the operation.
    for username in list(db.users.find().distinct('username')):
        if user and user != username:
            continue
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!

        resources = list(
            db.resources.find({
                'url': {
                    '$in': urls
                },
                'blacklisted': False,
                'processed': True
            }))
        if not resources:
            continue
        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        docs = [res['content'] for res in resources]
        urls = [res['url'] for res in resources]

        # fit the contents to the new set of features the PCA determined
        with mesure("  reduce dataset dimensions to 100"):
            docs_transformed = pca.transform(vec.transform(docs))

        # what we have now is a matrix with 100 dimensions, which is not really
        # useful for representation. Keeping it for later analysis is a good
        # thing, so let's save this model for comparing profiles against
        # resources later.
        # TODO pickle the kmeans into mongodb ?

        # project X onto 2D
        with mesure("  reduce dataset dimensions to 2"):
            docs_2d = pca_2d.transform(vec.transform(docs))

        # run the clustering algorithm
        if algo in ["kmeans", "all"]:
            with mesure("  kmeans(5)"):
                cluster = KMeans(k=5).fit(docs_transformed)

            # get_words_from_clusters(cluster, 10, docs, vec)
            # print "ngrams for km on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 5)
            plot_2d(cluster, docs_2d, username, "kmeans",
                    "Text-%s" % training_set)
            plot_pie(cluster, username, "kmeans", "Text-%s" % training_set)

        if algo in ["meanshift", "all"]:
            with mesure("  meanshift"):
                cluster = MeanShift().fit(docs_transformed)
            # print "ngrams for ms on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 3)
            plot_2d(cluster, docs_2d, username, "meanshift",
                    "Text-%s" % training_set)
            plot_pie(cluster, username, "meanshift", "Text-%s" % training_set)

        if algo in ["affinity", "all"]:
            with mesure("  affinity propagation"):
                cluster = AffinityPropagation().fit(
                    euclidean_distances(docs_transformed, docs_transformed))
            plot_pie(cluster, username, "affinity", "Text-%s" % training_set)
Example #12
def cluster_users(features=None):
    """Cluster the users, without using information about profiles.

    Different features can be used to do so, at least text features and context 
    features.
    """
    training_set = "newsgroup"
    docs = None

    vec_filename = os.path.join(OUTPUT_PATH,
                                "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH,
                                "pickle/pca-%s.pickle" % training_set)

    # get the training set, transform it to N dimensions
    with mesure("  loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(
                docs)  # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure("  loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)

            print "  reduce the dimentionality of the dataset to 100 components"
            # whiten=True ensure that the variance of each dim of the data in the
            # transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100,
                                whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # for each user, get the contents related to him.
    users_content = []
    users_labels = []
    for username in list(db.users.find().distinct('username')):
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!

        resources = list(
            db.resources.find({
                'url': {
                    '$in': urls
                },
                'blacklisted': False,
                'processed': True
            }))
        if not resources:
            continue
        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        users_labels.append(username)
        users_content.append(" ".join([res['content'] for res in resources]))

    with mesure("  vectorise and reduce the dataset dimensions to 100"):
        transformed_content = pca.transform(vec.transform(users_content))

    # at the end, compute the similarity between users using different metrics
    # kmeans 3 clusters
    cluster = KMeans(3).fit(transformed_content)
    plot_pie(cluster, "all", "kmeans", "text")
    plot_2d(cluster, transformed_content, "all", "kmeans", "text")
    user_list = [[
        users_labels[idx]
        for idx, _ in enumerate(cluster.labels_ == cluster_id) if _
    ] for cluster_id in np.unique(cluster.labels_)]

    # compute similarity scores
    from ipdb import set_trace
    set_trace()