def affinity(infinitives):
    print "Extracting features..."
    X, _ = extract_features(infinitives, 3, False)
    X_norms = np.sum(X * X, axis=1)
    S = -X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
    p = 10 * np.median(S)

    print "Fitting affinity propagation clustering..."
    af = AffinityPropagation().fit(S, p)
    indices = af.cluster_centers_indices_
    for i, idx in enumerate(indices):
        print i, infinitives[idx]
    n_clusters_ = len(indices)

    print "Fitting PCA..."
    X = RandomizedPCA(2).fit(X).transform(X)

    print "Plotting..."
    pl.figure(1)
    pl.clf()
    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = af.labels_ == k
        cluster_center = X[indices[k]]
        pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
        pl.plot(cluster_center[0], cluster_center[1], 'o',
                markerfacecolor=col, markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show()
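# The affinity matrix S above relies on the norm-expansion identity
# ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 x_i.x_j, so S is just the
# negative squared Euclidean distance between samples. A small numpy-only
# sketch with illustrative values, checking the two formulations agree:
import numpy as np

X_check = np.array([[0.0, 1.0, 2.0],
                    [1.0, 0.0, 0.5],
                    [2.0, 2.0, 1.0],
                    [0.5, 0.5, 0.5]])

norms = np.sum(X_check * X_check, axis=1)
S_fast = -norms[:, np.newaxis] - norms[np.newaxis, :] + 2 * np.dot(X_check, X_check.T)
S_direct = -((X_check[:, np.newaxis, :] - X_check[np.newaxis, :, :]) ** 2).sum(axis=-1)

assert np.allclose(S_fast, S_direct)  # same matrix, computed two ways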
def _compare_clusters(**datasets):
    for name, dataset in datasets.items():
        pca = RandomizedPCA(2)
        pca.fit(dataset)
        X = pca.transform(dataset)

        instances = _kmeans()
        for instance in instances:
            instance.fit(dataset)
            # reduce to 2d for visualisation
            draw_cluster_2d(instance, X,
                            filename="%s-kmeans-%s.png" % (name, instance.k))

        ms_instances = _meanshift(dataset)
        for instance in ms_instances:
            instance.fit(dataset)

        compare_pies(
            [_get_distribution(i) for i in instances]
            + [_get_distribution(i) for i in ms_instances],
            ["KMeans(%s)" % i.k for i in instances]
            + ["MeanShift(%s)" % round(i.bandwidth) for i in ms_instances],
            filename="%s-pie.png" % name)
def main():
    dataset = []
    # create a random dataset with points on the X=Y axis
    for i in range(100):
        for n in range(randint(1, 10)):
            dataset.append((i + randint(-5, +5), i + randint(-5, +5)))
    dataset = np.array(dataset)
    draw(dataset, 'before.png')

    # run a PCA to 2 dimensions for this dataset
    transformed_dataset = RandomizedPCA(2).fit(dataset).transform(dataset)
    from ipdb import set_trace
    set_trace()
    draw_2d(transformed_dataset, 'after.png')
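# The synthetic dataset built in main() scatters points around the x = y
# line, so the dominant principal component should point along the diagonal.
# A numpy-only check of that intuition on freshly generated data of the same
# shape (the values are random and purely illustrative):
import numpy as np
from random import randint

points = []
for i in range(100):
    for _ in range(randint(1, 10)):
        points.append((i + randint(-5, 5), i + randint(-5, 5)))
points = np.array(points, dtype=float)

centered = points - points.mean(axis=0)
eigvals, eigvecs = np.linalg.eigh(np.cov(centered, rowvar=False))

# np.linalg.eigh returns eigenvalues in ascending order, so the last column
# is the dominant direction; it should be close to (1, 1) / sqrt(2).
print(eigvecs[:, -1])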
def _load_pca_2d(self):
    return RandomizedPCA(n_components=2, whiten=True).fit(
        self.vec.transform(self.docs))
def _load_pca(self, N, *args):
    return RandomizedPCA(n_components=N, whiten=True).fit(
        self.vec.transform(self.docs))
print "n_features: %d" % n_features # Split the dataset into a training and test set split = n_samples * 3 / 4 X_train, X_test = X[:split], X[split:] y_train, y_test = y[:split], y[split:] # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 150 print "Extracting the top %d eigenfaces" % n_components pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) eigenfaces = pca.components_.T.reshape((n_components, 64, 64)) # project the input data on the eigenfaces orthonormal basis X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) # Train a SVM classification model print "Fitting the classifier to the training set" param_grid = {"C": [1, 5, 10, 100], "gamma": [0.0001, 0.001, 0.01, 0.1]} clf = GridSearchCV(SVC(kernel="rbf"), param_grid, fit_params={"class_weight": "auto"}, n_jobs=-1) clf = clf.fit(X_train_pca, y_train) print "Best estimator found by grid search:"
X = faces.reshape((n_samples, h * w))
n_features = X.shape[1]

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names

# split into a training and testing set
train, test = iter(StratifiedKFold(y, k=4)).next()
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
eigenfaces = pca.components_.reshape((n_components, h, w))

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a SVM classification model
param_grid = dict(C=[1, 5, 10, 50, 100],
                  gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1])
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'}, verbose=1)
clf = clf.fit(X_train_pca, y_train)
print clf.best_estimator

# Quantitative evaluation of the model quality on the test set
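# The snippet above stops where the test-set evaluation would begin. A
# minimal sketch of that evaluation, reusing clf, X_test_pca, y_test and
# target_names from above; classification_report and confusion_matrix are
# assumed to be importable from sklearn.metrics (the exact location may
# differ in very old releases).
from sklearn.metrics import classification_report, confusion_matrix

y_pred = clf.predict(X_test_pca)

# per-class precision / recall / f1-score
print classification_report(y_test, y_pred, target_names=target_names)

# rows are true labels, columns are predicted labels
print confusion_matrix(y_test, y_pred, labels=range(len(target_names)))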
def _draw(dataset, filename, title):
    pca = RandomizedPCA(2)
    pca.fit(dataset)
    X = pca.transform(dataset)
    draw_2d(X, filename, title)
def find_profiles_text(algo=None, training_set=None, user=None):
    """Find different user profiles using the TF/IDF metric (Term Frequency /
    Inverse Document Frequency).

    The stages of the pipeline are:
        1. Vectorizer => 2. RandomizedPCA => 3. KMeans

    The randomized PCA is useful here to reduce the dimensionality of the
    vector space. As we lack some data, the dimensionality reduction is made
    using an already existing dataset, the 20 newsgroups dataset.

    :param algo: the algorithm to choose. Can be kmeans, meanshift or both
                 (specified by "all")
    :param training_set: the training set to use for the word vectorisation.
                         The default setting is to use the 20 newsgroups
                         dataset; it is possible to use the documents instead
                         by specifying "docs"
    """
    # init some vars
    if not algo:
        algo = "all"
    if not training_set:
        training_set = "newsgroup"

    print "Computing clusters using the TF-IDF scores,"\
          " using %s algo and the %s training dataset" % (algo, training_set)

    # We first train the PCA with the whole dataset to have the most
    # representative model. Download the dataset and train the PCA and the
    # vectorizer only if a pickled version is not available (i.e. only during
    # the first run).
    wide_dataset = docs = None

    vec_filename = os.path.join(OUTPUT_PATH, "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH, "pickle/pca-%s.pickle" % training_set)
    pca2d_filename = os.path.join(OUTPUT_PATH, "pickle/pca2d-%s.pickle" % training_set)

    with mesure(" loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(docs)  # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure(" loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 100 components"
            # whiten=True ensures that the variance of each dimension of the
            # data in the transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100, whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # To visualize the data, we will project it on 2 dimensions. To do so, we
    # will use a Principal Component Analysis (as we did in the first steps),
    # but projecting on 2 dimensions.
    with mesure(" loading PCA 2D"):
        if os.path.isfile(pca2d_filename):
            pca_2d = _load_obj(pca2d_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 2 components"
            pca_2d = RandomizedPCA(n_components=2, whiten=True).fit(vec.transform(docs))
            _save_obj(pca_2d, pca2d_filename)

    # Now, go through the whole set of resources for each user and try to find
    # user profiles based on TF-IDF. As the process can take some time, there
    # is a progress bar to keep the user updated about the status of the
    # operation.
    for username in list(db.users.find().distinct('username')):
        if user and user != username:
            continue
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!
        resources = list(db.resources.find({'url': {'$in': urls},
                                            'blacklisted': False,
                                            'processed': True}))
        if not resources:
            continue

        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        docs = [res['content'] for res in resources]
        urls = [res['url'] for res in resources]

        # fit the contents to the new set of features the PCA determined
        with mesure(" reduce dataset dimensions to 100"):
            docs_transformed = pca.transform(vec.transform(docs))

        # What we have now is a matrix with 100 dimensions, which is not really
        # useful for representation. Keeping this for later analysis is a good
        # thing, so let's save this model for comparing profiles against
        # resources later.
        # TODO pickle the kmeans into mongodb?

        # project X onto 2D
        with mesure(" reduce dataset dimensions to 2"):
            docs_2d = pca_2d.transform(vec.transform(docs))

        # run the clustering algorithm
        if algo in ["kmeans", "all"]:
            with mesure(" kmeans(5)"):
                cluster = KMeans(k=5).fit(docs_transformed)
            # get_words_from_clusters(cluster, 10, docs, vec)
            # print "ngrams for km on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 5)
            plot_2d(cluster, docs_2d, username, "kmeans", "Text-%s" % training_set)
            plot_pie(cluster, username, "kmeans", "Text-%s" % training_set)

        if algo in ["meanshift", "all"]:
            with mesure(" meanshift"):
                cluster = MeanShift().fit(docs_transformed)
            # print "ngrams for ms on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 3)
            plot_2d(cluster, docs_2d, username, "meanshift", "Text-%s" % training_set)
            plot_pie(cluster, username, "meanshift", "Text-%s" % training_set)

        if algo in ["affinity", "all"]:
            with mesure(" affinity propagation"):
                cluster = AffinityPropagation().fit(
                    euclidean_distances(docs_transformed, docs_transformed))
            plot_pie(cluster, username, "affinity", "Text-%s" % training_set)
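# The docstring of find_profiles_text describes a three-stage pipeline:
# Vectorizer => RandomizedPCA => KMeans. The code above targets an older
# scikit-learn API; the sketch below is a rough modern-API equivalent, not
# the original implementation. TruncatedSVD stands in for the randomized PCA
# step because current PCA implementations reject sparse TF-IDF matrices,
# and the toy corpus is purely illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

toy_corpus = ["first toy document", "second toy document",
              "something unrelated", "yet another toy text",
              "completely different topic"]

text_pipeline = make_pipeline(
    TfidfVectorizer(),                # text -> sparse TF-IDF vectors
    TruncatedSVD(n_components=3),     # dimensionality reduction (LSA)
    KMeans(n_clusters=2, n_init=10),  # cluster the reduced vectors
)
print(text_pipeline.fit_predict(toy_corpus))  # one cluster label per document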
def cluster_users(features=None):
    """Cluster the users, without using information about profiles.

    Different features can be used to do so, at least text features and
    context features.
    """
    training_set = "newsgroup"
    docs = None
    vec_filename = os.path.join(OUTPUT_PATH, "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH, "pickle/pca-%s.pickle" % training_set)

    # get the training set, transform it to N dimensions
    with mesure(" loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(docs)  # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure(" loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 100 components"
            # whiten=True ensures that the variance of each dimension of the
            # data in the transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100, whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # for each user, get the contents related to them
    users_content = []
    users_labels = []
    for username in list(db.users.find().distinct('username')):
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!
        resources = list(db.resources.find({'url': {'$in': urls},
                                            'blacklisted': False,
                                            'processed': True}))
        if not resources:
            continue

        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        users_labels.append(username)
        users_content.append(" ".join([res['content'] for res in resources]))

    with mesure(" vectorise and reduce the dataset dimensions to 100"):
        transformed_content = pca.transform(vec.transform(users_content))

    # at the end, compute the similarity between users using different metrics
    # kmeans, 3 clusters
    cluster = KMeans(3).fit(transformed_content)
    plot_pie(cluster, "all", "kmeans", "text")
    plot_2d(cluster, transformed_content, "all", "kmeans", "text")

    user_list = [[users_labels[idx]
                  for idx, in_cluster in enumerate(cluster.labels_ == cluster_id)
                  if in_cluster]
                 for cluster_id in np.unique(cluster.labels_)]

    # compute similarity scores
    from ipdb import set_trace
    set_trace()
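# A possible continuation for the "compute similarity scores" step that
# cluster_users leaves at a debugger breakpoint: pairwise cosine similarity
# between the users' 100-dimensional representations. This is only a sketch
# reusing transformed_content and users_labels from above, and assumes
# cosine_similarity is available in sklearn.metrics.pairwise.
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(transformed_content)
for i, name_a in enumerate(users_labels):
    for j, name_b in enumerate(users_labels):
        if j <= i:
            continue
        print "%s <-> %s: %.3f" % (name_a, name_b, similarities[i, j])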
def lininit(self):
    """X = U sigma W^T,  X^T X = W sigma^2 W^T,  T = X W = U sigma.

    Further, we can get lower ranks by using just a few of the eigenvectors:
    T(2) = U(2) sigma(2) = X W(2), where 2 is the number of selected
    eigenvectors. This is how we initialize the map: using only the first two
    eigenvalues and eigenvectors, and creating a linear combination of them in
    the new map by giving values from -1 to 1 in each direction of the SOM
    map. It should be noted that here, X is the covariance matrix of the
    original data.
    """
    msize = getattr(self, 'mapsize')
    rows = msize[0]
    cols = msize[1]
    nnodes = getattr(self, 'nnodes')

    if np.min(msize) > 1:
        coord = np.zeros((nnodes, 2))
        for i in range(0, nnodes):
            coord[i, 0] = int(i / cols)  # x
            coord[i, 1] = int(i % cols)  # y
        mx = np.max(coord, axis=0)
        mn = np.min(coord, axis=0)
        coord = (coord - mn) / (mx - mn)
        coord = (coord - .5) * 2

        data = getattr(self, 'data')
        me = np.mean(data, 0)
        data = (data - me)
        codebook = np.tile(me, (nnodes, 1))

        pca = RandomizedPCA(n_components=2)  # Randomized PCA is scalable
        # pca = PCA(n_components=2)
        pca.fit(data)
        eigvec = pca.components_
        eigval = pca.explained_variance_
        norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec))
        eigvec = ((eigvec.T / norms) * eigval).T

        for j in range(nnodes):
            for i in range(eigvec.shape[0]):
                codebook[j, :] = codebook[j, :] + coord[j, i] * eigvec[i, :]
        return np.around(codebook, decimals=6)

    elif np.min(msize) == 1:
        coord = np.zeros((nnodes, 1))
        for i in range(0, nnodes):
            # coord[i, 0] = int(i / cols)  # x
            coord[i, 0] = int(i % cols)  # y
        mx = np.max(coord, axis=0)
        mn = np.min(coord, axis=0)
        coord = (coord - mn) / (mx - mn)
        coord = (coord - 0.5) * 2

        data = getattr(self, 'data')
        me = np.mean(data, 0)
        data = (data - me)
        codebook = np.tile(me, (nnodes, 1))

        pca = RandomizedPCA(n_components=1)  # Randomized PCA is scalable
        # pca = PCA(n_components=2)
        pca.fit(data)
        eigvec = pca.components_
        eigval = pca.explained_variance_
        norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec))
        eigvec = ((eigvec.T / norms) * eigval).T

        for j in range(nnodes):
            for i in range(eigvec.shape[0]):
                codebook[j, :] = codebook[j, :] + coord[j, i] * eigvec[i, :]
        return np.around(codebook, decimals=6)
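# A minimal standalone sketch of the same PCA-based linear initialisation,
# outside the SOM class, using the current PCA(svd_solver='randomized')
# spelling in place of RandomizedPCA. The 5x4 map and the random data are
# illustrative assumptions, not part of the original code.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
data = rng.rand(200, 6)            # illustrative dataset: 200 samples, 6 features
rows, cols = 5, 4                  # illustrative map size
nnodes = rows * cols

# node coordinates on the grid, rescaled to [-1, 1] in each direction
coord = np.array([[i // cols, i % cols] for i in range(nnodes)], dtype=float)
coord = (coord - coord.min(axis=0)) / (coord.max(axis=0) - coord.min(axis=0))
coord = (coord - 0.5) * 2

# first two principal directions, scaled by their explained variance
me = data.mean(axis=0)
pca = PCA(n_components=2, svd_solver='randomized', random_state=0).fit(data - me)
eigvec = pca.components_
eigvec = (eigvec.T / np.linalg.norm(eigvec, axis=1) * pca.explained_variance_).T

# each codebook vector = data mean + linear combination of the two directions
codebook = np.tile(me, (nnodes, 1)) + coord.dot(eigvec)
print(codebook.shape)  # (20, 6)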
digits = datasets.load_digits()

# reshape the data using the traditional (n_samples, n_features) shape
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
n_features = X.shape[1]

n_components = 16

######################################################################
# Compute a PCA (eigendigits) on the digit dataset
print "Extracting the top %d eigendigits from %d images" % (
    n_components, X.shape[0])
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X)
print "done in %0.3fs" % (time() - t0)
eigendigits = pca.components_

######################################################################
# Compute a NMF on the digit dataset
print "Extracting %d non-negative features from %d images" % (
    n_components, X.shape[0])
t0 = time()
nmf = NMF(n_components=n_components, init='nndsvd', beta=5, tol=1e-2,
          sparseness="components").fit(X)
print "done in %0.3fs" % (time() - t0)
nmfdigits = nmf.components_
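# A possible visualisation of the components extracted above: eigendigits and
# nmfdigits are both (n_components, 64) arrays that can be reshaped back into
# 8x8 images. This is only a sketch; pylab is imported as pl to match the
# convention used earlier in this section.
import pylab as pl

def plot_components(components, title):
    pl.figure(figsize=(4, 4))
    pl.suptitle(title)
    for i, comp in enumerate(components):
        pl.subplot(4, 4, i + 1)
        pl.imshow(comp.reshape(8, 8), cmap=pl.cm.gray, interpolation='nearest')
        pl.xticks(())
        pl.yticks(())

plot_components(eigendigits, 'Eigendigits (RandomizedPCA)')
plot_components(nmfdigits, 'Non-negative components (NMF)')
pl.show()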