def context_users_similarity(self):
    """Compute the similarity between users using context features."""
    filename = os.path.join(OUTPUT_PATH, "pickle",
                            "context_users_features.pickle")
    if os.path.isfile(filename):
        with open(filename, "rb") as f:
            features = pickle.load(f)
    else:
        self._processor = ContextProcessor()
        features = []
        # get all the features for each user
        for user, docs in self._processor.iterate():
            features.append(self._processor.get_features(docs, user))
        with open(filename, "wb") as f:
            pickle.dump(features, f)

    reduced_features = []
    for doc in features:
        reduced_features.append(np.mean(doc, axis=1))

    # debugging breakpoint left in place to inspect the reduced features
    from ipdb import set_trace; set_trace()

    # it is possible to cluster each user's documents
    #
    # for alexis, let's draw the similarity matrix of his documents
    draw_matrix(euclidean_distances(features[0], features[0]),
                "context_alexis", OUTPUT_PATH)
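# draw_matrix() is defined elsewhere in the project; judging by the calls in
# this module, it renders a distance matrix and saves the figure under a given
# output path. The helper below is only a hypothetical sketch of such a
# function, assuming matplotlib/pylab; it is not the original implementation.
def _draw_matrix_sketch(matrix, name, output_path):
    import pylab as pl
    pl.figure()
    # show the pairwise distances as an image, darker meaning more distant
    pl.imshow(matrix, interpolation="nearest", cmap=pl.cm.Blues)
    pl.colorbar()
    pl.title(name)
    pl.savefig(os.path.join(output_path, "%s.png" % name))
    pl.close()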
def text_profiles_similarity(self):
    """Compute and return similarity scores between profiles, based on
    text features and KMeans clustering.
    """
    # Text (TF-IDF)
    processor = TextProcessor(store_docs=True,
                              clusters={'kmeans': lambda: KMeans(5)})
    processor.run()

    # the concatenated documents for each profile will be stored here
    docs = []
    for username, cluster in processor.clusters["kmeans"].items():
        # for each cluster, build up a new document; we will then use it to
        # compare the profiles
        for label in np.unique(cluster.labels_):
            # get only the documents with this label
            docs.append(" ".join([
                processor.stored_docs[username][i]
                for i, val in enumerate(cluster.labels_ == label) if val
            ]))

    features = processor.get_features(docs)
    self._processor = processor
    return euclidean_distances(features, features)
def text_users_similarity(self):
    """Compute the similarity between users using text features."""
    processor = self._processor = TextProcessor()
    features = []
    for user, docs in processor.iterate():
        features.append(processor.get_features(docs, user))

    # draw the matrix for alexis
    draw_matrix(euclidean_distances(features[0], features[0]),
                "text_alexis", OUTPUT_PATH)
def text_cluster_users_similarity(self):
    """Compute and return similarity scores between users, based on text
    features.
    """
    processor = self._processor = ClusterUsers(store_docs=True)

    # for each user, we want to have a set of features representing them
    features = []
    for name, docs in processor.iterate():
        features = processor.get_features(docs)

    # there is only one tuple (name, docs), so we can return right away
    return euclidean_distances(features, features)
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
distance = euclidean_distances(k_means_cluster_centers,
                               mbk_means_cluster_centers,
                               squared=True)
order = distance.argmin(axis=1)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_batch)
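# The plotting fragment above assumes that X, n_clusters, the k_means_*
# variables, mbk and t_batch were computed earlier in the script. Below is a
# minimal sketch of such a setup, modelled on the usual KMeans vs.
# MiniBatchKMeans comparison and written against the old scikit-learn API
# (KMeans(k=...)) used elsewhere in this code. The dataset parameters and
# names are assumptions, not the original script.
import time
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import make_blobs

n_clusters = 3
# generate a small synthetic 2D dataset with three blobs
X, _ = make_blobs(n_samples=3000, centers=n_clusters, cluster_std=0.7)

# batch KMeans, keeping track of the training time displayed in the plot
t0 = time.time()
k_means = KMeans(k=n_clusters).fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_

# the mini-batch variant, whose labels and centers are read above
mbk = MiniBatchKMeans(k=n_clusters).fit(X)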
def find_profiles_text(algo=None, training_set=None, user=None):
    """Find different user profiles using the TF-IDF metric (Term Frequency /
    Inverse Document Frequency).

    The stages of the pipeline are:
    1. Vectorizer => 2. RandomizedPCA => 3. KMeans

    The randomized PCA is useful here to reduce the dimensionality of the
    vector space. As we lack data, the dimensionality reduction is done using
    an already existing dataset, the 20 newsgroups dataset.

    :param algo: the algorithm to choose. Can be kmeans, meanshift or
                 affinity, or all of them (specified by "all")
    :param training_set: the training set to use for the word vectorisation.
                         The default is the 20 newsgroups dataset; it is
                         possible to use our own documents by specifying
                         "docs"
    """
    # init some vars
    if not algo:
        algo = "all"
    if not training_set:
        training_set = "newsgroup"

    print "Computing clusters using the TF-IDF scores,"\
          " using %s algo and the %s training dataset" % (algo, training_set)

    # We first train the PCA on the whole dataset to get the most
    # representative model. Download the dataset and train the PCA and the
    # vectorizer only if a pickled version is not available (i.e. only during
    # the first run).
    wide_dataset = docs = None

    vec_filename = os.path.join(OUTPUT_PATH,
                                "pickle/vec-%s.pickle" % training_set)
    pca_filename = os.path.join(OUTPUT_PATH,
                                "pickle/pca-%s.pickle" % training_set)
    pca2d_filename = os.path.join(OUTPUT_PATH,
                                  "pickle/pca2d-%s.pickle" % training_set)

    with mesure(" loading vectors"):
        if os.path.isfile(vec_filename):
            vec = _load_obj(vec_filename)
        else:
            docs = _load_docs(docs, training_set)
            vec = Vectorizer().fit(docs)  # equivalent to CountVectorizer + TfIdf
            _save_obj(vec, vec_filename)

    with mesure(" loading PCA"):
        if os.path.isfile(pca_filename):
            pca = _load_obj(pca_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 100 components"
            # whiten=True ensures that the variance of each dimension of the
            # data in the transformed space is scaled to 1.0
            pca = RandomizedPCA(n_components=100,
                                whiten=True).fit(vec.transform(docs))
            _save_obj(pca, pca_filename)

    # To visualize the data, we will project it on 2 dimensions. To do so, we
    # use a Principal Component Analysis (as in the first steps), but
    # projecting on 2 dimensions.
    with mesure(" loading PCA 2D"):
        if os.path.isfile(pca2d_filename):
            pca_2d = _load_obj(pca2d_filename)
        else:
            docs = _load_docs(docs, training_set)
            print " reduce the dimensionality of the dataset to 2 components"
            pca_2d = RandomizedPCA(n_components=2,
                                   whiten=True).fit(vec.transform(docs))
            _save_obj(pca_2d, pca2d_filename)

    # Now, go through all the resources for each user and try to find user
    # profiles with regard to TF-IDF. As the process can take some time, the
    # user is kept updated about the status of the operation.
    for username in list(db.users.find().distinct('username')):
        if user and user != username:
            continue
        # get all the resources for this user
        urls = db.views.find({"user.username": username}).distinct("url")
        if not urls:
            continue  # if we don't have any url for this user, go to the next one!
        resources = list(db.resources.find({'url': {'$in': urls},
                                            'blacklisted': False,
                                            'processed': True}))
        if not resources:
            continue

        print "processing %s (%s docs)" % (username, len(resources))

        # get the docs content and names
        docs = [res['content'] for res in resources]
        urls = [res['url'] for res in resources]

        # fit the contents to the new set of features the PCA determined
        with mesure(" reduce dataset dimensions to 100"):
            docs_transformed = pca.transform(vec.transform(docs))

        # What we have now is a matrix with 100 dimensions, which is not
        # really useful for representation. Keeping it for later analysis is
        # a good thing, so let's save this model for comparing profiles
        # against resources later.
        # TODO pickle the kmeans into mongodb?

        # project X onto 2D
        with mesure(" reduce dataset dimensions to 2"):
            docs_2d = pca_2d.transform(vec.transform(docs))

        # run the clustering algorithm
        if algo in ["kmeans", "all"]:
            with mesure(" kmeans(5)"):
                cluster = KMeans(k=5).fit(docs_transformed)
            # get_words_from_clusters(cluster, 10, docs, vec)
            # print "ngrams for km on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 5)
            plot_2d(cluster, docs_2d, username, "kmeans",
                    "Text-%s" % training_set)
            plot_pie(cluster, username, "kmeans", "Text-%s" % training_set)

        if algo in ["meanshift", "all"]:
            with mesure(" meanshift"):
                cluster = MeanShift().fit(docs_transformed)
            # print "ngrams for ms on %s" % username
            # get_n_bigrams_from_clusters(cluster, docs, 3)
            plot_2d(cluster, docs_2d, username, "meanshift",
                    "Text-%s" % training_set)
            plot_pie(cluster, username, "meanshift", "Text-%s" % training_set)

        if algo in ["affinity", "all"]:
            with mesure(" affinity propagation"):
                cluster = AffinityPropagation().fit(
                    euclidean_distances(docs_transformed, docs_transformed))
            plot_pie(cluster, username, "affinity", "Text-%s" % training_set)
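# _load_obj() and _save_obj() are defined elsewhere in the project; judging by
# how they are used above, they are most likely thin pickle wrappers. The two
# helpers below are a minimal sketch under that assumption, not the original
# implementations.
def _save_obj_sketch(obj, filename):
    # serialize the object to disk so the next run can skip the computation
    with open(filename, "wb") as f:
        pickle.dump(obj, f)

def _load_obj_sketch(filename):
    # load a previously pickled object back into memory
    with open(filename, "rb") as f:
        return pickle.load(f)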
def find_profiles_context(algo=None, user=None):
    """Find profiles based on:

    * the location of the views
    * the time of the day of the views
    * the day of the week of the views
    """
    if not algo:
        algo = "all"

    # get all users
    for username in db.users.distinct("username"):
        if user and user != username:
            continue
        urls = db.views.find({"user.username": username}).distinct("url")
        resources = []
        if not urls:
            continue
        print "processing %s (%s docs)" % (username, len(urls))

        t0 = time.time()
        progress = ProgressBar(
            widgets=[" building the matrix for %s" % username,
                     Percentage(), Bar()])

        for url in progress(urls):
            # get the views related to this user and this url
            views = list(db.views.find({"user.username": username,
                                        "url": url}))

            indicators = ['average', 'mean', 'median', 'var', 'std']
            row = [len(views), sum([int(v['duration']) for v in views])]
            # TODO add location

            daytimes = []
            weekdays = []
            for view in views:
                daytimes.append(view['daytime'])
                weekdays.append(view['weekday'])

            for indicator in indicators:
                row.append(getattr(np, indicator)(daytimes))
                row.append(getattr(np, indicator)(weekdays))

            resources.append(row)

        resources = np.array(resources)
        print "matrix generation took %s" % (time.time() - t0)

        # project X on 2D
        # print " project the dataset into 2d"
        # pca_2d = RandomizedPCA(n_components=2, whiten=True).fit(resources)
        # docs_2d = pca_2d.transform(resources)

        # run the clustering algorithm
        if algo in ["kmeans", "all"]:
            with mesure(" kmeans(5)"):
                cluster = KMeans(k=5).fit(resources)
            plot_2d(cluster, resources, username, "kmeans", "Context")
            plot_pie(cluster, username, "kmeans", "Context")

        if algo in ["meanshift", "all"]:
            with mesure(" meanshift"):
                cluster = MeanShift().fit(resources)
            plot_2d(cluster, resources, username, "meanshift", "Context")
            plot_pie(cluster, username, "meanshift", "Context")

        if algo in ["affinity", "all"]:
            with mesure(" affinity propagation"):
                cluster = AffinityPropagation().fit(
                    euclidean_distances(resources, resources))
            # plot_2d(cluster, resources, username, "affinity", "Context")
            plot_pie(cluster, username, "affinity", "Context")
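# mesure() is the project's timing context manager, used above as
# "with mesure(label):". The sketch below is only an assumption of what it
# could look like: it prints the elapsed time when the block exits, and the
# "indent" argument (guessed from the calls in this module) only shifts the
# printed label.
import time
from contextlib import contextmanager

@contextmanager
def mesure_sketch(label, indent=0):
    start = time.time()
    yield
    print "%s%s took %.2fs" % ("  " * indent, label, time.time() - start)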
def get_profiles_similarity(usernames, N):
    """Return a matrix of similarity between the users.

    :param usernames: the list of usernames in the system
    :param N: the number of profiles to find for each user
    """
    # all the documents per profile will be stored in this variable
    doc_profiles = []
    # all the urls for each profile will be put in this array
    urls = []

    # For each user, get his views
    for username in usernames:
        print "processing %s" % username
        # don't use a generator, as we want to access the views multiple
        # times; we actually need to store them in memory
        views = list(db.views.find({
            "user.username": username,
            'url': {'$nin': list(db.resources.find(
                {'blacklisted': True}).distinct('url'))}}))

        features = get_views_features(views)

        # Run a clustering algorithm on the views
        np_features = np.array(features)
        # bandwidth = estimate_bandwidth(np_features, quantile=0.3)
        # algo = MeanShift(bandwidth=bandwidth).fit(np_features)

        # The distribution from the KMeans algorithm is better because we get
        # more balanced clusters; MeanShift ends up with a lot of clusters of
        # less than 2 elements.
        with mesure("clustering the context to find %s profiles" % N,
                    indent=1):
            algo = KMeans(N).fit(np_features)

        # for each cluster, get the matching views; this means iterating N
        # times (where N is the number of clusters found)
        for label in np.unique(algo.labels_):
            profile_urls = []
            for i, matches in enumerate(algo.labels_ == label):
                view = views[i]
                if matches and view['url'] not in profile_urls:
                    profile_urls.append(view['url'])

            # save the urls of this profile for later use
            urls.append(profile_urls)

            # get the resources for those urls
            resources = db.resources.find({'url': {'$in': profile_urls},
                                           'blacklisted': False,
                                           'processed': True})

            # append the contents for this profile together
            doc_profiles.append(" ".join([r['content'] for r in resources]))

    # The vectorizer was trained on a big and sparse set of documents; it is
    # loaded from disk to avoid recomputing it each time.
    with open(os.path.join(OUTPUT_PATH, "pickle",
                           "vecnewsgroup.pickle")) as f:
        vec = pickle.load(f)

    # Same for the principal component analysis (PCA)
    with open(os.path.join(OUTPUT_PATH, "pickle",
                           "pca100-newsgroup.pickle")) as f:
        pca = pickle.load(f)

    # At this stage, all the documents are stored in memory, sometimes more
    # than once for each resource. We want to vectorize them all, so this can
    # take some time.
    with mesure("vectorizing %s profiles" % len(doc_profiles)):
        vec_profiles = pca.transform(vec.transform(doc_profiles))

    # compute and return their similarity scores
    return euclidean_distances(vec_profiles, vec_profiles), urls
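# get_views_features() is defined elsewhere in the project. Since the cluster
# labels are indexed back into the "views" list (views[i] above), it has to
# return one feature row per view. The sketch below is only an assumption of
# what those rows could contain, reusing the fields consumed elsewhere in
# this module (daytime, weekday, duration); it is not the original
# implementation.
def get_views_features_sketch(views):
    features = []
    for view in views:
        # one numeric row per view, in the same order as the input list
        features.append([int(view['daytime']),
                         int(view['weekday']),
                         int(view['duration'])])
    return features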