def text_profiles_similarity(self):
    """Compute and return similarity scores between profiles, based on
    text features and KMeans clustering.

    Builds one aggregated document per (user, cluster-label) pair, turns
    them into TF-IDF feature vectors and returns the pairwise euclidean
    distance matrix between those vectors.
    """
    # Text (TF-IDF)
    processor = TextProcessor(store_docs=True,
                              clusters={'kmeans': lambda: KMeans(5)})
    processor.run()

    # One aggregated document per (user, cluster label); these form the
    # dataset we then use to compare the profiles.
    cluster_docs = []
    for username, cluster in processor.clusters["kmeans"].items():
        user_docs = processor.stored_docs[username]
        for label in np.unique(cluster.labels_):
            # gather only the documents carrying this label
            member_indices = np.flatnonzero(cluster.labels_ == label)
            cluster_docs.append(
                " ".join(user_docs[idx] for idx in member_indices))

    features = processor.get_features(cluster_docs)
    self._processor = processor
    return euclidean_distances(features, features)
def __init__(self):
    """Initialise output location and the lazily-populated analysis caches."""
    self.output_path = OUTPUT_PATH
    # Caches filled on demand by the analysis methods.
    for cache_attr in ("_processor", "_usernames", "_rankings"):
        setattr(self, cache_attr, None)
    # Factory producing the processor used when none is supplied.
    self._default_processor = lambda: TextProcessor(
        store_docs=True,
        clusters={"kmeans": lambda: KMeans(5)},
    )
def text_users_similarity(self):
    """Compute the similarity between users using text features"""
    self._processor = processor = TextProcessor()
    per_user_features = [processor.get_features(docs, user)
                         for user, docs in processor.iterate()]
    # draw the matrix for alexis
    first_user = per_user_features[0]
    draw_matrix(euclidean_distances(first_user, first_user),
                "text_alexis", OUTPUT_PATH)
def compare_pca(self):
    """Compare the clusters generated with different values for the
    dimensions of the PCA
    """
    # One processor per candidate dimensionality; all constructed up
    # front, before any of them runs.
    processors = tuple(TextProcessor(N=dims, algorithms=["kmeans"])
                       for dims in (50, 100, 200))

    per_user_bincounts = defaultdict(list)
    for processor in processors:
        # don't use random centers for kmeans to be able to compare them
        processor._particular_user = "******"
        processor.run()
        for user, cluster in processor.clusters['kmeans'].items():
            per_user_bincounts[user].append(np.bincount(cluster.labels_))

    for user, bincounts in per_user_bincounts.items():
        compare_pies(bincounts, "compare_%s.png" % user, self.output_path)
def get_topics(self, n_topics=4):
    """Print the topics using a RandomizedPCA"""
    # NOTE(review): n_topics is the number of topic rows printed per user.
    tp = TextProcessor("docs")
    # Invert the vectorizer vocabulary: feature index -> term, so that
    # component weights can be displayed as words.
    inverse_vocabulary = dict([(y, x) for (x, y) in tp.vec.vocabulary.items()])
    for user, docs in tp.iterate():
        transformed_docs = tp.get_features(docs, user)
        print "top %s topics for %s" % (n_topics, user)
        for i in range(n_topics):
            # NOTE(review): this treats the first n_topics rows of the
            # transformed matrix as topic/component weight vectors —
            # presumably get_features returns PCA components here;
            # verify against TextProcessor.get_features.
            # Top-10 features by descending weight for this row.
            top_words = [
                inverse_vocabulary[n]
                for n in transformed_docs[i].argsort()[-10:][::-1]
            ]
            print " - " + ", ".join(top_words)
        print "---"
def run_processors(self):
    """Instantiate the text and context processors with drawing options
    enabled, then run each of them.
    """
    options = {'draw_2d': True, 'draw_pie': True}
    # Build every processor first, then run them in order.
    instances = tuple(cls(**options)
                      for cls in (TextProcessor, ContextProcessor))
    for instance in instances:
        instance.run()