def text_profiles_similarity(self):
    """Compute and return similarity scores between profiles, based on
    text features and KMeans clustering.
    """
    # Text (TF-IDF)
    processor = TextProcessor(store_docs=True,
                              clusters={'kmeans': lambda: KMeans(5)})
    processor.run()

    # list of concatenated documents, one per (user, cluster label) pair,
    # used below to compare the profiles
    docs = []
    for username, cluster in processor.clusters["kmeans"].items():
        # for each cluster, build up a new dataset that we then use to
        # compare the profiles
        for label in np.unique(cluster.labels_):
            # keep only the documents with this label
            docs.append(" ".join([
                processor.stored_docs[username][i]
                for i, val in enumerate(cluster.labels_ == label) if val
            ]))
    features = processor.get_features(docs)
    self._processor = processor
    return euclidean_distances(features, features)
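# The TextProcessor class used above is not shown in this section, so the
# following is a rough, standalone sketch of the same pipeline under that
# caveat: vectorize documents with TF-IDF, cluster them with KMeans, join
# each cluster's documents into one "profile" document, and measure pairwise
# euclidean distances between the re-vectorized profiles. The sample
# documents and the choice of 2 clusters are made up for illustration.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances

sample_docs = [
    "python packaging and virtualenv tips",
    "unit testing python code",
    "gardening in a small backyard",
    "growing tomatoes and herbs in pots",
]

vec = TfidfVectorizer()
features = vec.fit_transform(sample_docs)

km = KMeans(n_clusters=2, n_init=10).fit(features)

# one concatenated document per cluster label
merged = [
    " ".join(doc for doc, label in zip(sample_docs, km.labels_) if label == l)
    for l in np.unique(km.labels_)
]

# distance matrix between the merged cluster documents; smaller values
# mean the clusters talk about more similar things
merged_features = vec.fit_transform(merged)
print(euclidean_distances(merged_features, merged_features))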
def get_topics(self, n_topics=4):
    """Print the topics using a RandomizedPCA"""
    tp = TextProcessor("docs")
    # vocabulary_ maps term -> column index; invert it to map index -> term
    inverse_vocabulary = dict((y, x) for (x, y) in tp.vec.vocabulary_.items())
    for user, docs in tp.iterate():
        transformed_docs = tp.get_features(docs, user)
        print "top %s topics for %s" % (n_topics, user)
        for i in range(n_topics):
            top_words = [inverse_vocabulary[n]
                         for n in transformed_docs[i].argsort()[-10:][::-1]]
            print " - " + ", ".join(top_words)
        print "---"
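# RandomizedPCA no longer exists in current scikit-learn, so the following is
# a standalone sketch of the same idea with a substitute: fit TruncatedSVD
# (the usual choice on sparse TF-IDF matrices) and read each component's
# largest weights as the "topic" words, which is what get_topics does with
# transformed_docs[i].argsort()[-10:][::-1]. The sample documents are made up.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

sample_docs = [
    "python code review and refactoring",
    "machine learning with python and numpy",
    "neural networks for text classification",
    "code quality, tests and refactoring",
]

vec = TfidfVectorizer()
tfidf = vec.fit_transform(sample_docs)

svd = TruncatedSVD(n_components=2).fit(tfidf)

terms = vec.get_feature_names_out()
for i, component in enumerate(svd.components_):
    # the largest weights in each component are its most representative words
    top_words = [terms[n] for n in component.argsort()[-5:][::-1]]
    print("topic %d: %s" % (i, ", ".join(top_words)))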