예제 #1
0
파일: run.py 프로젝트: shyam15287/infuse
    def text_profiles_similarity(self):
        """Return pairwise Euclidean distances between per-cluster profiles.

        A ``TextProcessor`` is run with a ``KMeans(5)`` clusterer registered
        under the ``"kmeans"`` key.  For every (username, cluster) pair and
        every distinct cluster label, the stored documents carrying that
        label are merged into one space-separated string.  The merged strings
        are turned into features with ``processor.get_features`` and compared
        against themselves with ``euclidean_distances``.

        Side effect: the processor is kept on ``self._processor`` once the
        features have been computed.
        """
        processor = TextProcessor(
            store_docs=True,
            clusters={'kmeans': lambda: KMeans(5)},
        )
        processor.run()

        # One merged pseudo-document per (user, cluster label) pair.
        merged_docs = []
        for username, cluster in processor.clusters["kmeans"].items():
            for label in np.unique(cluster.labels_):
                # Element-wise comparison: True where a document has `label`.
                mask = cluster.labels_ == label
                selected = []
                for index, is_member in enumerate(mask):
                    if is_member:
                        selected.append(processor.stored_docs[username][index])
                merged_docs.append(" ".join(selected))

        features = processor.get_features(merged_docs)
        self._processor = processor
        return euclidean_distances(features, features)
예제 #2
0
파일: run.py 프로젝트: almet/infuse
    def get_topics(self, n_topics=4):
        """Print, for each user, the strongest terms of the first feature rows.

        For every ``(user, docs)`` pair yielded by ``TextProcessor("docs")``,
        the first ``n_topics`` rows of ``get_features(docs, user)`` are
        inspected and the terms at the 10 largest entries of each row are
        printed, largest first.

        NOTE(review): the previous docstring said the topics come from a
        RandomizedPCA; that happens inside ``TextProcessor`` and is not
        visible here -- confirm.
        """
        tp = TextProcessor("docs")
        # Invert the vectorizer vocabulary so column indices map back to terms.
        index_to_term = {
            index: term for term, index in tp.vec.vocabulary.items()
        }

        for user, docs in tp.iterate():
            rows = tp.get_features(docs, user)
            print("top %s topics for %s" % (n_topics, user))
            for topic in range(n_topics):
                # argsort is ascending: keep the last 10, then reverse them.
                strongest = rows[topic].argsort()[-10:][::-1]
                print("  - " + ", ".join(index_to_term[n] for n in strongest))
            print("---")
예제 #3
0
파일: run.py 프로젝트: shyam15287/infuse
    def get_topics(self, n_topics=4):
        """Print the top terms of the first ``n_topics`` feature rows per user.

        A ``TextProcessor("docs")`` supplies ``(user, docs)`` pairs; each
        user's documents are converted with ``get_features`` and, for each of
        the first ``n_topics`` rows, the 10 highest-valued columns are mapped
        back to vocabulary terms and printed in descending order.

        NOTE(review): the original docstring attributes the topics to a
        RandomizedPCA; this method does not show that step -- verify against
        ``TextProcessor``.
        """
        tp = TextProcessor("docs")
        # Reverse lookup: vocabulary value (column index) -> vocabulary key.
        words_by_index = dict(
            (position, word) for word, position in tp.vec.vocabulary.items()
        )

        for user, docs in tp.iterate():
            user_features = tp.get_features(docs, user)
            header = "top %s topics for %s" % (n_topics, user)
            print(header)
            for row_number in range(n_topics):
                row = user_features[row_number]
                # Last 10 of the ascending argsort, flipped to descending.
                best_columns = row.argsort()[-10:][::-1]
                labels = [words_by_index[column] for column in best_columns]
                print("  - " + ", ".join(labels))
            print("---")
예제 #4
0
파일: run.py 프로젝트: shyam15287/infuse
 def __init__(self):
     """Set the output path and reset the lazily-filled attributes.

     ``_processor``, ``_usernames`` and ``_rankings`` start as ``None``.
     ``_default_processor`` is a zero-argument factory that builds a fresh
     ``TextProcessor`` storing its documents and using ``KMeans(5)`` under
     the ``"kmeans"`` key.
     """
     self.output_path = OUTPUT_PATH
     self._processor = self._usernames = self._rankings = None

     def build_default_processor():
         # The clusters mapping is rebuilt on every call, so no state is
         # shared between the processors this factory returns.
         return TextProcessor(
             store_docs=True,
             clusters={"kmeans": lambda: KMeans(5)},
         )

     self._default_processor = build_default_processor
예제 #5
0
파일: run.py 프로젝트: shyam15287/infuse
    def text_users_similarity(self):
        """Draw the text-feature distance matrix for the first user.

        A fresh ``TextProcessor`` is created and stored on
        ``self._processor``.  Features are computed for every ``(user, docs)``
        pair it yields, but only the first user's features are compared
        against themselves and passed to ``draw_matrix`` under the name
        ``"text_alexis"``.

        NOTE(review): the plot is written to the module-level ``OUTPUT_PATH``,
        not to ``self.output_path`` -- confirm this is intended.
        """
        self._processor = TextProcessor()
        processor = self._processor

        per_user_features = [
            processor.get_features(docs, user)
            for user, docs in processor.iterate()
        ]

        # Only the first yielded user is plotted (labelled "alexis").
        first = per_user_features[0]
        distances = euclidean_distances(first, first)
        draw_matrix(distances, "text_alexis", OUTPUT_PATH)
예제 #6
0
파일: run.py 프로젝트: shyam15287/infuse
    def compare_pca(self):
        """Plot, per user, how KMeans cluster sizes vary with ``N``.

        Three ``TextProcessor`` instances are built with ``N`` set to 50, 100
        and 200 and ``algorithms=["kmeans"]``.  Each one is run, and the
        label counts (``np.bincount``) of every user's cluster are collected
        in run order.  ``compare_pies`` is then called once per user and
        writes ``compare_<user>.png`` under ``self.output_path``.

        NOTE(review): ``N`` is presumably the number of PCA dimensions, as
        the previous docstring stated -- confirm in ``TextProcessor``.
        """
        # All processors are constructed up front, before any of them runs.
        processors = tuple(
            TextProcessor(N=dimensions, algorithms=["kmeans"])
            for dimensions in (50, 100, 200)
        )

        bincounts_by_user = {}
        for processor in processors:
            # Original note: avoid random KMeans centres so that the runs
            # can be compared with each other.
            processor._particular_user = "******"
            processor.run()

            for user, cluster in processor.clusters['kmeans'].items():
                collected = bincounts_by_user.setdefault(user, [])
                collected.append(np.bincount(cluster.labels_))

        for user, bincounts in bincounts_by_user.items():
            filename = "compare_%s.png" % user
            compare_pies(bincounts, filename, self.output_path)
예제 #7
0
파일: run.py 프로젝트: almet/infuse
    def text_profiles_similarity(self):
        """Compute Euclidean distances between cluster-level text profiles.

        Runs a ``TextProcessor`` whose ``"kmeans"`` clusterer is
        ``KMeans(5)``.  Each distinct label of each user's cluster yields one
        profile: the user's stored documents with that label, joined by
        spaces.  All profiles are converted with ``processor.get_features``
        and the resulting feature matrix is compared against itself.

        The processor is saved on ``self._processor`` before returning.
        """
        kmeans_factory = lambda: KMeans(5)
        processor = TextProcessor(store_docs=True,
                                  clusters={'kmeans': kmeans_factory})
        processor.run()

        profiles = []
        for username, cluster in processor.clusters["kmeans"].items():
            for label in np.unique(cluster.labels_):
                # Boolean vector flagging the documents assigned to `label`.
                has_label = cluster.labels_ == label
                profile = " ".join(
                    processor.stored_docs[username][position]
                    for position, flagged in enumerate(has_label)
                    if flagged
                )
                profiles.append(profile)

        features = processor.get_features(profiles)
        self._processor = processor
        return euclidean_distances(features, features)
예제 #8
0
파일: run.py 프로젝트: shyam15287/infuse
 def run_processors(self):
     """Run a ``TextProcessor`` and then a ``ContextProcessor``.

     Both are constructed with ``draw_2d=True`` and ``draw_pie=True``, and
     both exist before the first one is run.
     """
     options = dict(draw_2d=True, draw_pie=True)
     for processor in (TextProcessor(**options), ContextProcessor(**options)):
         processor.run()