Python TextProcessor 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: processors

클래스/타입: TextProcessor

hotexamples.com에서의 예제들: 8

Python TextProcessor - 8개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, Python processors.TextProcessor의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 각 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

TextProcessor(6)

get_features(2)

iterate(1)

run(1)

예제 #1

파일 보기

파일: run.py 프로젝트: shyam15287/infuse

    def text_profiles_similarity(self):
        """Return pairwise Euclidean distances between per-cluster text profiles.

        A ``TextProcessor`` is run with document storage enabled and a
        ``"kmeans"`` cluster factory producing ``KMeans(5)``.  For every user
        and every distinct cluster label, the stored documents carrying that
        label are joined with single spaces into one profile document.  The
        profile documents are turned into features with
        ``processor.get_features`` and compared against themselves.

        The processor is kept on ``self._processor`` once the features have
        been computed.

        :returns: the result of ``euclidean_distances(features, features)``.
        """
        # NOTE(review): the original comment labels these as TF-IDF text
        # features; that is decided inside TextProcessor -- confirm.
        text_processor = TextProcessor(store_docs=True,
                                       clusters={'kmeans': lambda: KMeans(5)})
        text_processor.run()

        # One merged document per (user, cluster label) pair.
        merged_docs = []
        for username, model in text_processor.clusters["kmeans"].items():
            for label in np.unique(model.labels_):
                # Boolean mask marking the documents assigned to this label.
                mask = model.labels_ == label
                members = []
                for position, is_member in enumerate(mask):
                    if is_member:
                        members.append(
                            text_processor.stored_docs[username][position])
                merged_docs.append(" ".join(members))

        features = text_processor.get_features(merged_docs)
        self._processor = text_processor
        return euclidean_distances(features, features)

예제 #2

파일 보기

파일: run.py 프로젝트: almet/infuse

    def get_topics(self, n_topics=4):
        """Print the top words of the first ``n_topics`` rows for every user.

        For each ``(user, docs)`` pair yielded by ``TextProcessor.iterate()``
        the documents are transformed with ``get_features`` and, for each of
        the first ``n_topics`` rows of the result, the ten words with the
        largest values are printed in descending order.

        NOTE(review): the original docstring says the topics come from a
        RandomizedPCA; that would happen inside TextProcessor, which is not
        visible here -- confirm.

        :param n_topics: number of rows of the transformed documents to print
                         (default 4).  Assumes the transformed result has at
                         least that many rows -- TODO confirm.
        """
        tp = TextProcessor("docs")
        # Invert the vectorizer vocabulary so a feature index can be mapped
        # back to the word it stands for.
        inverse_vocabulary = {index: word
                              for (word, index) in tp.vec.vocabulary.items()}

        # Single-argument print(...) behaves exactly like the Python 2 print
        # statement and is also valid Python 3.
        for user, docs in tp.iterate():
            transformed_docs = tp.get_features(docs, user)
            print("top %s topics for %s" % (n_topics, user))
            for i in range(n_topics):
                # argsort is ascending: keep the last ten indices and reverse
                # them so the largest value comes first.
                top_words = [inverse_vocabulary[n] for n in
                             transformed_docs[i].argsort()[-10:][::-1]]
                print("  - " + ", ".join(top_words))
            print("---")

예제 #3

파일 보기

파일: run.py 프로젝트: shyam15287/infuse

    def get_topics(self, n_topics=4):
        """Print, per user, the ten strongest words of the leading rows.

        Iterates over the ``(user, docs)`` pairs produced by
        ``TextProcessor.iterate()``, transforms each user's documents with
        ``get_features`` and prints, for rows ``0 .. n_topics - 1`` of the
        transformed result, the ten words whose values are largest (largest
        first).

        NOTE(review): the original docstring attributes the topics to a
        RandomizedPCA; nothing in this method shows that -- verify against
        TextProcessor.

        :param n_topics: how many rows to print for each user (default 4).
                         Presumably the transformed result always has at least
                         this many rows -- TODO confirm.
        """
        tp = TextProcessor("docs")
        # Reverse lookup table: feature index -> vocabulary word.
        inverse_vocabulary = {
            index: word for (word, index) in tp.vec.vocabulary.items()
        }

        # print(...) with one argument produces the same output as the
        # Python 2 print statement and keeps the method importable on
        # Python 3.
        for user, docs in tp.iterate():
            transformed_docs = tp.get_features(docs, user)
            print("top %s topics for %s" % (n_topics, user))
            for i in range(n_topics):
                # Indices of the ten largest values, in descending order.
                top_words = [
                    inverse_vocabulary[n]
                    for n in transformed_docs[i].argsort()[-10:][::-1]
                ]
                print("  - " + ", ".join(top_words))
            print("---")

예제 #4

파일 보기

파일: run.py 프로젝트: shyam15287/infuse

 def __init__(self):
     """Set the output path, clear cached state and install the default
     processor factory.

     ``_processor``, ``_usernames`` and ``_rankings`` start out as ``None``.
     ``_default_processor`` is a zero-argument callable; each call builds a
     new ``TextProcessor`` with document storage enabled and a ``"kmeans"``
     cluster factory producing ``KMeans(5)``.
     """
     def _build_default_processor():
         # A fresh clusters dict and a fresh processor on every call.
         return TextProcessor(
             store_docs=True, clusters={"kmeans": lambda: KMeans(5)})

     self.output_path = OUTPUT_PATH
     self._processor = None
     self._usernames = None
     self._rankings = None
     self._default_processor = _build_default_processor

예제 #5

파일 보기

파일: run.py 프로젝트: shyam15287/infuse

    def text_users_similarity(self):
        """Draw the text-feature distance matrix of the first user.

        A default ``TextProcessor`` is created and stored on
        ``self._processor``.  Features are computed for every
        ``(user, docs)`` pair it yields, but only the features of the first
        pair are compared against themselves and passed to ``draw_matrix``
        under the name ``"text_alexis"``.

        Nothing is returned.
        """
        text_processor = TextProcessor()
        self._processor = text_processor

        per_user_features = [
            text_processor.get_features(docs, user)
            for user, docs in text_processor.iterate()
        ]

        # Only the first user's features are drawn.
        # NOTE(review): presumably the first user yielded is "alexis", as the
        # output name suggests -- confirm against TextProcessor.iterate().
        first_user = per_user_features[0]
        distances = euclidean_distances(first_user, first_user)
        draw_matrix(distances, "text_alexis", OUTPUT_PATH)

예제 #6

파일 보기

파일: run.py 프로젝트: shyam15287/infuse

    def compare_pca(self):
        """Draw, per user, pie charts comparing k-means cluster sizes for
        three ``TextProcessor`` configurations (``N`` = 50, 100 and 200).

        Each processor is run with the ``"kmeans"`` algorithm; for every user
        the number of documents per cluster label is collected, one entry per
        processor, and the collected counts are passed to ``compare_pies``
        together with the file name ``compare_<user>.png`` and
        ``self.output_path``.

        NOTE(review): the original docstring describes ``N`` as the number of
        PCA dimensions; that is not visible here -- confirm.
        """
        # All three processors are built up front, before any of them runs.
        candidates = tuple(
            TextProcessor(N=size, algorithms=["kmeans"])
            for size in (50, 100, 200)
        )

        sizes_by_user = defaultdict(list)
        for candidate in candidates:
            # NOTE(review): the original comment says this avoids random
            # k-means centers so that the runs are comparable; the effect of
            # this attribute is defined in TextProcessor -- confirm.
            candidate._particular_user = "******"

            candidate.run()
            kmeans_results = candidate.clusters['kmeans']
            for user, cluster in kmeans_results.items():
                # Number of documents assigned to each cluster label.
                label_counts = np.bincount(cluster.labels_)
                sizes_by_user[user].append(label_counts)

        for user, bincounts in sizes_by_user.items():
            filename = "compare_%s.png" % user
            compare_pies(bincounts, filename, self.output_path)

예제 #7

파일 보기

파일: run.py 프로젝트: almet/infuse

    def text_profiles_similarity(self):
        """Compute Euclidean distances between cluster-level text profiles.

        Runs a ``TextProcessor`` that stores its documents and clusters them
        with a ``"kmeans"`` factory producing ``KMeans(5)``.  Every cluster of
        every user becomes one profile: the space-joined text of the stored
        documents assigned to that cluster.  The profiles are converted to
        features via ``processor.get_features`` and compared pairwise.

        ``self._processor`` is set to the processor after the features have
        been computed.

        :returns: ``euclidean_distances(features, features)``.
        """
        # NOTE(review): described as TF-IDF text features by the original
        # comment; the feature type is decided by TextProcessor -- confirm.
        processor = TextProcessor(
            store_docs=True,
            clusters={'kmeans': lambda: KMeans(5)},
        )
        processor.run()

        # Profile documents, one per (user, cluster label).
        profiles = []
        for username, cluster in processor.clusters["kmeans"].items():
            for label in np.unique(cluster.labels_):
                # True where a document belongs to the current label.
                in_cluster = cluster.labels_ == label
                profiles.append(" ".join(
                    processor.stored_docs[username][index]
                    for index, selected in enumerate(in_cluster)
                    if selected
                ))

        features = processor.get_features(profiles)
        self._processor = processor
        return euclidean_distances(features, features)

예제 #8

파일 보기

파일: run.py 프로젝트: shyam15287/infuse

 def run_processors(self):
     """Build a text and a context processor with 2-D and pie drawing
     switched on, then run them in that order.

     Both processors are constructed before either one is run.
     """
     drawing_options = {'draw_2d': True, 'draw_pie': True}
     text_processor = TextProcessor(**drawing_options)
     context_processor = ContextProcessor(**drawing_options)

     text_processor.run()
     context_processor.run()