Example #1
    def _transform(self, corpus, source_dict=None):
        # Build (or reuse) a gensim dictionary over the corpus n-grams and
        # weight the resulting bag-of-words representation with TF-IDF.
        temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
        dic = corpora.Dictionary(
            temp_corpus, prune_at=None) if not source_dict else source_dict
        temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
        model = models.TfidfModel(temp_corpus,
                                  normalize=False,
                                  wlocal=self.wlocals[self.wlocal],
                                  wglobal=self.wglobals[self.wglobal])

        # Convert the weighted corpus into a sparse documents-by-terms matrix.
        X = matutils.corpus2csc(model[temp_corpus],
                                dtype=float,
                                num_terms=len(dic)).T
        norm = self.norms[self.norm]
        if norm:
            X = norm(X)

        # set compute values
        shared_cv = SharedTransform(self,
                                    corpus.used_preprocessor,
                                    source_dict=dic)
        cv = [
            VectorizationComputeValue(shared_cv, dic[i])
            for i in range(len(dic))
        ]

        corpus = self.add_features(corpus,
                                   X,
                                   dic,
                                   cv,
                                   var_attrs={'bow-feature': True})
        return corpus
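
The core of _transform is a standard gensim bag-of-words/TF-IDF pipeline: build a dictionary, convert documents to sparse term counts, weight them, and export a sparse documents-by-terms matrix. The standalone sketch below reproduces that core under simplifying assumptions: plain tokenized documents instead of an Orange Corpus, gensim's default TF-IDF weighting instead of the wlocal/wglobal lookups, and a toy document list that is illustrative only.

from gensim import corpora, models, matutils

# Toy tokenized documents (illustrative only).
docs = [["human", "machine", "interface"],
        ["survey", "of", "user", "machine", "interface"],
        ["graph", "of", "trees"]]

dic = corpora.Dictionary(docs, prune_at=None)      # term -> id mapping
bow = [dic.doc2bow(doc) for doc in docs]           # sparse term counts per document
model = models.TfidfModel(bow, normalize=False)    # default TF-IDF weighting
X = matutils.corpus2csc(model[bow], dtype=float,
                        num_terms=len(dic)).T      # documents x terms sparse matrix

print(X.shape)  # (3, number of distinct terms)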
Example #2
    def transform(self, corpus):
        # Collect per-document scores and arrange them as an
        # (n_documents, n_sentiment_columns) matrix.
        scores = self.get_scores(corpus)
        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # set compute values
        shared_cv = SharedTransform(self, corpus.used_preprocessor)
        cv = [
            VectorizationComputeValue(shared_cv, col)
            for col in self.sentiments
        ]

        corpus = corpus.extend_attributes(X,
                                          self.sentiments,
                                          compute_values=cv)
        return corpus
Example #3
    def transform(self, corpus, copy=True):
        # Score every document with the VADER analyser and keep only the
        # requested sentiment dimensions.
        scores = []
        for text in corpus.documents:
            pol_sc = self.vader.polarity_scores(text)
            scores.append([pol_sc[x] for x in self.sentiments])
        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # set compute values
        shared_cv = SharedTransform(self)
        cv = [VectorizationComputeValue(shared_cv, col)
              for col in self.sentiments]

        if copy:
            corpus = corpus.copy()
        corpus.extend_attributes(X, self.sentiments, compute_values=cv)
        return corpus
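
Example #3 assumes an already-constructed VADER analyser on self.vader. A minimal standalone sketch of the same scoring step, assuming NLTK's bundled VADER implementation; the document list and column selection are illustrative only.

import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)   # one-off lexicon download
vader = SentimentIntensityAnalyzer()

documents = ["I love this tool.", "This is terrible."]
sentiments = ["pos", "neg", "neu", "compound"]   # keys returned by polarity_scores

scores = [[vader.polarity_scores(text)[key] for key in sentiments]
          for text in documents]
X = np.array(scores).reshape((-1, len(sentiments)))
print(X)   # one row per document, one column per sentiment dimension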
Example #4
    def transform(self, corpus, copy=True):
        scores = []
        tokenizer = WordPunctTokenizer()
        tokens = tokenizer(corpus.documents)

        # Liu Hu score: percentage of positive minus negative lexicon hits
        # per document, guarded against empty token lists.
        for doc in tokens:
            pos_words = sum(word in self.positive for word in doc)
            neg_words = sum(word in self.negative for word in doc)
            scores.append([100 * (pos_words - neg_words) / max(len(doc), 1)])
        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # set compute values
        shared_cv = SharedTransform(self)
        cv = [VectorizationComputeValue(shared_cv, col)
              for col in self.sentiments]

        if copy:
            corpus = corpus.copy()
        corpus.extend_attributes(X, self.sentiments, compute_values=cv)
        return corpus
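
Example #4 scores documents against positive and negative word lists stored on the instance. The sketch below reproduces the same formula in isolation, using NLTK's WordPunctTokenizer directly and two small illustrative word sets in place of self.positive and self.negative.

import numpy as np
from nltk.tokenize import WordPunctTokenizer

positive = {"good", "great", "love"}   # illustrative stand-in for self.positive
negative = {"bad", "awful", "hate"}    # illustrative stand-in for self.negative

documents = ["I love this great tool!", "What an awful, bad experience."]
tokenizer = WordPunctTokenizer()

scores = []
for text in documents:
    doc = [token.lower() for token in tokenizer.tokenize(text)]
    pos_words = sum(word in positive for word in doc)
    neg_words = sum(word in negative for word in doc)
    # Percentage of net positive hits, guarded against empty documents.
    scores.append([100 * (pos_words - neg_words) / max(len(doc), 1)])

X = np.array(scores).reshape((-1, 1))
print(X)   # one score per document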