def _transform(self, corpus, source_dict=None):
    """Compute weighted bag-of-words features and add them to ``corpus``.

    The documents are read from ``corpus.ngrams_iterator`` (joined with a
    space, POS tags included), converted to bag-of-words vectors with a
    gensim dictionary, weighted by ``models.TfidfModel`` using the local and
    global weighting functions selected by ``self.wlocal`` / ``self.wglobal``,
    and optionally normalised with the function selected by ``self.norm``.

    Args:
        corpus: Corpus to vectorize; must provide ``ngrams_iterator`` and
            ``used_preprocessor``.
        source_dict: Optional existing dictionary to reuse. When it is
            falsy (``None`` or empty) a new ``corpora.Dictionary`` is built
            from the documents of ``corpus``.

    Returns:
        The value returned by ``self.add_features`` for the new features.
    """
    documents = list(corpus.ngrams_iterator(' ', include_postags=True))
    # Build a dictionary only when the caller did not supply a usable one.
    dic = corpora.Dictionary(
        documents, prune_at=None) if not source_dict else source_dict
    bow_corpus = [dic.doc2bow(doc) for doc in documents]

    model = models.TfidfModel(
        bow_corpus,
        normalize=False,
        wlocal=self.wlocals[self.wlocal],
        wglobal=self.wglobals[self.wglobal],
    )

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy (1.24+); the builtin yields the same dtype.
    X = matutils.corpus2csc(
        model[bow_corpus], dtype=float, num_terms=len(dic)).T

    # The selected normalisation may be falsy, meaning "leave X unchanged".
    norm = self.norms[self.norm]
    if norm:
        X = norm(X)

    # set compute values
    shared_cv = SharedTransform(self, corpus.used_preprocessor,
                                source_dict=dic)
    cv = [
        VectorizationComputeValue(shared_cv, dic[i])
        for i in range(len(dic))
    ]

    corpus = self.add_features(corpus, X, dic, cv,
                               var_attrs={'bow-feature': True})
    return corpus
def transform(self, corpus):
    """Extend ``corpus`` with one attribute per entry of ``self.sentiments``.

    Scores come from ``self.get_scores(corpus)`` and are reshaped into a
    2-D array with ``len(self.sentiments)`` columns.

    Args:
        corpus: Corpus to score; must provide ``used_preprocessor`` and
            ``extend_attributes``.

    Returns:
        The value returned by ``corpus.extend_attributes``.
    """
    raw_scores = self.get_scores(corpus)
    score_matrix = np.array(raw_scores).reshape((-1, len(self.sentiments)))

    # set compute values
    transform_source = SharedTransform(self, corpus.used_preprocessor)
    compute_values = []
    for sentiment_name in self.sentiments:
        compute_values.append(
            VectorizationComputeValue(transform_source, sentiment_name))

    return corpus.extend_attributes(
        score_matrix, self.sentiments, compute_values=compute_values)
def transform(self, corpus, copy=True):
    """Append polarity scores from ``self.vader`` to the corpus.

    For every document, ``self.vader.polarity_scores`` is called and the
    values under the keys listed in ``self.sentiments`` are collected as one
    row of the feature matrix.

    Args:
        corpus: Corpus whose ``documents`` are scored.
        copy: When true, the attributes are added to ``corpus.copy()``;
            otherwise ``corpus`` itself is extended in place.

    Returns:
        The corpus object that received the new attributes.
    """
    score_rows = [
        [polarity[key] for key in self.sentiments]
        for polarity in (
            self.vader.polarity_scores(text) for text in corpus.documents
        )
    ]
    score_matrix = np.array(score_rows).reshape((-1, len(self.sentiments)))

    # set compute values
    transform_source = SharedTransform(self)
    compute_values = [
        VectorizationComputeValue(transform_source, key)
        for key in self.sentiments
    ]

    target = corpus.copy() if copy else corpus
    target.extend_attributes(
        score_matrix, self.sentiments, compute_values=compute_values)
    return target
def transform(self, corpus, copy=True):
    """Append a positive-minus-negative word score to the corpus.

    Documents are tokenized with ``WordPunctTokenizer``. For each document
    the score is ``100 * (positive - negative) / max(token_count, 1)``,
    where the counts are the number of tokens found in ``self.positive``
    and ``self.negative`` respectively.

    Args:
        corpus: Corpus whose ``documents`` are scored.
        copy: When true, the attributes are added to ``corpus.copy()``;
            otherwise ``corpus`` itself is extended in place.

    Returns:
        The corpus object that received the new attributes.
    """
    tokenized_docs = WordPunctTokenizer()(corpus.documents)

    score_rows = []
    for words in tokenized_docs:
        n_positive = sum(1 for word in words if word in self.positive)
        n_negative = sum(1 for word in words if word in self.negative)
        # Fall back to 1 for empty documents to avoid dividing by zero.
        n_words = len(words) or 1
        score_rows.append([100 * (n_positive - n_negative) / n_words])
    score_matrix = np.array(score_rows).reshape((-1, len(self.sentiments)))

    # set compute values
    transform_source = SharedTransform(self)
    compute_values = [
        VectorizationComputeValue(transform_source, name)
        for name in self.sentiments
    ]

    target = corpus.copy() if copy else corpus
    target.extend_attributes(
        score_matrix, self.sentiments, compute_values=compute_values)
    return target