def _count_vocab(self, analyzed_docs):
    """Create sparse feature matrix from pre-analyzed documents.

    This variant assumes a fixed vocabulary (self.vocabulary_);
    out-of-vocabulary features are silently ignored.
    """
    vocabulary = self.vocabulary_

    j_indices = _make_int_array()
    indptr = _make_int_array()
    indptr.append(0)
    for doc in analyzed_docs:
        for feature in doc:
            try:
                j_indices.append(vocabulary[feature])
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        indptr.append(len(j_indices))

    j_indices = frombuffer_empty(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    if self.binary:
        X.data.fill(1)
    return X
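# The variants in this file lean on two small helpers from older
# scikit-learn (feature_extraction/text.py and sklearn.utils.fixes).
# A minimal sketch, closely matching the originals, so the code above
# is self-contained:

import array

import numpy as np


def _make_int_array():
    # array.array of C int, matching scipy.sparse's index dtype (np.intc).
    return array.array(str("i"))


def frombuffer_empty(buf, dtype):
    # Old NumPy versions raise on zero-length buffers; return an empty
    # ndarray instead of calling np.frombuffer on one.
    if len(buf) == 0:
        return np.empty(0, dtype=dtype)
    return np.frombuffer(buf, dtype=dtype)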
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    j_indices = _make_int_array()
    indptr = _make_int_array()
    values = _make_float_array() if self.apply_socal_mask else None
    indptr.append(0)
    for doc in raw_documents:
        # doc is a sequence of tokens, e.g. ["my", "staff", "is", "very",
        # "big"]; the SO-CAL mask yields one weight per token, e.g.
        # [1, 1, 1, 0, 2].
        if self.apply_socal_mask:
            doc_mask = self.socal.mask(doc)
        for index, feature in enumerate(doc):
            try:
                if feature in self.stopwords:
                    continue
                # j_indices for a doc: [2, 10, 9, 102, 65]
                if not fixed_vocab or feature in vocabulary:
                    j_indices.append(vocabulary[feature])
                    if self.apply_socal_mask:
                        values.append(doc_mask[index])
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = frombuffer_empty(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    # Convert the collected mask weights to an ndarray; fall back to
    # binary counts when no SO-CAL mask was collected.
    if values:
        values = np.frombuffer(values, dtype=np.float32)
    else:
        values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    return vocabulary, X
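# _make_float_array is not a scikit-learn helper; by analogy with
# _make_int_array it is presumably something like the sketch below
# (an assumption, as is the socal.mask contract of one float weight
# per token). The weighted variant further down reuses the same helper.

def _make_float_array():
    # array.array of C float (4 bytes), read back with np.float32.
    return array.array(str("f"))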
def _count_analyzed_vocab(self, analyzed_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False.

    For consistency this is a slightly edited version of
    feature_extraction.text._count_vocab with the document analysis
    factored out.
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    j_indices = []
    indptr = _make_int_array()
    values = _make_int_array()
    indptr.append(0)
    for analyzed_doc in analyzed_documents:
        feature_counter = {}
        for feature in analyzed_doc:
            try:
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # Disable defaultdict behaviour so later lookups of unseen terms
        # raise KeyError instead of silently growing the vocabulary.
        vocabulary = dict(vocabulary)

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.frombuffer(values, dtype=np.intc)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    return vocabulary, X
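# Minimal usage sketch: the input is already tokenized, so no analyzer
# runs here. `vectorizer` stands for an instance of whatever class hosts
# this method (a hypothetical name, for illustration only).

analyzed_docs = [["red", "apple", "apple"], ["green", "pear"]]
vocabulary, X = vectorizer._count_analyzed_vocab(analyzed_docs,
                                                 fixed_vocab=False)
# vocabulary == {"red": 0, "apple": 1, "green": 2, "pear": 3}
# X.toarray() == [[1, 2, 0, 0],
#                 [0, 0, 1, 1]]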
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    values = _make_float_array()
    indptr.append(0)
    for doc in raw_documents:
        feature_counter = {}
        current_num = 0
        for feature in analyze(doc):
            # A numeric token in (0, 200] acts as a weight for the
            # feature that follows it.
            maybe_float = try_float(feature)
            if maybe_float > 0 and maybe_float <= 200:
                current_num = maybe_float
                continue
            try:
                # Skip features that were not preceded by a weight.
                if current_num == 0:
                    continue
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    # Normalize the weight into (0, 1].
                    feature_counter[feature_idx] = current_num / 200
                # Each weight applies to one feature only.
                current_num = 0
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.frombuffer(values, dtype=np.float32)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=np.float32)
    X.sort_indices()
    return vocabulary, X
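# try_float is not shown in this file. For the `maybe_float > 0`
# comparison above to be safe it must return a number for every token,
# so it is presumably something like this sketch (an assumption):

def try_float(token):
    # Return the token as a float, or 0.0 if it is not numeric, so
    # callers can compare the result without a None check.
    try:
        return float(token)
    except (TypeError, ValueError):
        return 0.0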
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix and trie-backed vocabulary.

    Only fixed_vocab=False is supported.
    """
    if fixed_vocab:
        raise NotImplementedError()

    vocabulary = self.trie_cls()

    analyze = self.build_analyzer()
    j_indices = _make_int_array()
    indptr = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        for feature in analyze(doc):
            if feature not in vocabulary:
                # Tries have no defaultdict equivalent, so assign indices
                # to newly seen features explicitly.
                idx = len(vocabulary)
                vocabulary[feature] = idx
                j_indices.append(idx)
            else:
                j_indices.append(vocabulary[feature])
        indptr.append(len(j_indices))

    # some Python/Scipy versions won't accept an array.array:
    if j_indices:
        j_indices = np.frombuffer(j_indices, dtype=np.intc)
    else:
        j_indices = np.array([], dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    return vocabulary, X
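# self.trie_cls only needs to be a zero-argument callable returning a
# mapping that supports __len__, __contains__, item assignment and
# lookup. A hypothetical wiring with the third-party `datrie` package,
# whose Trie requires an alphabet up front (a sketch, not part of the
# class above):

import string
from functools import partial

import datrie  # pip install datrie

trie_cls = partial(datrie.Trie, string.printable)
vocab = trie_cls()
vocab[u"apple"] = len(vocab)  # item assignment, as in the loop above
assert u"apple" in vocab and len(vocab) == 1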
def _count_vocab(self, raw_documents, fixed_vocab, y=None):
    """Like the parent's _count_vocab, but when labels y (+1 / -1) are
    given, also build per-class count matrices X_pos and X_neg.
    """
    if y is None:
        # Without labels, fall back to plain term counting.
        return super(DeltaTfidfVectorizer, self)._count_vocab(
            raw_documents, fixed_vocab)

    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    values = _make_int_array()
    pos_values = _make_int_array()
    neg_values = _make_int_array()
    indptr.append(0)
    for i, doc in enumerate(raw_documents):
        feature_counter = defaultdict(int)
        pos_feature_counter = defaultdict(int)
        neg_feature_counter = defaultdict(int)
        for feature in analyze(doc):
            try:
                feature_idx = vocabulary[feature]
                feature_counter[feature_idx] += 1
                # Split the same count by document label.
                pos_feature_counter[feature_idx] += int(y[i] == 1)
                neg_feature_counter[feature_idx] += int(y[i] == -1)
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        pos_values.extend(pos_feature_counter.values())
        neg_values.extend(neg_feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.frombuffer(values, dtype=np.intc)
    pos_values = np.frombuffer(pos_values, dtype=np.intc)
    neg_values = np.frombuffer(neg_values, dtype=np.intc)

    # Give each matrix its own copy of the index array: sort_indices()
    # permutes indices and data in place, so sharing j_indices across
    # the three matrices would misalign the ones built later.
    X = sp.csr_matrix((values, j_indices.copy(), indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    X_pos = sp.csr_matrix((pos_values, j_indices.copy(), indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
    X_pos.sort_indices()
    X_neg = sp.csr_matrix((neg_values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
    X_neg.sort_indices()
    return vocabulary, X, X_pos, X_neg
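# The per-class matrices are what a delta TF-IDF weighting (Martineau
# and Finin, 2009) consumes. A rough sketch of one way they might be
# used downstream; the smoothing constant and function name are
# assumptions, not part of the class above:

import numpy as np


def delta_idf(X_pos, X_neg, n_pos, n_neg, smooth=1.0):
    # Per-class document frequencies: the number of positive / negative
    # documents in which each term occurs at least once.
    df_pos = np.asarray((X_pos > 0).sum(axis=0)).ravel()
    df_neg = np.asarray((X_neg > 0).sum(axis=0)).ravel()
    # Difference of the two class-conditional idf terms; multiplying
    # term counts by this gives the delta TF-IDF score.
    return (np.log2((n_pos + smooth) / (df_pos + smooth))
            - np.log2((n_neg + smooth) / (df_neg + smooth)))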