def _count_vocab(self, analyzed_docs):
    """Create sparse feature matrix using the fixed vocabulary.

    This variant always uses ``self.vocabulary_``; it takes pre-analyzed
    documents (token sequences) and returns only the matrix.
    """
    vocabulary = self.vocabulary_

    j_indices = _make_int_array()
    indptr = _make_int_array()
    indptr.append(0)
    for doc in analyzed_docs:
        for feature in doc:
            try:
                j_indices.append(vocabulary[feature])
            except KeyError:
                # Ignore out-of-vocabulary items
                continue
        indptr.append(len(j_indices))

    j_indices = frombuffer_empty(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    if self.binary:
        X.data.fill(1)
    return X
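The snippets here lean on a few helpers that are not shown. A minimal sketch of what they are assumed to look like, modeled on the private helpers in older scikit-learn versions (`_make_int_array` in `sklearn.feature_extraction.text`, `frombuffer_empty` in `sklearn.utils.fixes`); `_make_float_array` is the float analogue used by the socal-mask variant further down:

```python
from array import array

import numpy as np


def _make_int_array():
    # array.array of typecode "i": a cheap appendable buffer for CSR indices
    return array(str("i"))


def _make_float_array():
    # float analogue, for when cell values are weights rather than counts
    return array(str("f"))


def frombuffer_empty(buf, dtype):
    # np.frombuffer rejects empty buffers on old NumPy; guard for that case
    if len(buf) == 0:
        return np.empty(0, dtype=dtype)
    return np.frombuffer(buf, dtype=dtype)
```

All variants also assume `import numpy as np`, `import scipy.sparse as sp`, and `from collections import defaultdict` at module level.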
def _extract_features(self, raw_documents):
    j_indices = []
    indptr = _make_int_array()
    values = _make_int_array()
    indptr.append(0)

    vocabulary = defaultdict(int)
    for doc in raw_documents:
        feature_counter = {}
        # Alternative extractor:
        # for feature, feature_idx in contains_keywords(doc).items():
        for feature, feature_idx in self.body_words_in_headline(doc).items():
            # Track how many documents each feature appears in
            vocabulary[feature] += 1
            if feature_idx not in feature_counter:
                feature_counter[feature_idx] = 1
            else:
                feature_counter[feature_idx] += 1

        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = frombuffer_empty(values, dtype=np.intc)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    return vocabulary, X
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    j_indices = _make_int_array()
    indptr = _make_int_array()
    values = _make_float_array() if self.apply_socal_mask else None
    indptr.append(0)
    for doc in raw_documents:
        # doc is a token sequence, e.g. ["my", "staff", "is", "very", "big"]
        if self.apply_socal_mask is True:
            # doc_mask: one weight per token, e.g. [1, 1, 1, 0, 2]
            doc_mask = self.socal.mask(doc)
        for index, feature in enumerate(doc):
            if feature in self.stopwords:
                continue
            try:
                # j_indices for a doc: e.g. [2, 10, 9, 102, 65]
                j_indices.append(vocabulary[feature])
                if self.apply_socal_mask:
                    values.append(doc_mask[index])
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = frombuffer_empty(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = values if values else np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    return vocabulary, X
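The `defaultdict` with `default_factory = vocabulary.__len__` is the trick that assigns each new term the next free column index on first lookup: the factory runs before the key is inserted, so it returns the current size. A standalone demonstration:

```python
from collections import defaultdict

vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__

for token in ["to", "be", "or", "not", "to", "be"]:
    _ = vocabulary[token]  # first lookup inserts token -> len(vocabulary)

print(dict(vocabulary))  # {'to': 0, 'be': 1, 'or': 2, 'not': 3}
```

Repeated tokens hit the existing entry, which is why the same loop can both build the vocabulary and emit column indices.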
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    values = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        feature_counter = {}
        for feature in analyze(doc):
            try:
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue

        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = frombuffer_empty(values, dtype=np.intc)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    return vocabulary, X
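To see how the `(values, j_indices, indptr)` triplet maps to a document-term matrix, here is a tiny worked example with two hypothetical documents; row `i` of the CSR matrix owns the nonzero entries in positions `indptr[i]:indptr[i+1]`:

```python
import numpy as np
import scipy.sparse as sp

# doc 0: "a b a"  -> counts {a: 2, b: 1}
# doc 1: "b c"    -> counts {b: 1, c: 1}
# vocabulary: {"a": 0, "b": 1, "c": 2}
values = np.array([2, 1, 1, 1])
j_indices = np.array([0, 1, 1, 2])  # column of each nonzero entry
indptr = np.array([0, 2, 4])        # row i owns entries indptr[i]:indptr[i+1]

X = sp.csr_matrix((values, j_indices, indptr), shape=(2, 3))
print(X.toarray())
# [[2 1 0]
#  [0 1 1]]
```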
def partial_transform(self, X, fitting=None):
    self.add_default()
    transforming = True

    # Process everything as sparse regardless of setting
    X = [X] if isinstance(X, Mapping) else X

    indices = array("i")
    indptr = array("i", [0])
    # XXX we could change values to an array.array as well, but it
    # would require (heuristic) conversion of dtype to typecode...
    values = []

    for x in X:
        for f, v in x.items():
            self.add_element(f, v, fitting, transforming, indices, values)
        indptr.append(len(indices))

    if len(indptr) == 1:
        raise ValueError("Sample sequence X is empty.")

    indices = frombuffer_empty(indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    shape = (len(indptr) - 1, len(self.vocabulary_))

    result_matrix = sp.csr_matrix((values, indices, indptr),
                                  shape=shape, dtype=self.dtype)

    # Sort everything if asked
    if fitting and self.sort:
        self.feature_names_.sort()
        map_index = np.empty(len(self.feature_names_), dtype=np.int32)
        for new_val, f in enumerate(self.feature_names_):
            map_index[new_val] = self.vocabulary_[f]
            self.vocabulary_[f] = new_val
        result_matrix = result_matrix[:, map_index]

    if self.sparse:
        result_matrix.sort_indices()
    else:
        result_matrix = result_matrix.toarray()

    return result_matrix
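The sort-at-fit step above reorders columns via fancy indexing on the CSR matrix: `map_index[new_col] = old_col`, then `result_matrix[:, map_index]` rebuilds the matrix with columns in the new order. A small sketch of that reindexing in isolation:

```python
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1, 2, 3],
                            [4, 5, 6]]))

# map_index[new_col] = old_col: new column 0 was old column 2, etc.
map_index = np.array([2, 0, 1])
X_reordered = X[:, map_index]
print(X_reordered.toarray())
# [[3 1 2]
#  [6 4 5]]
```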
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        if hasattr(self, 'vocabulary_') and self.vocabulary_:
            # Warm-start from the existing vocabulary. Note default_factory
            # stays None here, so features not already in the vocabulary
            # raise KeyError below and are skipped rather than added.
            vocabulary = defaultdict(None, self.vocabulary_)
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = _make_int_array()
    indptr = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        for feature in analyze(doc):
            try:
                j_indices.append(vocabulary[feature])
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = frombuffer_empty(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    return vocabulary, X
def _transform(self, X):
    # Sanity check: Python's array has no way of explicitly requesting the
    # signed 32-bit integers that scipy.sparse needs, so we use the next
    # best thing: typecode "i" (int). However, if that gives larger or
    # smaller integers than 32-bit ones, np.frombuffer screws up.
    assert array("i").itemsize == 4, (
        "sizeof(int) != 4 on your platform; please report this at"
        " https://github.com/scikit-learn/scikit-learn/issues and"
        " include the output from platform.platform() in your bug report")

    dtype = self.dtype
    vocab = self.vocabulary_

    # Process everything as sparse regardless of setting
    indices = array("i")
    indptr = array("i", [0])
    # XXX we could change values to an array.array as well, but it
    # would require (heuristic) conversion of dtype to typecode...
    values = []

    if isinstance(X, dict):
        for f, val in X.items():
            if isinstance(val, six.string_types):
                f = f + self.separator + val
                val = 1
            if f in vocab and str(val) not in bad_vals_as_strings:
                # Get the index position from vocab, append it to indices
                indices.append(vocab[f])
                # Convert val to the correct dtype, append to our values
                values.append(dtype(val))

        indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError(
                'The dictionary passed into DataFrameVectorizer is empty')

    else:
        # collect all the possible feature names and build sparse matrix
        # at the same time
        for row_idx, row in X.iterrows():
            for col_idx, val in enumerate(row):
                f = X.columns[col_idx]
                if isinstance(val, six.string_types):
                    f = f + self.separator + val
                    val = 1
                # Only include this in our output if it was part of our
                # training data. Silently ignore it otherwise.
                if f in vocab and str(val) not in bad_vals_as_strings:
                    indices.append(vocab[f])
                    values.append(dtype(val))
            indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError(
                'The DataFrame passed into DataFrameVectorizer is empty')

    indices = frombuffer_empty(indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    shape = (len(indptr) - 1, len(vocab))

    result_matrix = sp.csr_matrix((values, indices, indptr),
                                  shape=shape, dtype=dtype)

    if self.sparse:
        result_matrix.sort_indices()
    else:
        result_matrix = result_matrix.toarray()

    return result_matrix
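Both `_transform` variants filter against a module-level `bad_vals_as_strings` set that is not shown here; it holds the string forms of missing or invalid values that should never be written into the sparse matrix. A plausible definition, stated as an assumption about the original module rather than its exact contents:

```python
# Assumed definition: string representations of missing/invalid values
# that the vectorizer silently drops instead of encoding.
bad_vals_as_strings = {
    'nan', 'NaN', 'NAN', 'None', 'none', 'NULL', 'null', '',
    'inf', '-inf',
}
```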
def _transform(self, X):
    # Sanity check: Python's array has no way of explicitly requesting the
    # signed 32-bit integers that scipy.sparse needs, so we use the next
    # best thing: typecode "i" (int). However, if that gives larger or
    # smaller integers than 32-bit ones, np.frombuffer screws up.
    assert array("i").itemsize == 4, (
        "sizeof(int) != 4 on your platform; please report this at"
        " https://github.com/scikit-learn/scikit-learn/issues and"
        " include the output from platform.platform() in your bug report")

    dtype = self.dtype
    vocab = self.vocabulary_

    # Process everything as sparse regardless of setting
    indices = array("i")
    indptr = array("i", [0])
    # XXX we could change values to an array.array as well, but it
    # would require (heuristic) conversion of dtype to typecode...
    values = []

    if isinstance(X, dict):
        for f, val in X.items():
            if self.column_descriptions.get(f, False) == 'categorical':
                if not self.get('keep_cat_features', False):
                    # One-hot encode: the feature name carries the value
                    f = f + self.separator + str(val)
                    val = 1
                else:
                    if str(val) in bad_vals_as_strings:
                        val = '_None'
                    val = self.get('label_encoders')[f].transform([val])

            if f in vocab and str(val) not in bad_vals_as_strings:
                indices.append(vocab[f])
                # Convert val to the correct dtype, append to our values
                values.append(dtype(val))

        indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError(
                'The dictionary passed into DataFrameVectorizer is empty')

    else:
        # collect all the possible feature names and build sparse matrix
        # at the same time
        X_columns = X.columns
        separator = self.separator
        indices_append = indices.append
        values_append = values.append
        # True when categoricals should be one-hot encoded rather than
        # label-encoded (note: this is the inverse of 'keep_cat_features')
        one_hot_cat_features = not self.get('keep_cat_features', False)
        is_categorical = [
            self.column_descriptions.get(f, False) == 'categorical'
            for f in X_columns]

        for row in X.itertuples():
            for col_idx, val in enumerate(row[1:]):
                f = X_columns[col_idx]
                if is_categorical[col_idx]:
                    if one_hot_cat_features:
                        f = f + separator + str(val)
                        val = 1
                    else:
                        if str(val) in bad_vals_as_strings:
                            val = '_None'
                        val = self.get('label_encoders')[f].transform([val])
                # Only include this in our output if it was part of our
                # training data. Silently ignore it otherwise.
                if f in vocab and str(val) not in bad_vals_as_strings:
                    # Get the index position from vocab, append it
                    indices_append(vocab[f])
                    # Convert val to the correct dtype, append it
                    values_append(dtype(val))
            indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError(
                'The DataFrame passed into DataFrameVectorizer is empty')

    indices = frombuffer_empty(indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    shape = (len(indptr) - 1, len(vocab))

    result_matrix = sp.csr_matrix((values, indices, indptr),
                                  shape=shape, dtype=dtype)

    if self.sparse:
        result_matrix.sort_indices()
    else:
        result_matrix = result_matrix.toarray()

    return result_matrix
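This second variant calls `self.get('keep_cat_features', False)` and `self.get('label_encoders')`, which implies the vectorizer exposes a dict-style `get` over its own configuration. A minimal sketch of such an accessor (an assumption; the real class may store these settings differently):

```python
def get(self, prop_name, default=None):
    # Dict-style access to instance attributes, mirroring how the
    # _transform variant above looks up configuration flags.
    return getattr(self, prop_name, default)
```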