def fit(self, X, unused_y=None): """Learn a vocabulary dictionary of all categories in X. Args: raw_documents: numpy matrix or iterable of lists/numpy arrays. unused_y: to match fit format signature of estimators. Returns: self """ X = setup_processor_data_feeder(X) for row in X: # Create vocabularies if not given. if self.vocabularies_ is None: # If not share, one per column, else one shared across. if not self.share: self.vocabularies_ = [ categorical_vocabulary.CategoricalVocabulary() for _ in row] else: vocab = categorical_vocabulary.CategoricalVocabulary() self.vocabularies_ = [vocab for _ in row] for idx, value in enumerate(row): # Nans are handled as unknowns. if (isinstance(value, float) and math.isnan(value)) or value == np.nan: continue self.vocabularies_[idx].add(value) if self.min_frequency > 0: for vocab in self.vocabularies_: vocab.trim(self.min_frequency) self.freeze() return self
def transform(self, X): """Transform documents to category-id matrix. Converts categories to ids give fitted vocabulary from `fit` or one provided in the constructor. Args: X: numpy matrix or iterable of lists/numpy arrays. Returns: X: iterable, [n_samples]. Category-id matrix. """ self.freeze() X = setup_processor_data_feeder(X) for row in X: output_row = [] for idx, value in enumerate(row): # Return <UNK> when it's Nan. if (isinstance(value, float) and math.isnan(value)) or value == np.nan: output_row.append(0) continue output_row.append(self.vocabularies_[idx].get(value)) yield np.array(output_row, dtype=np.int64)