def fit_before_convert(self, dataset):
    """Fit the label encoder on raw (not yet encoded) target data.

    Nothing is fitted when `dataset` is a `tf.data.Dataset` or when it has
    more than one value per sample; both cases are treated as already
    encoded.

    Args:
        dataset: The target data. A `tf.data.Dataset`, `pd.DataFrame`,
            `pd.Series`, or an array exposing `flatten()` and `shape`
            (presumably an `np.ndarray` -- confirm against callers).

    Raises:
        ValueError: If the data contains fewer than 2 distinct labels.
    """
    # If in tf.data.Dataset, must be encoded already.
    if isinstance(dataset, tf.data.Dataset):
        return

    # Convert the data to np.ndarray.
    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.values
    if isinstance(dataset, pd.Series):
        dataset = dataset.values.reshape(-1, 1)

    # Flatten once; the result is used both for the "already encoded"
    # check and for collecting the distinct labels.
    flat_labels = dataset.flatten()

    # If encoded: more than one value per sample.
    # TODO: support raw string labels for multi-label.
    if len(flat_labels) != len(dataset):
        if self.num_classes:
            # Shape validation is delegated to the helper; only done when
            # the number of classes is already known.
            self._check_data_shape(dataset.shape[1:])
        return

    # Fit encoder.
    labels = set(flat_labels)
    if len(labels) < 2:
        # Report the number of distinct labels actually found, not
        # self.num_classes, which is unrelated to this check and may be
        # None.
        raise ValueError(
            'Expect the target data for {name} to have '
            'at least 2 classes, but got {num_classes}.'.format(
                name=self.name, num_classes=len(labels)))

    # Exactly two labels in single-label mode use a label encoder;
    # everything else is one-hot encoded.
    if len(labels) == 2 and not self.multi_label:
        self.label_encoder = encoders.LabelEncoder()
    else:
        self.label_encoder = encoders.OneHotEncoder()
    self.label_encoder.fit(dataset)
def fit_before_convert(self, dataset):
    """Infer `num_classes` and fit the label encoder on the target data.

    For a `tf.data.Dataset`, only `num_classes` is filled in (when not
    already set) and no encoder is fitted. For data with more than one
    value per sample, `num_classes` is set to the second dimension and no
    encoder is fitted. Otherwise an encoder is chosen from `num_classes`
    and fitted on the data.

    Args:
        dataset: The target data. A `tf.data.Dataset`, `pd.DataFrame`,
            `pd.Series`, or an array exposing `flatten()` and `shape`
            (presumably an `np.ndarray` -- confirm against callers).

    Raises:
        ValueError: If `num_classes` ends up smaller than 2.
    """
    # A tf.data.Dataset must be encoded already; just derive num_classes.
    if isinstance(dataset, tf.data.Dataset):
        if not self.num_classes:
            width = utils.dataset_shape(dataset)[0]
            # A single column holds 0s and 1s, i.e. two classes.
            self.num_classes = 2 if width == 1 else width
        return

    # Unwrap pandas containers into their underlying arrays.
    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.values
    if isinstance(dataset, pd.Series):
        dataset = dataset.values.reshape(-1, 1)

    flat_values = dataset.flatten()

    # More than one value per sample: not raw labels.
    if len(flat_values) != len(dataset):
        self.num_classes = dataset.shape[1]
        return

    # Fall back to the number of distinct labels when not specified.
    if self.num_classes is None:
        self.num_classes = len(set(flat_values))

    class_count = self.num_classes
    if class_count == 2:
        self.label_encoder = encoders.LabelEncoder()
    elif class_count > 2:
        self.label_encoder = encoders.OneHotEncoder()
    elif class_count < 2:
        message = (
            'Expect the target data for {name} to have '
            'at least 2 classes, but got {num_classes}.'
        )
        raise ValueError(
            message.format(name=self.name, num_classes=self.num_classes))
    self.label_encoder.fit(dataset)