def test_get_column_order(self):
  """Checks that column ordering derived from normalization info is x001..x003."""
  sample_size = 10000
  frame = sample_utils.get_pos_sample_synthetic(
      mean=[0, 1, 2], cov=np.eye(3), n_points=sample_size)
  frame = frame.drop(columns=['class_label'])
  # Build normalization info from the frame, then derive the column order.
  info = sample_utils.get_normalization_info(frame)
  ordering = sample_utils.get_column_order(info)
  assert ordering == ['x001', 'x002', 'x003']
def train_model(self, x_train: pd.DataFrame) -> None:
  """Train a new model and report the loss and accuracy.

  Normalizes the positive sample, augments it with negative samples, then
  fits the NS-NN model — on a TPU when `self._tpu_worker` is set, otherwise
  locally.

  Args:
    x_train: dataframe with dimensions as columns.
  """
  self._normalization_info = sample_utils.get_normalization_info(x_train)
  column_order = sample_utils.get_column_order(self._normalization_info)
  normalized_x_train = sample_utils.normalize(x_train,
                                              self._normalization_info)
  normalized_training_sample = sample_utils.apply_negative_sample(
      positive_sample=normalized_x_train,
      sample_ratio=self._sample_ratio,
      sample_delta=self._sample_delta)
  x = np.float32(np.matrix(normalized_training_sample[column_order]))
  y = np.float32(np.array(normalized_training_sample['class_label']))
  # Create dataset objects from the arrays.
  dx = tf.data.Dataset.from_tensor_slices(x)
  dy = tf.data.Dataset.from_tensor_slices(y)
  logging.info('Training ns-nn with:')
  logging.info(normalized_training_sample['class_label'].value_counts())
  # Zip the two datasets together, shuffle, repeat and batch for training.
  train_dataset = tf.data.Dataset.zip(
      (dx, dy)).shuffle(_SHUFFLE_BUFFERSIZE).repeat().batch(self._batch_size)

  def build_model():
    # Single construction path so the TPU and local branches cannot drift.
    return self._get_model(x.shape[1], self._dropout, self._layer_width,
                           self._n_hidden_layers)

  if self._tpu_worker:
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        self._tpu_worker)
    tf.contrib.distribute.initialize_tpu_system(resolver)
    strategy = tf.contrib.distribute.TPUStrategy(resolver)
    # Model variables must be created inside the strategy scope.
    with strategy.scope():
      self._model = build_model()
  else:
    self._model = build_model()
  self._model.fit(
      x=train_dataset,
      steps_per_epoch=self._steps_per_epoch,
      verbose=0,
      epochs=self._epochs,
      callbacks=[
          tf.keras.callbacks.TensorBoard(
              log_dir=self._log_dir,
              histogram_freq=1,
              write_graph=False,
              write_images=False)
      ])
def train_model(self, x_train: pd.DataFrame) -> None:
  """Trains a OC-SVM Anomaly detector using the positive sample.

  Args:
    x_train: training sample, with numeric feature columns.
  """
  self._normalization_info = sample_utils.get_normalization_info(x_train)
  ordered_columns = sample_utils.get_column_order(self._normalization_info)
  # Select features in canonical order, then normalize before fitting.
  scaled_train = sample_utils.normalize(x_train[ordered_columns],
                                        self._normalization_info)
  super(OneClassSVMAd, self).fit(X=scaled_train)
def predict(self, sample_df: pd.DataFrame) -> pd.DataFrame:
  """Given new data, predict the probability of being positive class.

  Args:
    sample_df: dataframe with features as columns, same as train().

  Returns:
    The same dataframe with a new column 'class_prob', the probability of
    the Normal class. Note the input dataframe is modified in place.
  """
  normalized = sample_utils.normalize(sample_df, self._normalization_info)
  ordered_columns = sample_utils.get_column_order(self._normalization_info)
  feature_matrix = np.float32(np.matrix(normalized[ordered_columns]))
  predictions = self._model.predict(feature_matrix, verbose=1, steps=1)
  sample_df['class_prob'] = predictions
  return sample_df
def predict(self, sample_df: pd.DataFrame) -> pd.DataFrame:
  """Performs anomaly detection on a new sample.

  Args:
    sample_df: dataframe with the new datapoints.

  Returns:
    original dataframe with a new column labeled 'class_prob' as 1.0 for
    normal to 0.0 for anomalous. Note: sample_df is modified in place.
  """
  sample_df_normalized = sample_utils.normalize(sample_df,
                                                self._normalization_info)
  column_order = sample_utils.get_column_order(self._normalization_info)
  x_test = np.float32(np.matrix(sample_df_normalized[column_order]))
  preds = super(OneClassSVMAd, self).predict(x_test)
  # OneClassSVM.predict labels outliers as -1; remap -1 to 0 so the column
  # reads as 0.0 (anomalous) / 1.0 (normal).
  sample_df['class_prob'] = np.where(preds == -1, 0, preds)
  return sample_df
def predict(self, sample_df: pd.DataFrame) -> pd.DataFrame:
  """Performs anomaly detection on a new sample.

  Args:
    sample_df: dataframe with the new datapoints, not normalized.

  Returns:
    original dataframe with a new column labeled 'class_prob' ranging from
    1.0 as normal to 0.0 as anomalous.
  """
  normalized = sample_utils.normalize(sample_df, self._normalization_info)
  ordered_columns = sample_utils.get_column_order(self._normalization_info)
  feature_matrix = np.float32(np.matrix(normalized[ordered_columns]))
  class_probs = super(NegativeSamplingRandomForestAd,
                      self).predict_proba(feature_matrix)
  # Keep only the probability assigned to the normal class.
  sample_df['class_prob'] = class_probs[:, _NORMAL_CLASS]
  return sample_df
def train_model(self, x_train: pd.DataFrame) -> None:
  """Trains a negative-sampling random forest detector on the positive sample.

  Args:
    x_train: training sample, which does not need to be normalized.
  """
  # TODO(sipple) Consolidate the normalization code into the base class.
  self._normalization_info = sample_utils.get_normalization_info(x_train)
  ordered_columns = sample_utils.get_column_order(self._normalization_info)
  positive_normalized = sample_utils.normalize(x_train[ordered_columns],
                                               self._normalization_info)
  # Augment the normalized positive sample with synthetic negatives.
  training_sample = sample_utils.apply_negative_sample(
      positive_sample=positive_normalized,
      sample_ratio=self._sample_ratio,
      sample_delta=self._sample_delta)
  super(NegativeSamplingRandomForestAd, self).fit(
      X=training_sample[ordered_columns],
      y=training_sample[_CLASS_LABEL])