def get_data_shape(file_list, best_features, seq_length, batch_size):
    # Infer the shape of a seq2seq batch from the first file of the set
    data = pd.read_csv(file_list[0])
    if best_features is not None:
        data = data[best_features].values
    else:
        # No feature selection: keep every column, skipping the first row
        data = data.values[1:]
    data = get_seq2seq_batch(data, seq_length, batch_size)
    return data.shape
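# ---------------------------------------------------------------------------
# keep_best_features, get_seq2seq_batch and drange are project helpers
# defined elsewhere; the minimal sketches below are assumptions reconstructed
# from how they are called in this file, not the actual implementations.
# They only pin down what the rest of the code relies on: batches have shape
# (seq_length, batch_size, n_features), which is why data.shape[1] is used
# as a per-file sample count in get_roc.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd


def keep_best_features(data, best_features):
    # Keep only the selected columns; with no selection, keep everything
    if best_features is not None:
        return data[best_features].values
    return data.values


def get_seq2seq_batch(data, seq_length, batch_size):
    # Trim the series so it splits evenly into batch_size windows of
    # seq_length steps, then reshape to (seq_length, batch_size, n_features)
    data = np.asarray(data, dtype=np.float32)
    n_features = data.shape[-1]
    windows = data[:seq_length * batch_size].reshape(
        batch_size, seq_length, n_features)
    return np.transpose(windows, (1, 0, 2))


def drange(start, stop, step):
    # Float-step analogue of range(); assumed semantics: start inclusive,
    # stop exclusive
    x = start
    while x < stop:
        yield x
        x += step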
def compute_set_loss(self, file_list, best_features):
    losses = []
    for file in file_list:
        # Load data
        data = pd.read_csv(file)
        data = keep_best_features(data, best_features)
        # Prepare data for seq2seq processing
        data = get_seq2seq_batch(data, self.seq_length, self.batch_size)
        # Test data
        loss = self.test_batch(data)
        losses.append(loss)
    # Compute the average loss over the files of the set
    avg_loss = np.average(np.asarray(losses))
    return avg_loss
def get_roc(self, strt_thr, end_thr, step, files_test_normal,
            files_test_anomalous, best_features):
    n_points = int((end_thr - strt_thr) / step)
    print('[ + ] Computing ROC curve using:')
    print('\t[-->] Threshold from', strt_thr, 'to', end_thr)
    print('\t[-->] Step:', step)
    print('\t[-->] ROC curve has', n_points, 'points')
    point_count = 1
    # Init empty arrays for point coordinates
    a_vp = []
    a_vn = []
    a_fp = []
    a_fn = []
    # For every threshold...
    for thr in drange(strt_thr, end_thr, step):
        tot_vn = 0
        tot_vp = 0
        tot_fn = 0
        tot_fp = 0
        n_sample_norm = 0
        n_sample_anom = 0
        # Compute predictions for the normal test set
        for file in files_test_normal:
            # Load data
            data = pd.read_csv(file)
            data = keep_best_features(data, best_features)
            # Prepare data for seq2seq processing
            data = get_seq2seq_batch(data, self.seq_length, self.batch_size)
            # Predict file with current threshold
            fp, vn = self.predict(data, thr)
            # Update false positive / true negative counts
            n_sample_norm += data.shape[1]
            tot_vn += vn
            tot_fp += fp
        # Compute predictions for the anomalous test set
        for file in files_test_anomalous:
            # Load data
            data = pd.read_csv(file)
            data = keep_best_features(data, best_features)
            # Prepare data for seq2seq processing
            data = get_seq2seq_batch(data, self.seq_length, self.batch_size)
            # Predict file with current threshold
            vp, fn = self.predict(data, thr)
            # Update true positive / false negative counts
            n_sample_anom += data.shape[1]
            tot_vp += vp
            tot_fn += fn
        # Rates for this threshold, appended as ROC curve coordinates
        vp_rate = tot_vp / n_sample_anom
        vn_rate = tot_vn / n_sample_norm
        fp_rate = tot_fp / n_sample_norm
        fn_rate = tot_fn / n_sample_norm
        print('\t[ + ] Point {}/{} \tTrue Positive Rate : {:.5f} \tFalse Positive Rate : {:.5f}'
              .format(point_count, n_points, vp_rate, fp_rate))
        point_count += 1
        a_vp.append(vp_rate)
        a_vn.append(vn_rate)
        a_fp.append(fp_rate)
        a_fn.append(fn_rate)
    return a_fp, a_vp
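# The two lists returned by get_roc plot directly as a ROC curve. A usage
# sketch (matplotlib assumed available; model, the file lists, and the
# 0.0-1.0 threshold range are placeholders):
#
#     import matplotlib.pyplot as plt
#
#     fpr, tpr = model.get_roc(0.0, 1.0, 0.01, files_test_normal,
#                              files_test_anomalous, best_features)
#     plt.plot(fpr, tpr)
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.show()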
def train(self, train_file_list, validation_file_list=None,
          anomalous_file_list=None, best_features=None, epoch=200):
    print('[ + ] Starting training!')
    avg_train_losses = []
    avg_valid_losses = []
    avg_anomalous_losses = []
    for e in range(epoch):
        losses = []
        for file in train_file_list:
            # Load data
            data = pd.read_csv(file)
            data = keep_best_features(data, best_features)
            # Prepare data for seq2seq processing
            data = get_seq2seq_batch(data, self.seq_length, self.batch_size)
            # Train step
            loss = self.train_batch(data)
            losses.append(loss)
        # Compute and display losses every 10 epochs
        if e % 10 == 0:
            # Average loss over the training files
            avg_loss = np.average(np.asarray(losses))
            avg_train_losses.append(avg_loss)
            # Compute validation set loss
            if validation_file_list is not None:
                avg_val_loss = self.compute_set_loss(
                    validation_file_list, best_features)
                avg_valid_losses.append(avg_val_loss)
            else:
                avg_val_loss = np.nan
            # Compute anomalous set loss
            if anomalous_file_list is not None:
                avg_ano_loss = self.compute_set_loss(
                    anomalous_file_list, best_features)
                avg_anomalous_losses.append(avg_ano_loss)
            else:
                avg_ano_loss = np.nan
            # Display losses
            print('\t[ + ] Step {}/{} \tTrain loss : {:.4f} \tValidation loss : {:.4f} \tAnomalous set loss : {:.4f}'
                  .format(e, epoch, avg_loss, avg_val_loss, avg_ano_loss))
    return avg_train_losses, avg_valid_losses, avg_anomalous_losses
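# End-to-end usage sketch. The class name Seq2SeqDetector and its
# constructor arguments are assumptions for illustration; train() and its
# return values are the ones defined above:
#
#     model = Seq2SeqDetector(seq_length=50, batch_size=32)
#     train_losses, valid_losses, anomalous_losses = model.train(
#         train_files,
#         validation_file_list=valid_files,
#         anomalous_file_list=anomalous_files,
#         best_features=best_features,
#         epoch=200)
#
# In a healthy run the train and validation losses decrease together while
# the anomalous-set loss stays high, which is what makes a reconstruction
# threshold usable in predict() and get_roc().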