def __init__(self):
    """Run the scaler comparison experiment; construction is the whole run.

    Calls ``log_file()`` at the start and end, loads the dataset,
    label-encodes the three categorical columns, then applies each
    candidate scaler and scores the result against both the
    ``attack_category`` and ``target`` labels.
    """
    # State read by log_file() has to exist before its first call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    # Collaborators, created in the same order as before.
    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.visualize = Visualize()

    # Plain experiment state.
    self.random_state = 20
    self.X = None
    self.y = None
    self.full = None
    self.ac_count = {}
    self.scores = OrderedDict()
    self.scale_cols = [
        'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
        'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
        'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    ]

    with timer('\nLoading dataset'):
        self.load_data()
        self.set_attack_category_count()

    with timer('\nEncoding categoricals'):
        # One encoder instance is refitted for each column in turn.
        encoder = preprocessing.LabelEncoder()
        for column in ('protocol_type', 'service', 'flag'):
            self.full[column] = encoder.fit_transform(self.full[column])

    with timer('\nSetting X'):
        self.set_X()
        self.ds.shape()

    with timer('\nDistribution Before Scaling'):
        self.dist_before_scaling()

    with timer('\nScaling'):
        candidates = (
            StandardScaler(),
            Normalizer(),
            MinMaxScaler(feature_range=(0, 1)),
            Binarizer(threshold=0.0),
            RobustScaler(quantile_range=(25, 75)),
            PowerTransformer(method='yeo-johnson'),
            QuantileTransformer(output_distribution='normal'),
        )
        for scaler in candidates:
            title, res_x = self.scale(scaler)
            # The same scaled features are scored against both labels.
            for label in ('attack_category', 'target'):
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)

    self.log_file()
    print('Finished')
def __init__(self):
    """Run the dataset preprocessing pipeline; construction is the whole run.

    Reads the raw CSV named by the dataset config, transforms and cleans
    it, sets the target, drops configured and highly correlated columns
    (drawing a correlation heatmap before and after), and writes the
    processed dataset and target back out as CSV files.
    """
    # State read by log_file() has to exist before its first call.
    self.logfile = False
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.filehandler = Filehandler()
    self.visualize = Visualize()
    self.ds = KDDCup1999()

    with timer('\nLoading dataset'):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'])
        self.ds.set_columns()

    with timer('\nTransforming dataset'):
        self.ds.transform()

    with timer('\nInitial dataset discovery'):
        # shape() is called before and after the cleaning steps.
        self.ds.shape()
        self.ds.show_duplicates(self.ds.config['level_01'])
        self.ds.drop_duplicates()
        self.show_zeros()
        self.ds.drop_outliers()
        self.ds.shape()
        self.ds.discovery()

    with timer('\nSetting target'):
        self.ds.set_target()

    with timer('\nEvaluating sparse features'):
        self.ds.evaluate_sparse_features(engineer=False)

    with timer('\nVisualising pairplot for selected columns'):
        self.visualize.pairplot(
            self.ds.dataset,
            self.ds.config['pairplot_cols'],
            self.ds.config['pairplot_target'],
        )

    with timer('\nDropping columns'):
        self.ds.drop_cols(self.ds.config['drop_cols_01'])

    with timer('\nEvaluating correlation'):
        self.visualize.correlation_heatmap(
            self.ds.dataset,
            title='Correlation Heatmap Before Column Drop')
        self.ds.drop_highly_correlated()
        self.visualize.correlation_heatmap(
            self.ds.dataset,
            title='Correlation Heatmap After Column Drop')

    with timer('\nPersisting transformed dataset and target'):
        # Write the dataset first, then the target, under suffixed names.
        outputs = (
            ('_processed', self.ds.dataset),
            ('_target', self.ds.target),
        )
        for suffix, frame in outputs:
            self.filehandler.write_csv(
                self.ds.config['path'],
                self.ds.config['file'] + suffix,
                frame,
            )
        self.ds.shape()

    self.log_file()
    print('Finished')
def __init__(self):
    """Train and evaluate a binary XGBoost classifier; construction is the run.

    Loads the dataset, maps the attack categories to a binary label
    (0 for 'normal', 1 for every attack class), splits into train and
    test sets, fits an ``XGBClassifier`` and prints the confusion-matrix
    counts together with detection rate, false alarm rate and accuracy.
    """
    # State read by log_file() has to exist before its first call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.random_state = 20
    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.n_classes = 2

    # Datasets
    self.X = None
    self.y = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None

    # Both int and str keys are accepted for the binary label.
    self.label_map_int_2_string = {0: 'good', 1: 'bad', '0': 'good', '1': 'bad'}
    self.label_map_string_2_int = {'normal': 0, 'dos': 1, 'u2r': 1, 'r2l': 1, 'probe': 1}
    self.max_iters = 100

    with timer('\nPreparing dataset'):
        self.load_data()
        self.set_y()
        self.remove_target_from_X()
        self.train_test_split()

    with timer('\nTesting model on unseen test set'):
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        clf.fit(self.X_train, self.y_train)
        self.y_pred = clf.predict(self.X_test)

        # confusion_matrix rows are the true class and columns the
        # predicted class, with class 1 as the positive ("bad") label.
        cm = confusion_matrix(self.y_test, self.y_pred)
        self.tp = cm[1, 1]
        self.tn = cm[0, 0]
        self.fp = cm[0, 1]
        self.fn = cm[1, 0]
        print('True positive (TP)', self.tp)
        print('True negative (TN)', self.tn)
        print('False positive (FP)', self.fp)
        print('false negative (FN)', self.fn)

        # Detection rate is the share of real attacks that were caught,
        # TP / (TP + FN). The previous TP / (TP + FP) was precision.
        self.dr = self.tp / (self.tp + self.fn)
        self.far = self.fp / (self.tn + self.fp)
        self.acc = (self.tp + self.tn) / (self.tp + self.tn + self.fp + self.fn)
        print('Detection rate: ', self.dr)
        print('False alarm rate: ', self.far)
        print('Accuracy: ', self.acc)

    self.log_file()
    print('Finished')
def __init__(self):
    """Run the feature selection comparison; construction is the whole run.

    Loads, encodes and scales the dataset, then for every selector and
    for both labels (``attack_category`` and ``target``) fits the
    selector, takes its top features and scores them with XGBoost.
    """
    # State read by log_file() has to exist before its first call.
    self.logfile = False
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.filehandler = Filehandler()
    self.visualize = Visualize()
    self.ds = KDDCup1999()

    self.X = None
    self.y = None
    self.full = None
    self.random_state = 20
    self.num_features = 15
    self.scale_cols = [
        'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
        'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
        'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    ]

    with timer('\nLoading dataset'):
        self.load_data()
        self.encode_scale()
        self.set_X()

    with timer('\nFeature selection'):
        selectors = (
            Original(),
            UnivariateSelector(),
            RecursiveSelector(),
            PCASelector(),
            # KernelPCASelector(),
            ExtraTreesSelector(),
            RandomForestSelector(),
        )
        labels = ('attack_category', 'target')
        for selector in selectors:
            selector_name = selector.__class__.__name__
            for label in labels:
                self.set_y(label)
                with timer('\nFitting selector ' + selector_name):
                    selector.fit_model(self.X, self.y)
                    x = selector.get_top_features(self.X, label)
                with timer('\nXGBoost scoring of features selected by ' + selector_name):
                    self.score_with_xgboost(x, self.y, selector.title)

    self.log_file()
    print('Finished')
def preprocess(self, envparm):
    """Load the raw dataset, partition its columns and build output variants.

    Args:
        envparm: Mapping of run switches. The keys read here are
            'PlotGraphs', 'ProcessDS01', 'ProcessDS02' and 'ProcessDS03'.
    """
    self.filehandler = Filehandler()
    self.dataset_raw = self.filehandler.read_csv(
        self.filehandler.data_raw_path)
    logging.info('Original raw dataset loaded - dataset size {}'.format(
        self.dataset_raw.shape))

    # The first 190 columns are taken as numerical, the rest as categorical.
    logging.info('Partitioning numerical features')
    self.numerical_features_raw = self.dataset_raw.iloc[:, 0:190].copy()
    logging.info('Partitioning categorical features')
    self.categorical_features_raw = self.dataset_raw.iloc[:, 190:].copy()

    if envparm['PlotGraphs']:
        num_size = 28
        # (column slice, title suffix) for each sample that gets plotted.
        samples = (
            (slice(None, num_size),
             'First ' + str(num_size) + ' Numeric Features'),
            (slice(190, None), 'Categorical Features'),
        )
        # (plot function, title prefix), in the order the plots are drawn.
        plots = (
            (visualizer.matrix_missing, 'Data Completion '),
            (visualizer.bar_missing, 'Nullity Count '),
            (visualizer.heat_missing, 'Nullity Correlation Of '),
        )
        for columns, suffix in samples:
            sample_df = self.dataset_raw.iloc[:, columns].copy()
            for draw, prefix in plots:
                draw(sample_df, prefix + suffix)

    # Each output variant is produced only when its switch is enabled.
    if envparm['ProcessDS01']:
        self.prepare_output_variant_01()
    if envparm['ProcessDS02']:
        self.prepare_output_variant_02()
    if envparm['ProcessDS03']:
        self.prepare_output_variant_03()
def __init__(self):
    """Build and persist the 2D tensor input files; construction is the run.

    Loads the dataset, then for each tensor builder (``Tensor2d_type_1``
    and ``Tensor2d_type_2``) runs its preparation steps in order and
    writes the resulting ``X`` to a CSV named after the builder class.
    """
    # State read by log_file() has to exist before its first call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.X = None
    self.y = None
    self.full = None

    with timer('\nLoading dataset'):
        self.load_data()

    with timer('\nPreparing Tensor Input Files'):
        builders = (Tensor2d_type_1(), Tensor2d_type_2())
        for t2d in builders:
            builder_name = t2d.__class__.__name__
            with timer('\nBuilding 2d tensor - ' + builder_name):
                # Preparation steps, run in this fixed order.
                t2d.set_X(self.full)
                t2d.encode_categoricals()
                t2d.set_y(self.full)
                t2d.sample()
                t2d.scale()
                t2d.pca()
                t2d.add_target()

                # The output file name carries the builder class name.
                out_name = self.ds.config['file'] + '_' + builder_name
                self.filehandler.write_csv(
                    self.ds.config['path'], out_name, t2d.X)
                print('Shape of ' + self.ds.config['file'] + '_'
                      + builder_name + ' : ' + str(t2d.X.shape))

    self.log_file()
    print('Finished')
def main():
    """Entry point: optionally preprocess, then model each enabled dataset.

    Reads its switches from the module-level ``envparm`` mapping. Scores
    are written out only when the modeller collected any.
    """
    filehandler = Filehandler()
    modeller = Modeller()

    if envparm['PrepEnabled']:
        logging.info("Executing preprocessor")
        Preprocessor(envparm)

    # (switch name, Filehandler attribute holding the dataset path).
    # The attribute is looked up only when its switch is enabled.
    variants = (
        ('ProcessDS01', 'dataset_prep_path_01'),
        ('ProcessDS02', 'dataset_prep_path_02'),
        ('ProcessDS03', 'dataset_prep_path_03'),
    )
    for switch, path_attr in variants:
        if envparm[switch]:
            process_dataset(filehandler, modeller,
                            getattr(filehandler, path_attr))

    if modeller.scores:
        modeller.output_scores(filehandler)
class AnnMLPOptimiseEvaluateMulti:
    """Visualise a multi-class hyperparameter search against a baseline.

    Construction runs the whole experiment: a logistic regression is fitted
    as the baseline, its detection rate (DR) is derived from the confusion
    matrix, and the hyperparameter tuning results loaded from CSV are then
    plotted as the change in validation DR over that baseline.
    """

    # Columns removed from the tuning results before correlating with val DR.
    _NON_HYPERPARAM_COLS = [
        'round_epochs', 'epochs', 'loss', 'dr', 'far', 'acc', 'val_loss',
        'val_acc', 'val_far'
    ]

    def __init__(self):
        # State read by log_file() has to exist before its first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.n_classes = 5
        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.folder = 'viz'
        self.fprefix_multi = 'Hyper - annMLPMulti - '

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.hyp = None
        self.lr = None
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 2,
            'r2l': 3,
            'probe': 4
        }
        self.max_iters = 100

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.train_test_split()

        with timer('\nPreparing base logistic regression'):
            self.lr = LogisticRegression(max_iter=self.max_iters)
            self.lr.fit(self.X_train, self.y_train)

        with timer('\nPreparing confusion matrix and base DR'):
            self.y_pred = self.lr.predict(self.X_test)
            cm = confusion_matrix(self.y_test, self.y_pred)
            self.tp = self.get_tp_from_cm(cm)
            self.tn = self.get_tn_from_cm(cm)
            self.fp = self.get_fp_from_cm(cm)
            self.fn = self.get_fn_from_cm(cm)
            # Detection rate is TP / (TP + FN), matching get_base_dr. With
            # the class-summed counts used here FP and FN totals are equal,
            # so the value is unchanged from the earlier TP / (TP + FP).
            self.dr = self.tp / (self.tp + self.fn)
            print('log reg dr', self.dr)

        with timer('\nVisualising optimisation search'):
            self.load_hyp()
            # NOTE(review): the stored 'lr' values are divided by 1000
            # before plotting; the reason is not visible here - confirm.
            self.hyp['lr'] = round(self.hyp['lr'] / 1000, 3)

            self._plot_hyperparameter_correlation()

            self.hyp['val_dr_change'] = round(self.hyp.val_dr - self.dr, 3)
            pd.set_option('display.max_columns', 100)
            print(self.hyp.sort_values(by='val_dr', ascending=False).head())

            self.color = 'cornflowerblue'
            prefix = 'Validation DR Change Over Baseline As Fn Of '

            # One box plot per hyperparameter: (column, title tail, x label).
            self._boxplot_vs_baseline(
                'lr', prefix + 'Learning Rate', 'Learning Rate')
            self._boxplot_vs_baseline(
                'first_neuron', prefix + '# Neurons First Layer',
                'First Neuron')
            self._boxplot_vs_baseline(
                'hidden_layers', prefix + '# Hidden Layers', 'Hidden Layers')
            self._boxplot_vs_baseline(
                'hidden_neuron', prefix + '# Hidden Layer Neurons',
                'Hidden Neurons')
            self._boxplot_vs_baseline(
                'batch_size', prefix + 'Batch Size', 'Batch Size')
            self._boxplot_vs_baseline(
                'dropout', prefix + 'Dropout', 'Dropout', figsize=(12, 8))

            # Pairwise heatmaps; the first keeps its smaller title font.
            self._heatmap_vs_baseline(
                'first_neuron', 'hidden_neuron',
                prefix + 'First Neuron & Hidden Neuron',
                xlabel='Hidden Neuron', ylabel='First Neuron', title_size=12)
            self._heatmap_vs_baseline(
                'hidden_layers', 'hidden_neuron',
                prefix + 'Hidden Layers & Hidden Neuron',
                xlabel='Hidden Neuron', ylabel='Hidden Layers')
            self._heatmap_vs_baseline(
                'batch_size', 'dropout', prefix + 'Batch Size & Dropout',
                xlabel='Dropout', ylabel='Batch Size')
            self._heatmap_vs_baseline(
                'lr', 'dropout', prefix + 'Learning Rate & Dropout',
                xlabel='Dropout', ylabel='Learning Rate')

        self.log_file()
        print('Finished')

    def _plot_hyperparameter_correlation(self):
        """Draw and save the correlation heatmap of the tuning results."""
        # hyp_val_dr is an alias of self.hyp, so the in-place drop also
        # removes these columns from self.hyp; later plots rely on that
        # reduced frame.
        self.hyp_val_dr = self.hyp
        self.hyp_val_dr.drop(self._NON_HYPERPARAM_COLS, axis=1, inplace=True)
        self.dr_corr = self.hyp_val_dr.corr()

        plt.clf()
        fig, ax = plt.subplots(figsize=(10, 10))
        title = 'Validation DR Hyperparameter Correlation'
        ax.set_title(title, size=16)
        colormap = sns.diverging_palette(220, 10, as_cmap=True)
        sns.heatmap(self.dr_corr,
                    cmap=colormap,
                    annot=True,
                    fmt=".2f",
                    cbar=False,
                    vmin=-0.4,
                    vmax=0.4)
        ticks = range(len(self.dr_corr.columns))
        plt.xticks(ticks, self.dr_corr.columns)
        plt.yticks(ticks, self.dr_corr.columns)
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def _boxplot_vs_baseline(self, metric, title, xlabel, figsize=(10, 6)):
        """Box-plot ``val_dr_change`` grouped by one hyperparameter column."""
        plt.clf()
        plt.subplots(figsize=figsize)
        sns.boxplot(x=metric,
                    y='val_dr_change',
                    data=self.hyp.reset_index(),
                    color=self.color)
        plt.title(title, fontsize=16)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Validation DR Change', fontsize=12)
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def _heatmap_vs_baseline(self, row_param, col_param, title, xlabel,
                             ylabel, title_size=16):
        """Heatmap of mean ``val_dr_change`` over two hyperparameter columns."""
        plt.clf()
        plt.subplots(figsize=(9, 7))
        # Rows are row_param values, columns are col_param values.
        df_grid = self.hyp.reset_index().groupby(
            [row_param, col_param]).val_dr_change.mean().unstack()
        sns.heatmap(data=df_grid,
                    cmap=(sns.diverging_palette(10, 220, sep=80, n=7)),
                    annot=True,
                    cbar=False)
        plt.title(title, fontsize=title_size)
        plt.xlabel(xlabel, fontsize=10)
        plt.ylabel(ylabel, fontsize=10)
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def get_base_dr(self):
        """Return TP / (TP + FN) for a constant 0.5 prediction on the train set.

        NOTE(review): y_pred is the float 0.5 for every row while y_train
        holds integer class labels; confusion_matrix is likely to reject
        that mix of label types - confirm the intended baseline before use.
        """
        y_pred = pd.Series(0.5, index=self.y_train.index)
        cm = confusion_matrix(self.y_train, y_pred)
        tp = self.get_tp_from_cm(cm)
        fn = self.get_fn_from_cm(cm)
        dr = tp / (tp + fn)
        print('dr ', dr)
        return dr

    def get_tp_from_cm(self, cm):
        """Return the summed true positives: the diagonal of ``cm``."""
        tp = np.sum(np.diag(cm))
        print('tp', tp)
        return tp

    def get_tn_from_cm(self, cm):
        """Return the true negatives summed over all ``n_classes`` classes.

        For class i the true negatives are every cell outside row i and
        column i.
        """
        tn = []
        for i in range(self.n_classes):
            others = np.delete(cm, i, 0)  # delete ith row
            others = np.delete(others, i, 1)  # delete ith column
            tn.append(others.sum())
        print('tn ', np.sum(tn))
        return np.sum(tn)

    def get_fp_from_cm(self, cm):
        """Return the summed false positives: column sums minus the diagonal."""
        fp = [sum(cm[:, i]) - cm[i, i] for i in range(self.n_classes)]
        print('fp ', np.sum(fp))
        return np.sum(fp)

    def get_fn_from_cm(self, cm):
        """Return the summed false negatives: row sums minus the diagonal."""
        fn = [sum(cm[i, :]) - cm[i, i] for i in range(self.n_classes)]
        print('fn', np.sum(fn))
        return np.sum(fn)

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under logs/.

        Does nothing when ``sys.gettrace`` is unavailable or a trace
        function is active (debug mode). Otherwise the first call opens
        the log file and redirects stdout; the next call restores stdout
        and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Read the '_Tensor2d_type_1' CSV into ``self.X`` and print its shape."""
        self.X = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def load_hyp(self):
        """Read the hyperparameter tuning results CSV into ``self.hyp``."""
        self.hyp = pd.read_csv(
            'tuning/Hyperparameter tuning - AnnMLPMultiOptimize_1.csv')

    def set_y(self):
        """Set ``self.y`` to the integer-encoded 'attack_category' column."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the 'attack_category' column from ``self.X`` in place."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split X and y 70/30 into train and test sets with a fixed seed."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def fname(self, title):
        """Return the PNG path for a figure titled ``title``."""
        return '{}/{}.png'.format(self.folder, self.fprefix_multi + title)
class XGBoostBinary:
    """Train and evaluate a binary XGBoost classifier on the tensor dataset.

    Construction runs the whole experiment: the dataset is loaded, the
    attack categories are mapped to a binary label (0 for 'normal', 1 for
    every attack class), an ``XGBClassifier`` is fitted and the
    confusion-matrix counts and derived rates are printed.
    """

    def __init__(self):
        # State read by log_file() has to exist before its first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.n_classes = 2
        # fname() reads self.folder, which was never assigned before.
        # NOTE(review): 'viz' mirrors the sibling experiment classes -
        # confirm this is the intended figure directory.
        self.folder = 'viz'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Both int and str keys are accepted for the binary label.
        self.label_map_int_2_string = {0: 'good', 1: 'bad', '0': 'good', '1': 'bad'}
        self.label_map_string_2_int = {'normal': 0, 'dos': 1, 'u2r': 1, 'r2l': 1, 'probe': 1}
        self.max_iters = 100

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.train_test_split()

        with timer('\nTesting model on unseen test set'):
            clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
            clf.fit(self.X_train, self.y_train)
            self.y_pred = clf.predict(self.X_test)

            # confusion_matrix rows are the true class and columns the
            # predicted class, with class 1 as the positive ("bad") label.
            cm = confusion_matrix(self.y_test, self.y_pred)
            self.tp = cm[1, 1]
            self.tn = cm[0, 0]
            self.fp = cm[0, 1]
            self.fn = cm[1, 0]
            print('True positive (TP)', self.tp)
            print('True negative (TN)', self.tn)
            print('False positive (FP)', self.fp)
            print('false negative (FN)', self.fn)

            self.dr, self.far, self.acc = self._rates_from_cm(cm)
            print('Detection rate: ', self.dr)
            print('False alarm rate: ', self.far)
            print('Accuracy: ', self.acc)

        self.log_file()
        print('Finished')

    @staticmethod
    def _rates_from_cm(cm):
        """Return ``(detection_rate, false_alarm_rate, accuracy)`` for a 2x2 matrix.

        Args:
            cm: Binary confusion matrix indexed as ``cm[true, predicted]``
                with class 1 as the positive class.

        Returns:
            Tuple of detection rate TP / (TP + FN), false alarm rate
            FP / (TN + FP) and accuracy (TP + TN) / total.
        """
        tn, fp = cm[0, 0], cm[0, 1]
        fn, tp = cm[1, 0], cm[1, 1]
        # Detection rate is the share of real attacks that were caught.
        # The earlier TP / (TP + FP) was precision, not detection rate.
        dr = tp / (tp + fn)
        far = fp / (tn + fp)
        acc = (tp + tn) / (tp + tn + fp + fn)
        return dr, far, acc

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under logs/.

        Does nothing when ``sys.gettrace`` is unavailable or a trace
        function is active (debug mode). Otherwise the first call opens
        the log file and redirects stdout; the next call restores stdout
        and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Read the '_Tensor2d_type_1' CSV into ``self.X`` and print its shape."""
        self.X = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        """Set ``self.y`` to the binary-encoded 'attack_category' column."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the 'attack_category' column from ``self.X`` in place."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split X and y 70/30 into train and test sets with a fixed seed."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def map_target_to_label(self, t):
        """Map an array of binary targets to their 'good'/'bad' labels."""
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        """Return the PNG path for a figure titled ``title``."""
        return '{}/{}.png'.format(self.folder, title)
def __init__(self):
    """Run the clustering experiment; construction is the whole run.

    Loads, encodes and scales the dataset, then plots clusters in 2D and
    3D for each configured column triple and for the PCA index set, for
    every cluster count from 2 up to ``clusters_stop - 1``.
    """
    # State read by log_file() has to exist before its first call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.visualize = Visualize()

    self.random_state = 20
    self.clusters_stop = 11
    self.x = None
    self.y = None
    self.full = None
    self.ac_count = {}

    # Axis -> column position maps handed to cluster(); feature_idx is
    # filled in by set_indexes() for each column triple.
    self.feature_idx = {0: 0, 1: 0, 2: 0}
    self.pca_idx = {0: 0, 1: 1, 2: 2, 'pca': True}
    self.kernelpca_idx = {0: 0, 1: 1, 2: 2, 'kpca': True}

    self.scale_cols = [
        'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
        'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
        'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    ]
    self.cluster_cols = [
        ('count', 'diff_srv_rate', 'src_bytes'),
        ('src_bytes', 'dst_host_srv_count', 'dst_bytes'),
        ('srv_diff_host_rate', 'srv_count', 'serror_rate'),
        ('serror_rate', 'dst_host_diff_srv_rate', 'flag'),
    ]

    with timer('\nLoading dataset'):
        self.load_data()
        self.ds.shape()

    with timer('\nEncode and Scale dataset'):
        self.encode_scale()

    with timer('\nSetting X and y'):
        self.set_x_y()

    with timer('\nPlotting clusters for specific columns'):
        for triple in self.cluster_cols:
            for n_clusters in range(2, self.clusters_stop):
                self.set_indexes(*triple)
                with timer('\n2D clustering without PCA'):
                    self.cluster(idx=self.feature_idx, n_clusters=n_clusters)
                with timer('\n3D clustering without PCA'):
                    self.cluster(idx=self.feature_idx, n_clusters=n_clusters,
                                 projection='3d')

    with timer('\nPlotting clusters applying PCA'):
        for n_clusters in range(2, self.clusters_stop):
            with timer('\n2D clustering with PCA'):
                self.cluster(idx=self.pca_idx, n_clusters=n_clusters)
            with timer('\n3D clustering with PCA'):
                self.cluster(idx=self.pca_idx, n_clusters=n_clusters,
                             projection='3d')

    # Commented out due to memory error
    #with timer('\nPlotting clusters Kernel applying PCA'):
    #    for c in range(2, 7):
    #        with timer('\n2D clustering with Kernel PCA'):
    #            self.cluster(idx=self.kernelpca_idx, n_clusters=c)
    #        with timer('\n3D clustering with Kernel PCA'):
    #            self.cluster(idx=self.kernelpca_idx, n_clusters=c, projection='3d')

    self.log_file()
    print('Finished')
class Clustering:
    """K-means clustering experiment over the processed dataset.

    Construction runs the whole experiment: the processed dataset and
    target are loaded, categoricals are label-encoded, the numeric columns
    are standard-scaled, and clusters are plotted in 2D and 3D for a set
    of column triples and for the PCA index set.
    """

    def __init__(self):
        # State read by log_file() has to exist before its first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.clusters_stop = 11
        self.x = None
        self.y = None
        self.full = None
        self.ac_count = {}

        # Axis -> column position maps handed to cluster(); feature_idx is
        # filled in by set_indexes() for each column triple.
        self.feature_idx = {0: 0, 1: 0, 2: 0}
        self.pca_idx = {0: 0, 1: 1, 2: 2, 'pca': True}
        self.kernelpca_idx = {0: 0, 1: 1, 2: 2, 'kpca': True}

        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                           'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
                           'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
                           'num_shells', 'num_access_files', 'is_guest_login', 'count',
                           'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate',
                           'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                           'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                           'dst_host_srv_diff_host_rate']
        self.cluster_cols = [('count', 'diff_srv_rate', 'src_bytes'),
                             ('src_bytes', 'dst_host_srv_count', 'dst_bytes'),
                             ('srv_diff_host_rate', 'srv_count', 'serror_rate'),
                             ('serror_rate', 'dst_host_diff_srv_rate', 'flag')]

        with timer('\nLoading dataset'):
            self.load_data()
            self.ds.shape()

        with timer('\nEncode and Scale dataset'):
            self.encode_scale()

        with timer('\nSetting X and y'):
            self.set_x_y()

        with timer('\nPlotting clusters for specific columns'):
            for cola, colb, colc in self.cluster_cols:
                # The column positions depend only on the triple, so they
                # are resolved once rather than once per cluster count.
                self.set_indexes(cola, colb, colc)
                for c in range(2, self.clusters_stop):
                    with timer('\n2D clustering without PCA'):
                        self.cluster(idx=self.feature_idx, n_clusters=c)
                    with timer('\n3D clustering without PCA'):
                        self.cluster(idx=self.feature_idx, n_clusters=c, projection='3d')

        with timer('\nPlotting clusters applying PCA'):
            for c in range(2, self.clusters_stop):
                with timer('\n2D clustering with PCA'):
                    self.cluster(idx=self.pca_idx, n_clusters=c)
                with timer('\n3D clustering with PCA'):
                    self.cluster(idx=self.pca_idx, n_clusters=c, projection='3d')

        # Commented out due to memory error
        #with timer('\nPlotting clusters Kernel applying PCA'):
        #    for c in range(2, 7):
        #        with timer('\n2D clustering with Kernel PCA'):
        #            self.cluster(idx=self.kernelpca_idx, n_clusters=c)
        #        with timer('\n3D clustering with Kernel PCA'):
        #            self.cluster(idx=self.kernelpca_idx, n_clusters=c, projection='3d')

        self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under logs/.

        Does nothing when ``sys.gettrace`` is unavailable or a trace
        function is active (debug mode). Otherwise the first call opens
        the log file and redirects stdout; the next call restores stdout
        and closes the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Read the processed dataset and target CSVs and join them column-wise."""
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def encode_scale(self):
        """Label-encode the categorical columns and standard-scale ``scale_cols``."""
        # Encode categoricals; the one encoder is refitted per column.
        le = preprocessing.LabelEncoder()
        for column in ('protocol_type', 'service', 'flag'):
            self.full[column] = le.fit_transform(self.full[column])

        # Scale
        sc = StandardScaler()
        self.full[self.scale_cols] = sc.fit_transform(self.full[self.scale_cols])

    def set_x_y(self):
        """Set ``x`` to all but the last two columns and ``y`` to 'target'."""
        self.x = self.full.iloc[:, :-2]
        self.y = self.full['target']

    def set_indexes(self, cola, colb, colc):
        """Store the positions of three ``x`` columns in ``feature_idx``.

        Args:
            cola: Column name for axis 0.
            colb: Column name for axis 1.
            colc: Column name for axis 2.
        """
        for axis, column in enumerate((cola, colb, colc)):
            self.feature_idx[axis] = self.x.columns.get_loc(column)

    def cluster(self, idx, n_clusters, projection=None):
        """Fit K-means on ``self.x`` and plot the resulting clusters.

        Args:
            idx: Index mapping passed through to the scatter plot.
            n_clusters: Number of clusters to fit.
            projection: Passed through to the scatter plot; ``'3d'`` is
                the value used by the callers in this class.
        """
        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state)
        # fit_predict already fits the model; the separate fit() call that
        # used to precede it trained the same model twice.
        y_km = kmeans.fit_predict(self.x)
        self.visualize.scatter_clusters(self.x, n_clusters, y_km, idx, projection)
def __init__(self):
    """Run the multi-class MLP hyperparameter scan; construction is the run.

    Loads the dataset, one-hot encodes the label, runs ``ta.Scan`` over
    the ``p1`` parameter grid with ``get_model`` as the model builder,
    and prints a summary of the scan through ``ta.Reporting``.
    """
    os.environ[
        'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
    tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    print(__doc__)

    self.random_state = 20
    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.folder = 'tuning'

    # Datasets
    self.X = None
    self.y = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.n_features = None
    self.label_map_string_2_int = {
        'normal': 0,
        'dos': 1,
        'u2r': 2,
        'r2l': 3,
        'probe': 4
    }

    with timer('\nPreparing dataset'):
        self.load_data()
        self.set_y()
        self.remove_target_from_X()
        # Layer widths in the grid are expressed relative to feature count.
        self.n_features_all = self.X.shape[1]
        self.n_features_50pct = int(self.n_features_all * 0.5)
        self.n_features_80pct = int(self.n_features_all * 0.8)
        # One-hot encode the label, then hand plain arrays to the scan.
        self.y = pd.get_dummies(self.y)
        self.X = self.X.values
        self.y = self.y.values

    with timer('\nSearching parameter space'):
        # self.p = {'lr': (0.5, 5, 10),
        #      'first_neuron': [self.n_features_70pct, self.n_features_all],
        #      'hidden_layers': [0, 1, 2],
        #      'hidden_neuron': [self.n_features_70pct, self.n_features_all],
        #      'batch_size': [100, 200],
        #      'epochs': [30],
        #      'dropout': (0, 0.2, 0.5),
        #      'weight_regulizer': [None],
        #      'emb_output_dims': [None],
        #      'shape': ['brick', 'long_funnel'],
        #      'optimizer': [Adam, RMSprop],
        #      'losses': [binary_crossentropy],
        #      'activation': [relu],
        #      'last_activation': [sigmoid]}

        # Single-combination grid, not passed to the scan below.
        self.ptest = {
            'lr': [10],
            'first_neuron': [self.n_features_all],
            'hidden_layers': [1],
            'hidden_neuron': [self.n_features_all],
            'batch_size': [100],
            'epochs': [5],
            'dropout': [0.2],
            'optimizer': [SGD],
            'activation': [relu],
            'last_activation': [softmax]
        }

        # Grid actually scanned.
        self.p1 = {
            'lr': (0.5, 5, 10),
            'first_neuron': [
                self.n_features_50pct, self.n_features_80pct,
                self.n_features_all
            ],
            'hidden_layers': [1, 2, 3],
            'hidden_neuron': [
                self.n_features_50pct, self.n_features_80pct,
                self.n_features_all
            ],
            'batch_size': [100, 500, 1000],
            'epochs': [20],
            'dropout': (0, 0.2, 5),
            'optimizer': [SGD, RMSprop],
            'activation': [relu],
            'last_activation': [softmax]
        }

        dataset_name = self.folder + '/Hyperparameter tuning - ' + self.__class__.__name__
        scan = ta.Scan(x=self.X,
                       y=self.y,
                       model=self.get_model,
                       params=self.p1,
                       grid_downsample=0.01,
                       dataset_name=dataset_name,
                       experiment_no='1')

    with timer('\nEvaluating Scan'):
        r = ta.Reporting(scan)

        # get the number of rounds in the Scan
        print('\nNumber of rounds in scan ', r.rounds())

        # get highest results
        # The accuracy line used to report 'val_dr' (a copy of the line
        # below it); it now reads the accuracy metric it is labelled with.
        print('\nHighest validation accuracy', r.high('val_acc'))
        print('\nHighest validation detection rate', r.high('val_dr'))
        print('\nHighest validation false alarm rate', r.high('val_far'))

        # get the highest result for any metric
        print(r.high('val_dr'))

        # get the round with the best result
        print('Best round', r.rounds2high())

        # get the best paramaters
        print(r.best_params())

        #r.plot_corr()
        #plt.show()

        # a four dimensional bar grid
        #r.plot_bars('batch_size', 'val_dr', 'hidden_layers', 'lr')
        #plt.show()

    print('Finished')
class AnnMLPMultiOptimize:
    """Talos hyperparameter scan for a multi-class MLP on the prepared dataset.

    Instantiating the class runs the complete experiment (see ``__init__``).
    """

    def __init__(self):
        """Load the data, scan the ``self.p1`` grid and print the results."""
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        # The session is not used directly below; it is only created here.
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.folder = 'tuning'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 2,
            'r2l': 3,
            'probe': 4,
        }

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            # Layer widths offered to the scan are derived from the feature count.
            self.n_features_all = self.X.shape[1]
            self.n_features_50pct = int(self.n_features_all * 0.5)
            self.n_features_80pct = int(self.n_features_all * 0.8)
            # One indicator column per class, then plain arrays for the scan.
            self.y = pd.get_dummies(self.y)
            self.X = self.X.values
            self.y = self.y.values

        with timer('\nSearching parameter space'):
            # Single-combination grid; not passed to the scan below.
            self.ptest = {
                'lr': [10],
                'first_neuron': [self.n_features_all],
                'hidden_layers': [1],
                'hidden_neuron': [self.n_features_all],
                'batch_size': [100],
                'epochs': [5],
                'dropout': [0.2],
                'optimizer': [SGD],
                'activation': [relu],
                'last_activation': [softmax],
            }
            # Grid actually scanned.
            # NOTE(review): tuples are presumably Talos (min, max, steps) ranges - confirm.
            self.p1 = {
                'lr': (0.5, 5, 10),
                'first_neuron': [
                    self.n_features_50pct,
                    self.n_features_80pct,
                    self.n_features_all,
                ],
                'hidden_layers': [1, 2, 3],
                'hidden_neuron': [
                    self.n_features_50pct,
                    self.n_features_80pct,
                    self.n_features_all,
                ],
                'batch_size': [100, 500, 1000],
                'epochs': [20],
                'dropout': (0, 0.2, 5),
                'optimizer': [SGD, RMSprop],
                'activation': [relu],
                'last_activation': [softmax],
            }

            dataset_name = (self.folder + '/Hyperparameter tuning - ' +
                            self.__class__.__name__)
            scan = ta.Scan(x=self.X,
                           y=self.y,
                           model=self.get_model,
                           params=self.p1,
                           grid_downsample=0.01,
                           dataset_name=dataset_name,
                           experiment_no='1')

        with timer('\nEvaluating Scan'):
            r = ta.Reporting(scan)

            # Number of rounds in the scan
            print('\nNumber of rounds in scan ', r.rounds())

            # Highest results. The accuracy line used to report 'val_dr' by
            # mistake. NOTE(review): 'val_acc' assumes the Keras version in
            # use logs accuracy under 'acc'/'val_acc' - confirm.
            print('\nHighest validation accuracy', r.high('val_acc'))
            print('\nHighest validation detection rate', r.high('val_dr'))
            print('\nHighest validation false alarm rate', r.high('val_far'))

            # Highest result for the detection-rate metric
            print(r.high('val_dr'))

            # Round with the best result
            print('Best round', r.rounds2high())

            # Best parameters
            print(r.best_params())

        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        """Detection rate metric: tp / (tp + fn), on rounded tensors."""
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        tp = K.sum(y_pos * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        # K.epsilon() guards against division by zero.
        return tp / (tp + fn + K.epsilon())

    @staticmethod
    def far(y_true, y_pred):
        """False alarm rate metric: fp / (tn + fp), on rounded tensors."""
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        # K.epsilon() guards against division by zero.
        return fp / (tn + fp + K.epsilon())

    def get_model(self, x_train, y_train, x_val, y_val, params):
        """Build, compile and fit one model for a single scan round.

        Args:
            x_train, y_train: training data passed to ``model.fit``.
            x_val, y_val: data passed to ``model.fit`` as ``validation_data``.
            params: one parameter combination drawn from the scanned grid.

        Returns:
            Tuple ``(history, model)`` from the fit.
        """
        model = models.Sequential()

        # Input layer with dropout
        model.add(
            layers.Dense(params['first_neuron'],
                         activation=params['activation'],
                         input_shape=(self.n_features_all, )))
        model.add(layers.Dropout(params['dropout']))

        # Hidden layers with dropout
        for _ in range(params['hidden_layers']):
            model.add(
                layers.Dense(params['hidden_neuron'],
                             activation=params['activation']))
            model.add(layers.Dropout(params['dropout']))

        # Output layer: five units, matching the five attack categories
        model.add(layers.Dense(5, activation=params['last_activation']))

        # Build model
        model.compile(params['optimizer'](
            lr=lr_normalizer(params['lr'], params['optimizer'])),
                      loss='categorical_crossentropy',
                      metrics=['accuracy', self.dr, self.far])

        history = model.fit(x_train,
                            y_train,
                            validation_data=(x_val, y_val),
                            batch_size=params['batch_size'],
                            epochs=params['epochs'],
                            verbose=0)

        return history, model

    def load_data(self):
        """Read the '_Tensor2d_type_1' file into ``self.X`` and print its shape."""
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        """Set ``self.y`` to the 'attack_category' column mapped to integers."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the 'attack_category' column from ``self.X`` in place."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split ``self.X`` / ``self.y`` 70/30 into train and test attributes."""
        # Inside the method the name resolves to the module-level function.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=0.30,
            random_state=self.random_state)
class Preptensorinputs:
    """Build the 2d tensor input files from the processed dataset.

    Instantiating the class runs the whole job: it loads the processed
    dataset and target, pushes them through each ``Tensor2d_type_*`` builder
    and writes one CSV per builder.
    """

    def __init__(self):
        """Run the preparation pipeline and write the output files."""
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()  # first call: start redirecting stdout

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nPreparing Tensor Input Files'):
            builders = (Tensor2d_type_1(), Tensor2d_type_2())
            for builder in builders:
                builder_name = builder.__class__.__name__
                with timer('\nBuilding 2d tensor - ' + builder_name):
                    builder.set_X(self.full)
                    builder.encode_categoricals()
                    builder.set_y(self.full)
                    builder.sample()
                    builder.scale()
                    builder.pca()
                    builder.add_target()
                    # Output file name is the dataset file name plus the builder name.
                    self.filehandler.write_csv(
                        self.ds.config['path'],
                        self.ds.config['file'] + '_' + builder_name,
                        builder.X)
                    print('Shape of ' + self.ds.config['file'] + '_' +
                          builder_name + ' : ' + str(builder.X.shape))

        self.log_file()  # second call: restore stdout
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a log file.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (debug mode). Otherwise the first call opens the log
        file and points ``sys.stdout`` at it; the next call restores the
        original stdout and closes the file.
        """
        tracer = self.gettrace
        if tracer is None or tracer():
            return

        if not self.logfile:
            # Redirect stdout to file for logging if not in debug mode
            log_path = 'logs/{}_{}_stdout.txt'.format(
                self.__class__.__name__, self.timestr)
            self.logfile = open(log_path, 'w')
            sys.stdout = self.logfile
            return

        sys.stdout = self.original_stdout
        self.logfile.close()
        self.logfile = False

    def load_data(self):
        """Read the processed dataset and target and join them column-wise."""
        base_dir = self.ds.config['path']
        base_name = self.ds.config['file']
        self.ds.dataset = self.filehandler.read_csv(base_dir,
                                                    base_name + '_processed')
        self.ds.target = self.filehandler.read_csv(base_dir,
                                                   base_name + '_target')
        frames = [self.ds.dataset, self.ds.target]
        self.full = pd.concat(frames, axis=1)
        self.ds.shape()
class AnnMLPBinary:
    """Binary (normal vs. attack) multi-layer perceptron experiment.

    Instantiating the class runs the whole experiment: stratified k-fold
    training and validation, a final fit on the full training split that is
    validated against the held-out test split, and plots of the results.
    """

    def __init__(self):
        """Run the experiment end to end (see the class docstring)."""
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        self.g = tf.Graph()
        self.tf_sess = tf.Session(
            config=tf.ConfigProto(log_device_placement=True), graph=self.g)

        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.folder = 'viz'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.label_map_int_2_string = {
            0: 'good',
            1: 'bad',
            '0': 'good',
            '1': 'bad',
        }
        # Every attack category collapses to the positive class.
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 1,
            'r2l': 1,
            'probe': 1,
        }

        # K-fold validation
        self.splits = 5
        self.kfold = StratifiedKFold(n_splits=self.splits,
                                     shuffle=True,
                                     random_state=self.random_state)

        # Network parameters
        self.epochs = 20
        self.batch_size = 100
        self.verbose = 0

        # Scores: one list of per-epoch values is appended per fold
        self.metric_loss = []
        self.metric_acc = []
        self.metric_dr = []
        self.metric_far = []
        self.metric_val_loss = []
        self.metric_val_acc = []
        self.metric_val_dr = []
        self.metric_val_far = []

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features = self.X.shape[1]
            self.train_test_split()

        with timer('\nTraining & validating model with kfold'):
            # NOTE(review): as_default() returns a context manager that is
            # discarded here, so this call presumably has no effect - confirm.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            # Train model on K-1 and validate using remaining fold
            for train, val in self.kfold.split(self.X_train, self.y_train):
                self.model = self.get_model()
                self.history = self.model.fit(
                    self.X_train.iloc[train],
                    self.y_train.iloc[train],
                    validation_data=(self.X_train.iloc[val],
                                     self.y_train.iloc[val]),
                    epochs=self.epochs,
                    batch_size=self.batch_size,
                    verbose=self.verbose)
                self._record_fold_metrics(self.history.history)

            print('\nTraining mean loss', np.mean(self.metric_loss))
            print('Training mean acc', np.mean(self.metric_acc))
            print('Training mean dr', np.mean(self.metric_dr))
            print('Training mean far', np.mean(self.metric_far))
            print('\nValidation mean loss', np.mean(self.metric_val_loss))
            print('Validation mean acc', np.mean(self.metric_val_acc))
            print('Validation mean dr', np.mean(self.metric_val_dr))
            print('Validation mean far', np.mean(self.metric_val_far))

        with timer('\nTesting model on unseen test set'):
            # NOTE(review): see the note on as_default() above.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
            self.model = self.get_model()

            # Train model on complete train set and validate with unseen test set
            self.history = self.model.fit(self.X_train,
                                          self.y_train,
                                          validation_data=(self.X_test,
                                                           self.y_test),
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose,
                                          callbacks=[self.tensorboard])

        with timer('\nVisualising results'):
            # Plot model
            plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

            # Get single class prediction (rather than multi class probability summing to 1)
            y_pred = self.model.predict_classes(self.X_test)

            # The test split was passed as validation_data in the final fit,
            # so its metrics live under the 'val_*' keys. The un-prefixed keys
            # used previously are the training metrics of that fit.
            test_history = self.history.history
            print('Test loss', np.mean(test_history['val_loss']))
            print('Test acc', np.mean(test_history['val_acc']))
            print('Test dr', np.mean(test_history['val_dr']))
            print('Test far', np.mean(test_history['val_far']))

            # Remap to string class targets
            self.y_pred = self.map_target_to_label(y_pred)
            self.y_pred = self.y_pred.ravel()
            self.y_test = self.map_target_to_label(self.y_test)

            self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                            self.__class__.__name__)

            epochs = range(1, len(test_history['loss']) + 1)

            # (label, k-fold training curves, k-fold validation curves,
            #  test curve, legend location)
            plots = (
                ('Loss', self.metric_loss, self.metric_val_loss,
                 test_history['val_loss'], 1),
                ('Accuracy', self.metric_acc, self.metric_val_acc,
                 test_history['val_acc'], 4),
                ('Detection Rate', self.metric_dr, self.metric_val_dr,
                 test_history['val_dr'], 4),
                ('False Alarm Rate', self.metric_far, self.metric_val_far,
                 test_history['val_far'], 1),
            )
            for index, (label, fold_train, fold_val, test_curve,
                        legend_loc) in enumerate(plots):
                if index:
                    # Clear the previous figure before drawing the next one.
                    plt.clf()
                self._plot_metric(epochs, fold_train, fold_val, test_curve,
                                  label, legend_loc)

        self.log_file()
        print('Finished')

    def _record_fold_metrics(self, fold_history):
        """Append the per-epoch metrics of one fold to the score lists."""
        self.metric_loss.append(fold_history['loss'])
        self.metric_acc.append(fold_history['acc'])
        self.metric_dr.append(fold_history['dr'])
        self.metric_far.append(fold_history['far'])
        self.metric_val_loss.append(fold_history['val_loss'])
        self.metric_val_acc.append(fold_history['val_acc'])
        self.metric_val_dr.append(fold_history['val_dr'])
        self.metric_val_far.append(fold_history['val_far'])

    def _plot_metric(self, epochs, fold_train, fold_val, test_curve, label,
                     legend_loc):
        """Plot, save and show one metric per epoch.

        The k-fold training and validation curves are averaged over the
        folds; the test curve is plotted as given.
        """
        fig, ax = plt.subplots(figsize=(15, 8))
        plt.style.use('ggplot')
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.plot(epochs, np.mean(fold_train, axis=0), 'g', label='Training')
        ax.plot(epochs, np.mean(fold_val, axis=0), 'b', label='Validation')
        ax.plot(epochs, test_curve, 'r', label='Test')
        self.title = '{} - {}'.format(self.__class__.__name__, label)
        plt.title(self.title, fontsize=18)
        plt.xlabel('Epochs', fontsize=14)
        plt.ylabel(label, fontsize=14)
        plt.legend(loc=legend_loc, prop={'size': 14})
        plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
        plt.show()

    @staticmethod
    def dr(y_true, y_pred):
        """Detection rate metric: tp / (tp + fn), on rounded tensors."""
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        tp = K.sum(y_pos * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        # K.epsilon() guards against division by zero.
        return tp / (tp + fn + K.epsilon())

    @staticmethod
    def far(y_true, y_pred):
        """False alarm rate metric: fp / (tn + fp), on rounded tensors."""
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        # K.epsilon() guards against division by zero.
        return fp / (tn + fp + K.epsilon())

    def get_model(self):
        """Return a compiled network: four 25-unit ReLU layers, sigmoid output."""
        model = models.Sequential()
        model.add(
            layers.Dense(25,
                         activation='relu',
                         input_shape=(self.n_features, )))
        model.add(layers.Dropout(0.08))
        # Three more identical hidden layers, each followed by dropout.
        for _ in range(3):
            model.add(layers.Dense(25, activation='relu'))
            model.add(layers.Dropout(0.08))
        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer=optimizers.RMSprop(lr=0.0023),
                      loss='binary_crossentropy',
                      metrics=['accuracy', self.dr, self.far])
        return model

    def log_file(self):
        """Toggle redirection of stdout to a log file.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (debug mode). Otherwise the first call opens the log
        file and redirects stdout; the next call restores stdout and closes
        the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Read the '_Tensor2d_type_2' file into ``self.X`` and print its shape."""
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_2')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        """Set ``self.y`` to the 'attack_category' column mapped to 0/1."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the 'attack_category' column from ``self.X`` in place."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split ``self.X`` / ``self.y`` 70/30 into train and test attributes."""
        # Inside the method the name resolves to the module-level function.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=0.30,
            random_state=self.random_state)

    def map_target_to_label(self, t):
        """Map each element of ``t`` to its 'good'/'bad' label."""
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        """Return the PNG path for a plot title inside ``self.folder``."""
        return '{}/{}.png'.format(self.folder, title)
class Modelling:
    """Train and score a set of classifiers on binary and multi-class targets.

    Instantiating the class runs the whole comparison: every model is paired
    with every classification type it has enabled, scored, and reported with
    a confusion matrix, accuracy and a classification report.
    """

    def __init__(self):
        """Run the comparison for every enabled (model, target type) pair."""
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity

        # Log-file redirection is currently switched off:
        # self.logfile = None
        # self.gettrace = getattr(sys, 'gettrace', None)
        # self.original_stdout = sys.stdout
        # self.timestr = time.strftime("%Y%m%d-%H%M%S")
        # self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.random_state = 20
        # Both int and str keys are accepted for every class id.
        self.label_multi = {
            0: 'normal',
            '0': 'normal',
            1: 'dos',
            '1': 'dos',
            2: 'u2r',
            '2': 'u2r',
            3: 'r2l',
            '3': 'r2l',
            4: 'probe',
            '4': 'probe'
        }
        self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSetting X and y'):
            self.set_X()
            self.n_features = self.X.shape[1]

        models = (RandomForestClf(), AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features), AnnMLPMulti(self.n_features))
        classification_type = ('Binary', 'Multi')

        for m, ctype in itertools.product(models, classification_type):
            # Prepare the target for this pairing; skip pairings the model
            # does not support.
            if ctype == 'Binary' and m.binary_enabled:
                self.set_y_binary()
            elif ctype == 'Multi' and m.multi_enabled:
                self.set_y_multi()
            else:
                continue

            with timer('\nTraining and scoring {} - {} target'.format(
                    m.__class__.__name__, ctype)):
                m.base['model'] = m.get_model()
                #self.train_test_split()
                m.score(self.X, self.y, ctype)

                # Normalise truth and prediction to integer Series ...
                results = (m.y_test, m.y_pred)
                for res in results:
                    res[ctype] = pd.Series(res[ctype])
                for res in results:
                    res[ctype] = res[ctype].astype(int)

                # ... then translate the class ids into readable labels.
                if ctype == 'Binary':
                    to_label = self.series_map_ac_binary_to_label
                else:
                    to_label = self.series_map_ac_multi_to_label
                for res in results:
                    res[ctype] = to_label(res[ctype])

                title = '{} - {} - {} '.format('CM', m.__class__.__name__,
                                               ctype)
                self.visualize.confusion_matrix(m.y_test[ctype],
                                                m.y_pred[ctype], title)
                self.scores(m.y_test[ctype], m.y_pred[ctype])

                # Append the scores to a scores array. I could then do an np.mean(scores) to get the mean(average) from all the kfolds
                # save the epoch number and gfold number if possible as well, to get a per/epoch score

        # self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a log file.

        Reads ``self.gettrace``, ``self.logfile``, ``self.original_stdout``
        and ``self.timestr``.
        """
        tracer = self.gettrace
        if tracer is None or tracer():
            return

        if not self.logfile:
            # Redirect stdout to file for logging if not in debug mode
            log_path = 'logs/{}_{}_stdout.txt'.format(
                self.__class__.__name__, self.timestr)
            self.logfile = open(log_path, 'w')
            sys.stdout = self.logfile
            return

        sys.stdout = self.original_stdout
        self.logfile.close()
        self.logfile = False

    def load_data(self):
        """Read the '_Tensor2d_type_1' file into ``self.full``."""
        source_name = self.ds.config['file'] + '_Tensor2d_type_1'
        self.full = self.filehandler.read_csv(self.ds.config['path'],
                                              source_name)

    def set_X(self):
        """Set ``self.X`` to every column except 'attack_category'."""
        feature_mask = self.full.columns != 'attack_category'
        self.X = self.full.loc[:, feature_mask]

    def set_y_binary(self):
        """Set ``self.y`` to a flat array of 0 (normal) / 1 (attack)."""
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_binary()
        self.y = self.y.values.ravel()

    def set_y_multi(self):
        """Set ``self.y`` to a flat array of class ids '0'..'4'."""
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_multi()
        self.y = self.y.values.ravel()

    def train_test_split(self):
        """Split ``self.X`` / ``self.y`` 70/30 into train and test attributes."""
        # Inside the method the name resolves to the module-level function.
        parts = train_test_split(self.X,
                                 self.y,
                                 test_size=0.30,
                                 random_state=self.random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = parts

    def df_map_ac_label_to_binary(self):
        """Replace category names in ``self.y`` with 0 (normal) or 1 (attack)."""
        category = self.y['attack_category']
        is_normal = category == 'normal'
        is_attack = ((category == 'dos') | (category == 'u2r') |
                     (category == 'r2l') | (category == 'probe'))
        self.y['attack_category'] = np.select([is_normal, is_attack], [0, 1])

    def df_map_ac_label_to_multi(self):
        """Replace category names in ``self.y`` with the strings '0'..'4'."""
        names = ('normal', 'dos', 'u2r', 'r2l', 'probe')
        conditions = [(self.y['attack_category'] == name) for name in names]
        self.y['attack_category'] = np.select(
            conditions,
            ['0', '1', '2', '3', '4'])  # string for get_dummies encoding

    def series_map_ac_multi_to_label(self, s):
        """Map a Series of multi-class ids to category names."""
        return s.map(self.label_multi)

    def series_map_ac_binary_to_label(self, s):
        """Map a Series of binary ids to 'good' / 'bad'."""
        return s.map(self.label_binary)

    def scores(self, y_test, y_pred):
        """Print the accuracy and the classification report."""
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy {}'.format(accuracy))
        report = classification_report(y_test, y_pred, digits=10)
        print('F1 {}'.format(report))
class Preprocessor:
    """Load the raw dataset and write up to three prepared variants of it.

    The raw columns are split by position into numerical and categorical
    features. Each ``prepare_output_variant_*`` method validates, imputes and
    encodes a copy of those features and hands the result to the file
    handler.
    """

    # Column index at which the categorical features start in the raw
    # dataset; the columns before it are treated as numerical.
    NUMERICAL_FEATURE_COUNT = 190

    def __init__(self, envparm):
        """Run the preprocessing selected by ``envparm``.

        Args:
            envparm: mapping with the boolean keys 'PlotGraphs',
                'ProcessDS01', 'ProcessDS02' and 'ProcessDS03'.
        """
        self.dataset_raw = None
        self.numerical_features_raw = None
        self.categorical_features_raw = None
        self.filehandler = None
        self.preprocess(envparm)

    @staticmethod
    def _drop_flagged_features(dataset, should_drop, message, threshold):
        """Drop, in place, every column for which ``should_drop`` is true.

        The columns are collected first and dropped in one call, so the frame
        is never modified while it is being iterated.

        Args:
            dataset: DataFrame to prune (mutated in place).
            should_drop: callable taking one column and returning a truth value.
            message: log format with slots for the threshold and the names.
            threshold: value reported in the log message.

        Returns:
            The same ``dataset`` object.
        """
        flagged = [col for col in dataset if should_drop(dataset[col])]
        dataset.drop(flagged, inplace=True, axis=1)
        # Each name is followed by a space, matching the established log format.
        dropped_names = ''.join('{} '.format(col) for col in flagged)
        logging.info(message.format(threshold, dropped_names))
        return dataset

    @staticmethod
    def drop_features_min_unique(dataset, min_threshold):
        """Drop columns with at most ``min_threshold`` unique values.

        The frame is mutated in place and also returned.
        """
        return Preprocessor._drop_flagged_features(
            dataset,
            lambda column: len(column.unique()) <= min_threshold,
            'Features dropped with unique value count <= {} - {}',
            min_threshold)

    @staticmethod
    def drop_features_max_unique(dataset, max_threshold):
        """Drop columns with at least ``max_threshold`` unique values.

        The frame is mutated in place and also returned.
        """
        return Preprocessor._drop_flagged_features(
            dataset,
            lambda column: len(column.unique()) >= max_threshold,
            'Features dropped with unique value count >= {} - {}',
            max_threshold)

    @staticmethod
    def drop_features_max_null(dataset, max_threshold):
        """Drop columns with at least ``max_threshold`` null values.

        The frame is mutated in place and also returned.
        """
        return Preprocessor._drop_flagged_features(
            dataset,
            # Vectorised count instead of a Python-level sum over the column.
            lambda column: column.isnull().sum() >= max_threshold,
            'Features dropped with null value count >= {} - {}',
            max_threshold)

    @staticmethod
    def plot_num_obs_missing_values(dataset):
        """Show a count plot of features binned by their number of nulls."""
        df = pd.DataFrame(data=dataset.isnull().sum(), columns=['Count'])
        df['bin'] = pd.cut(df['Count'], [
            -1, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000,
            4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000
        ],
                           labels=[
                               '0-10', '10-20', '20-30', '30-40', '40-50',
                               '50-100', '100-200', '200-300', '300-400',
                               '400-500', '500-1K', '1K-2K', '2K-3K', '3K-4K',
                               '4K-5K', '5K-6K', '6K-7K', '7K-8K', '8K-9K',
                               '9K-10K', '10K-50K'
                           ])
        countplot = sns.countplot(y="bin", data=df)
        countplot.set(ylabel="Observations With Null Values",
                      xlabel="Feature Count",
                      title="Observations With Null Values Per Feature")
        plt.show()

    @staticmethod
    def impute_numeric_feature_with_zero(dataset):
        """Replace nulls with 0, in place."""
        dataset.fillna(0, inplace=True)

    @staticmethod
    def impute_categorical_feature_with_blank(dataset):
        """Replace nulls with the empty string, in place."""
        dataset.fillna('', inplace=True)

    def _validated_features(self):
        """Return copies of the raw features without single-valued columns."""
        numerical_features = self.numerical_features_raw.copy()
        logging.info('Validating numerical features')
        numerical_features = self.drop_features_min_unique(
            numerical_features, 1)
        logging.info('Validating categorical features')
        categorical_features = self.categorical_features_raw.copy()
        categorical_features = self.drop_features_min_unique(
            categorical_features, 1)
        return numerical_features, categorical_features

    @staticmethod
    def _label_encode(categorical_features):
        """Impute nulls with 'missing' and label encode every column."""
        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)
        # Random Forest needs the categorical features encoding otherwise string to float error
        logging.info('Label encoding categorical features')
        encoder = LabelEncoder()
        return categorical_features.apply(encoder.fit_transform)

    def _write_variant(self, path, numerical_features, categorical_features,
                       variant):
        """Write one prepared variant and log its final shape."""
        dataset_output = self.filehandler.output_prep_dataset(
            path, numerical_features, categorical_features)
        logging.info('Dataset size after feature transformation - {}'.format(
            dataset_output.shape))
        logging.info('Completed Preparing output variant ' + variant)

    def prepare_output_variant_01(self):
        """Variant 01: numerical nulls become the column mean; label encoding."""
        logging.info('Preparing output variant 01')
        numerical_features, categorical_features = self._validated_features()
        logging.info('Imputing numerical features with mean')
        numerical_features.fillna(numerical_features.mean(), inplace=True)
        encoded = self._label_encode(categorical_features)
        self._write_variant(self.filehandler.dataset_prep_path_01,
                            numerical_features, encoded, '01')

    def prepare_output_variant_02(self):
        """Variant 02: numerical nulls become zero; label encoding."""
        logging.info('Preparing output variant 02')
        numerical_features, categorical_features = self._validated_features()
        logging.info('Imputing numerical features with zero')
        numerical_features.fillna(0, inplace=True)
        encoded = self._label_encode(categorical_features)
        self._write_variant(self.filehandler.dataset_prep_path_02,
                            numerical_features, encoded, '02')

    def prepare_output_variant_03(self):
        """Variant 03: numerical nulls become -9876; one-hot encoding.

        Categorical features with 11 or more unique values are dropped before
        encoding.
        """
        logging.info('Preparing output variant 03')
        numerical_features, categorical_features = self._validated_features()
        categorical_features = self.drop_features_max_unique(
            categorical_features, 11)
        logging.info('Imputing numerical features with -9876')
        numerical_features.fillna(-9876, inplace=True)
        encoded = self._label_encode(categorical_features)
        # One hot encoding results in memory error due to wide dataset
        onehotencoder = OneHotEncoder()
        one_hot = onehotencoder.fit_transform(encoded).toarray()
        one_hot_df = pd.DataFrame(one_hot)
        self._write_variant(self.filehandler.dataset_prep_path_03,
                            numerical_features, one_hot_df, '03')

    def preprocess(self, envparm):
        """Load the raw data, split it by column position and run each step.

        Args:
            envparm: mapping with the boolean keys 'PlotGraphs',
                'ProcessDS01', 'ProcessDS02' and 'ProcessDS03'.
        """
        self.filehandler = Filehandler()
        self.dataset_raw = self.filehandler.read_csv(
            self.filehandler.data_raw_path)
        logging.info('Original raw dataset loaded - dataset size {}'.format(
            self.dataset_raw.shape))

        split = self.NUMERICAL_FEATURE_COUNT
        logging.info('Partitioning numerical features')
        self.numerical_features_raw = self.dataset_raw.iloc[:, 0:split].copy()
        logging.info('Partitioning categorical features')
        self.categorical_features_raw = self.dataset_raw.iloc[:, split:].copy()

        if envparm['PlotGraphs']:
            # Only the first columns are plotted for the numerical features.
            num_size = 28
            sample_df = self.dataset_raw.iloc[:, :num_size].copy()
            visualizer.matrix_missing(
                sample_df,
                'Data Completion First ' + str(num_size) + ' Numeric Features')
            visualizer.bar_missing(
                sample_df,
                'Nullity Count First ' + str(num_size) + ' Numeric Features')
            visualizer.heat_missing(
                sample_df, 'Nullity Correlation Of First ' + str(num_size) +
                ' Numeric Features')
            sample_df = self.dataset_raw.iloc[:, split:].copy()
            visualizer.matrix_missing(sample_df,
                                      'Data Completion Categorical Features')
            visualizer.bar_missing(sample_df,
                                   'Nullity Count Categorical Features')
            visualizer.heat_missing(
                sample_df, 'Nullity Correlation Of Categorical Features')

        if envparm['ProcessDS01']:
            self.prepare_output_variant_01()
        if envparm['ProcessDS02']:
            self.prepare_output_variant_02()
        if envparm['ProcessDS03']:
            self.prepare_output_variant_03()
class FeatureSelection:
    """Compare feature selectors by scoring their output with XGBoost.

    Instantiating the class runs the whole comparison: the processed dataset
    is loaded, encoded and scaled, each selector is fitted for both target
    labels, and the selected features are scored with cross-validation.
    """

    def __init__(self):
        """Run the comparison for every selector and target label."""
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None
        self.random_state = 20
        self.num_features = 15
        # Columns passed through the MinMaxScaler in encode_scale().
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
            'urgent', 'hot', 'num_failed_logins', 'logged_in',
            'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate'
        ]

        with timer('\nLoading dataset'):
            self.load_data()
            self.encode_scale()
            self.set_X()

        with timer('\nFeature selection'):
            # KernelPCASelector() is currently disabled.
            selectors = (Original(), UnivariateSelector(), RecursiveSelector(),
                         PCASelector(), ExtraTreesSelector(),
                         RandomForestSelector())
            for selector in selectors:
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    with timer('\nFitting selector ' +
                               selector.__class__.__name__):
                        selector.fit_model(self.X, self.y)
                        x = selector.get_top_features(self.X, label)
                    with timer('\nXGBoost scoring of features selected by ' +
                               selector.__class__.__name__):
                        self.score_with_xgboost(x, self.y, selector.title)

        self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a log file.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (debug mode). Otherwise the first call opens the log
        file and redirects stdout; the next call restores stdout and closes
        the file.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Read the processed dataset and target and join them column-wise."""
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        print(self.ds.dataset.columns)
        self.ds.row_count_by_target('attack_category')

    def encode_scale(self):
        """Label encode the categorical columns and min-max scale the rest."""
        # Encode categoricals; fit_transform refits the encoder per column.
        le = preprocessing.LabelEncoder()
        for column in ('protocol_type', 'service', 'flag'):
            self.full[column] = le.fit_transform(self.full[column])

        # Scale
        sc = MinMaxScaler()
        self.full[self.scale_cols] = sc.fit_transform(
            self.full[self.scale_cols])

    def set_X(self):
        """Set ``self.X`` to every column of ``self.full`` except the last two."""
        self.X = self.full.iloc[:, :-2]

    def set_y(self, label):
        """Set ``self.y`` to the column of ``self.full`` named ``label``."""
        self.y = self.full[label]

    def score_with_xgboost(self, x, y, title):
        """Cross-validate an XGBoost classifier and plot its confusion matrix.

        Args:
            x: feature matrix to score.
            y: target labels.
            title: title passed to the confusion-matrix plot.
        """
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        # random_state only takes effect together with shuffle=True; without
        # it current scikit-learn rejects the combination.
        kfold = StratifiedKFold(n_splits=10,
                                shuffle=True,
                                random_state=self.random_state)
        results = cross_val_score(clf, x, y, cv=kfold)
        print("XGBoost Accuracy: %.2f%% (+/- %.2f%%)" %
              (results.mean() * 100, results.std() * 100))
        # Reuse the same folds so the confusion matrix matches the scores.
        y_pred = cross_val_predict(clf, x, y, cv=kfold)
        self.visualize.confusion_matrix(y, y_pred, title)
def __init__(self):
    """Run the sampling experiment end to end.

    For each sampler the selected feature columns are one-hot encoded,
    resampled against the ``attack_category`` label, quantile-transformed,
    and scored with ``model_and_score`` for both the multi-class
    ``attack_category`` label and the binary ``target`` label.
    """
    # Logging state; log_file() toggles stdout redirection on each call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.visualize = Visualize()
    self.random_state = 20
    self.X = None
    self.y = None
    self.full = None

    # RF Feature selected plus sparse cols
    self.cols = [
        'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count', 'flag',
        'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate', 'service',
        'dst_host_count', 'dst_host_srv_diff_host_rate', 'logged_in',
        'protocol_type', 'dst_host_same_src_port_rate', 'hot', 'srv_count',
        'wrong_fragment', 'num_compromised', 'rerror_rate',
        'srv_diff_host_rate', 'urgent', 'num_failed_logins', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'is_guest_login'
    ]

    with timer('\nLoading dataset'):
        self.load_data()

    # Label corrected from "Scaling": this loop's primary step is resampling.
    with timer('\nSampling'):
        # Sampling options
        for sampler in (Original(),
                        RandomOverSampler(),
                        SMOTE(random_state=self.random_state),
                        ADASYN(random_state=self.random_state),
                        BorderlineSMOTE(random_state=self.random_state,
                                        kind='borderline-1')):
            # Copy so that adding the target column writes to an independent
            # frame rather than to a slice of self.full.
            self.X = self.full.loc[:, self.cols].copy()
            self.X['target'] = self.full['target']
            print('X shape with selected features and binary - ', self.X.shape)
            self.X = pd.get_dummies(
                data=self.X, columns=['protocol_type', 'service', 'flag'])
            print('X shape after encoding categoricals - ', self.X.shape)

            # Re-sample based on attack_category labels; the binary target
            # rides along as a feature column so it is resampled consistently.
            res_x, res_y_attack_category, title = self.sample(
                sampler, self.X, self.full['attack_category'])
            # Grab target as y from the resampled x set and remove it from x.
            res_y_target = res_x.pop('target')
            print('X shape after sampling and removing target - ', res_x.shape)
            print('y shape with attack_category after resample - ',
                  res_y_attack_category.shape)
            print(res_y_attack_category.value_counts())
            res_y_attack_category.value_counts().plot(
                kind='bar',
                title=title + ' - Resampled Count (attack_category)')
            plt.show()
            print('y shape with target after resample - ', res_y_target.shape)

            # Scale after resampling
            qt = QuantileTransformer(output_distribution='normal')
            res_x = qt.fit_transform(res_x)
            print('X shape after scaling - ', res_x.shape)

            # Score on attack_category multi-class
            self.model_and_score(res_x, res_y_attack_category, title,
                                 'attack_category')
            # Score on binary target
            self.model_and_score(res_x, res_y_target, title, 'target')

    self.log_file()
    print('Finished')
class Sampling:
    """Compare resampling strategies on the processed dataset, scoring each with XGBoost.

    Constructing an instance runs the whole experiment: for each sampler the
    selected feature columns are one-hot encoded, resampled against
    ``attack_category``, quantile-transformed and cross-validated for both
    the ``attack_category`` and ``target`` labels.
    """

    def __init__(self):
        # Logging state; log_file() toggles stdout redirection on each call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None

        # RF Feature selected plus sparse cols
        self.cols = [
            'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count',
            'flag', 'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate',
            'service', 'dst_host_count', 'dst_host_srv_diff_host_rate',
            'logged_in', 'protocol_type', 'dst_host_same_src_port_rate',
            'hot', 'srv_count', 'wrong_fragment', 'num_compromised',
            'rerror_rate', 'srv_diff_host_rate', 'urgent',
            'num_failed_logins', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login'
        ]

        with timer('\nLoading dataset'):
            self.load_data()

        # Label corrected from "Scaling": this loop's primary step is resampling.
        with timer('\nSampling'):
            # Sampling options
            for sampler in (Original(),
                            RandomOverSampler(),
                            SMOTE(random_state=self.random_state),
                            ADASYN(random_state=self.random_state),
                            BorderlineSMOTE(random_state=self.random_state,
                                            kind='borderline-1')):
                # Copy so that adding the target column writes to an
                # independent frame rather than to a slice of self.full.
                self.X = self.full.loc[:, self.cols].copy()
                self.X['target'] = self.full['target']
                print('X shape with selected features and binary - ', self.X.shape)
                self.X = pd.get_dummies(
                    data=self.X, columns=['protocol_type', 'service', 'flag'])
                print('X shape after encoding categoricals - ', self.X.shape)

                # Re-sample based on attack_category labels; the binary target
                # rides along as a feature so it is resampled consistently.
                res_x, res_y_attack_category, title = self.sample(
                    sampler, self.X, self.full['attack_category'])
                # Grab target as y from the resampled x set and remove it from x.
                res_y_target = res_x.pop('target')
                print('X shape after sampling and removing target - ', res_x.shape)
                print('y shape with attack_category after resample - ',
                      res_y_attack_category.shape)
                print(res_y_attack_category.value_counts())
                res_y_attack_category.value_counts().plot(
                    kind='bar',
                    title=title + ' - Resampled Count (attack_category)')
                plt.show()
                print('y shape with target after resample - ', res_y_target.shape)

                # Scale after resampling
                qt = QuantileTransformer(output_distribution='normal')
                res_x = qt.fit_transform(res_x)
                print('X shape after scaling - ', res_x.shape)

                # Score on attack_category multi-class
                self.model_and_score(res_x, res_y_attack_category, title,
                                     'attack_category')
                # Score on binary target
                self.model_and_score(res_x, res_y_target, title, 'target')

        self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under ``logs/``.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (i.e. when running under a debugger). Otherwise the
        first call opens the log file and redirects stdout to it; the next
        call restores the original stdout and closes the file.
        """
        import os  # local import: the file's top-level imports are not visible here

        if self.gettrace is None or self.gettrace():
            return
        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return
        # Create the target directory first so a fresh checkout does not crash.
        os.makedirs('logs', exist_ok=True)
        self.logfile = open(
            'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                           self.timestr), 'w')
        sys.stdout = self.logfile

    def load_data(self):
        """Read the ``_processed`` and ``_target`` CSVs and join them column-wise into ``self.full``."""
        path = self.ds.config['path']
        base = self.ds.config['file']
        self.ds.dataset = self.filehandler.read_csv(path, base + '_processed')
        self.ds.target = self.filehandler.read_csv(path, base + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_y(self, label):
        """Select column ``label`` of ``self.full`` as the current target."""
        self.y = self.full[label]

    def sample(self, sampler, X, y):
        """Resample ``X``/``y`` with ``sampler`` and return pandas objects.

        Args:
            sampler: Object exposing ``fit_resample(X, y)``.
            X: Feature DataFrame; its columns label an ndarray result.
            y: Labels to resample against.

        Returns:
            Tuple ``(res_x, res_y, title)`` where ``title`` is the sampler's
            class name. ndarray results are wrapped in a DataFrame / Series.
        """
        title = sampler.__class__.__name__
        res_x, res_y = sampler.fit_resample(X, y)
        # Some samplers hand back bare arrays; restore the column labels.
        if isinstance(res_x, np.ndarray):
            res_x = pd.DataFrame(res_x, columns=X.columns)
        if isinstance(res_y, np.ndarray):
            res_y = pd.Series(res_y)

        print('Shape after sampling with {} - x {}, y {}'.format(
            title, res_x.shape, res_y.shape))
        return res_x, res_y, title

    def model_and_score(self, X, y, title, label):
        """Cross-validate an XGBoost classifier and plot its confusion matrix.

        Args:
            X: Feature matrix.
            y: Target labels aligned with ``X``.
            title: Sampler name used in the printed line and plot title.
            label: Name of the target column being scored.
        """
        clf = XGBClassifier(n_estimators=50, random_state=self.random_state)
        # shuffle=True is required for random_state to take effect; recent
        # scikit-learn raises ValueError when random_state is set without it.
        kfold = StratifiedKFold(n_splits=5, shuffle=True,
                                random_state=self.random_state)
        results = cross_val_score(clf, X, y, cv=kfold)
        # Predict on the same folds that produced the accuracy above.
        y_pred = cross_val_predict(clf, X, y, cv=kfold)
        # The original format string was missing its closing "%)".
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
            title, label, results.mean() * 100, results.std() * 100))
        self.visualize.confusion_matrix(
            y, y_pred, '{} - {} - Label {}'.format(title,
                                                   clf.__class__.__name__,
                                                   label))
class Scaling:
    """Compare feature scalers on the processed dataset, scoring each with XGBoost.

    Constructing an instance runs the whole experiment: data is loaded,
    categoricals are label-encoded, the distribution of ``scale_cols`` is
    plotted, and each scaler's output is cross-validated against both the
    ``attack_category`` and ``target`` labels.
    """

    def __init__(self):
        # Logging state; log_file() toggles stdout redirection on each call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None
        self.ac_count = {}
        self.scores = OrderedDict()
        # Columns that are scaled and used as the feature matrix.
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
            'urgent', 'hot', 'num_failed_logins', 'logged_in',
            'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate'
        ]

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()

        with timer('\nEncoding categoricals'):
            encoder = preprocessing.LabelEncoder()
            # fit_transform refits the encoder each time, so one instance can be reused.
            for column in ('protocol_type', 'service', 'flag'):
                self.full[column] = encoder.fit_transform(self.full[column])

        with timer('\nSetting X'):
            self.set_X()
            self.ds.shape()

        with timer('\nDistribution Before Scaling'):
            self.dist_before_scaling()

        with timer('\nScaling'):
            for scaler in (StandardScaler(),
                           Normalizer(),
                           MinMaxScaler(feature_range=(0, 1)),
                           Binarizer(threshold=0.0),
                           RobustScaler(quantile_range=(25, 75)),
                           PowerTransformer(method='yeo-johnson'),
                           QuantileTransformer(output_distribution='normal')):
                title, res_x = self.scale(scaler)
                # Score the same scaled features against both labels.
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    self.model_and_score(scaler, res_x, title, label)

        self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under ``logs/``.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (i.e. when running under a debugger). Otherwise the
        first call opens the log file and redirects stdout to it; the next
        call restores the original stdout and closes the file.
        """
        import os  # local import: the file's top-level imports are not visible here

        if self.gettrace is None or self.gettrace():
            return
        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return
        # Create the target directory first so a fresh checkout does not crash.
        os.makedirs('logs', exist_ok=True)
        self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
        sys.stdout = self.logfile

    def load_data(self):
        """Read the ``_processed`` and ``_target`` CSVs and join them column-wise into ``self.full``."""
        path = self.ds.config['path']
        base = self.ds.config['file']
        self.ds.dataset = self.filehandler.read_csv(path, base + '_processed')
        self.ds.target = self.filehandler.read_csv(path, base + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_attack_category_count(self):
        """Record the row count of each ``attack_category`` value in ``self.ac_count``."""
        self.ac_count.update(self.full['attack_category'].value_counts().items())

    def set_X(self):
        """Use the ``scale_cols`` columns of ``self.full`` as the feature matrix."""
        self.X = self.full.loc[:, self.scale_cols]

    def set_y(self, label):
        """Select column ``label`` of ``self.full`` as the current target."""
        self.y = self.full[label]

    def dist_before_scaling(self):
        """Plot the KDE of the unscaled feature columns."""
        self.visualize.kdeplot('Distribution Before Scaling', self.X, self.scale_cols)

    def scale(self, scaler):
        """Fit ``scaler`` on the feature columns, plot the result and return it.

        Args:
            scaler: Object exposing ``fit_transform``.

        Returns:
            Tuple ``(title, res_x)``: the plot title and the scaled features
            as a DataFrame with ``scale_cols`` as columns.
        """
        x = self.X[self.scale_cols]
        # Warnings raised while fitting the scaler are deliberately silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res_x = scaler.fit_transform(x)

        res_x = pd.DataFrame(res_x, columns=self.scale_cols)
        title = 'Distribution After ' + scaler.__class__.__name__
        self.visualize.kdeplot(title, res_x, self.scale_cols)
        return title, res_x

    def model_and_score(self, scaler, res_x, title, label):
        """Cross-validate an XGBoost classifier on scaled features against ``self.y``.

        Args:
            scaler: Scaler that produced ``res_x`` (not used in the body; kept
                for interface compatibility).
            res_x: Scaled feature DataFrame.
            title: Title prefix for the printed line and the plot.
            label: Name of the target column currently held in ``self.y``.
        """
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        # shuffle=True is required for random_state to take effect; recent
        # scikit-learn raises ValueError when random_state is set without it.
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_state)
        results = cross_val_score(clf, res_x, self.y, cv=kfold)
        # Predict on the same folds that produced the accuracy above.
        y_pred = cross_val_predict(clf, res_x, self.y, cv=kfold)
        # The original format string was missing its closing "%)".
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
            title, label, results.mean() * 100, results.std() * 100))
        self.visualize.confusion_matrix(
            self.y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))
def __init__(self):
    """Visualise a hyperparameter search against a logistic-regression baseline.

    Fits a baseline ``LogisticRegression``, derives a baseline rate
    ``self.dr`` from its confusion matrix, loads the search results into
    ``self.hyp`` and saves/shows a correlation heatmap, one boxplot per
    hyperparameter and four pairwise heatmaps of ``val_dr`` change.
    """
    # Logging state; log_file() toggles stdout redirection on each call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.n_classes = 5
    self.random_state = 20
    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.folder = 'viz'
    self.fprefix_multi = 'Hyper - annMLPMulti - '

    # Datasets
    self.X = None
    self.y = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.hyp = None
    self.lr = None
    self.label_map_string_2_int = {
        'normal': 0,
        'dos': 1,
        'u2r': 2,
        'r2l': 3,
        'probe': 4
    }
    self.max_iters = 100

    def save_and_show(title):
        # Persist the current figure under the name derived from its title, then display it.
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def boxplot(metric, title, xlabel, figsize=(10, 6)):
        # Box plot of val_dr_change grouped by one hyperparameter.
        plt.clf()
        plt.subplots(figsize=figsize)
        sns.boxplot(x=metric,
                    y='val_dr_change',
                    data=self.hyp.reset_index(),
                    color=self.color)
        plt.title(title, fontsize=16)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Validation DR Change', fontsize=12)
        save_and_show(title)

    def heatmap(rows, cols, title, xlabel, ylabel, title_size=16):
        # Heatmap of mean val_dr_change for every (rows, cols) combination.
        plt.clf()
        plt.subplots(figsize=(9, 7))
        df_grid = self.hyp.reset_index().groupby(
            [rows, cols]).val_dr_change.mean().unstack()
        sns.heatmap(data=df_grid,
                    cmap=(sns.diverging_palette(10, 220, sep=80, n=7)),
                    annot=True,
                    cbar=False)
        plt.title(title, fontsize=title_size)
        plt.xlabel(xlabel, fontsize=10)
        plt.ylabel(ylabel, fontsize=10)
        save_and_show(title)

    with timer('\nPreparing dataset'):
        self.load_data()
        self.set_y()
        self.remove_target_from_X()
        self.train_test_split()

    with timer('\nPreparing base logistic regression'):
        self.lr = LogisticRegression(max_iter=self.max_iters)
        self.lr.fit(self.X_train, self.y_train)

    with timer('\nPreparing confusion matrix and base DR'):
        self.y_pred = self.lr.predict(self.X_test)
        cm = confusion_matrix(self.y_test, self.y_pred)
        self.tp = self.get_tp_from_cm(cm)
        self.tn = self.get_tn_from_cm(cm)
        self.fp = self.get_fp_from_cm(cm)
        self.fn = self.get_fn_from_cm(cm)
        # NOTE(review): tp / (tp + fp) is precision; detection rate is
        # usually tp / (tp + fn). Left unchanged -- confirm the intent.
        self.dr = self.tp / (self.tp + self.fp)
        print('log reg dr', self.dr)

    with timer('\nVisualising optimisation search'):
        self.load_hyp()
        self.hyp['lr'] = round(self.hyp['lr'] / 1000, 3)

        # Hyperparameter correlation with val DR
        # NOTE(review): hyp_val_dr aliases self.hyp, so the in-place drop
        # below also removes these columns from self.hyp. Kept as-is to
        # preserve the existing output -- confirm whether a copy was meant.
        self.hyp_val_dr = self.hyp
        self.hyp_val_dr.drop([
            'round_epochs', 'epochs', 'loss', 'dr', 'far', 'acc', 'val_loss',
            'val_acc', 'val_far'
        ], axis=1, inplace=True)
        self.dr_corr = self.hyp_val_dr.corr()

        plt.clf()
        fig, ax = plt.subplots(figsize=(10, 10))
        title = 'Validation DR Hyperparameter Correlation'
        ax.set_title(title, size=16)
        colormap = sns.diverging_palette(220, 10, as_cmap=True)
        sns.heatmap(self.dr_corr,
                    cmap=colormap,
                    annot=True,
                    fmt=".2f",
                    cbar=False,
                    vmin=-0.4,
                    vmax=0.4)
        plt.xticks(range(len(self.dr_corr.columns)), self.dr_corr.columns)
        plt.yticks(range(len(self.dr_corr.columns)), self.dr_corr.columns)
        save_and_show(title)

        # Change in validation DR relative to the logistic-regression baseline.
        self.hyp['val_dr_change'] = round(self.hyp.val_dr - self.dr, 3)
        pd.set_option('display.max_columns', 100)
        print(self.hyp.sort_values(by='val_dr', ascending=False).head())

        self.color = 'cornflowerblue'

        boxplot('lr',
                'Validation DR Change Over Baseline As Fn Of Learning Rate',
                'Learning Rate')
        boxplot('first_neuron',
                'Validation DR Change Over Baseline As Fn Of # Neurons First Layer',
                'First Neuron')
        boxplot('hidden_layers',
                'Validation DR Change Over Baseline As Fn Of # Hidden Layers',
                'Hidden Layers')
        boxplot('hidden_neuron',
                'Validation DR Change Over Baseline As Fn Of # Hidden Layer Neurons',
                'Hidden Neurons')
        boxplot('batch_size',
                'Validation DR Change Over Baseline As Fn Of Batch Size',
                'Batch Size')
        # The dropout plot is the only one drawn on a larger canvas.
        boxplot('dropout',
                'Validation DR Change Over Baseline As Fn Of Dropout',
                'Dropout',
                figsize=(12, 8))

        # The first heatmap uses a smaller title font than the others.
        heatmap('first_neuron', 'hidden_neuron',
                'Validation DR Change Over Baseline As Fn Of First Neuron & Hidden Neuron',
                'Hidden Neuron', 'First Neuron', title_size=12)
        heatmap('hidden_layers', 'hidden_neuron',
                'Validation DR Change Over Baseline As Fn Of Hidden Layers & Hidden Neuron',
                'Hidden Neuron', 'Hidden Layers')
        heatmap('batch_size', 'dropout',
                'Validation DR Change Over Baseline As Fn Of Batch Size & Dropout',
                'Dropout', 'Batch Size')
        heatmap('lr', 'dropout',
                'Validation DR Change Over Baseline As Fn Of Learning Rate & Dropout',
                'Dropout', 'Learning Rate')

    self.log_file()
    print('Finished')
class Linearity:
    """Explore linear separability of the dataset with scatter plots, hulls and classifiers.

    Constructing an instance runs the whole experiment: data is loaded,
    label-encoded and standard-scaled, then sampled per ``attack_category``
    and passed to the scatter, convex-hull and classifier visualisations.
    """

    # Feature pairs plotted by scatter() and convex_hull(), in plot order.
    # NOTE(review): ('dst_host_srv_count', 'dst_bytes') appears twice, so that
    # plot is drawn twice; kept to preserve the existing output.
    _PLOT_PAIRS = (
        ('src_bytes', 'dst_bytes'),
        ('count', 'diff_srv_rate'),
        ('duration', 'src_bytes'),
        ('dst_host_srv_count', 'dst_bytes'),
        ('serror_rate', 'rerror_rate'),
        ('dst_host_srv_count', 'dst_bytes'),
        ('srv_diff_host_rate', 'srv_count'),
    )

    def __init__(self):
        # Logging state; log_file() toggles stdout redirection on each call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.sample = None
        self.full = None
        self.ac_count = {}
        # Columns passed to StandardScaler below.
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
            'urgent', 'hot', 'num_failed_logins', 'logged_in',
            'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate'
        ]
        # Fraction of each attack category kept by sample_dataset().
        self.full_weights = {
            'normal': 1,
            'dos': 1,
            'probe': 1,
            'u2r': 1,
            'r2l': 1
        }
        self.minimal_weights = {
            'normal': 0.01,
            'dos': 0.01,
            'probe': 0.2,
            'u2r': 0.5,
            'r2l': 0.5
        }

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()
            self.ds.shape()

        with timer('\nEncode and Scale dataset'):
            # Encode categoricals; fit_transform refits, so one encoder is reused.
            le = preprocessing.LabelEncoder()
            for column in ('protocol_type', 'service', 'flag'):
                self.full[column] = le.fit_transform(self.full[column])

            # Scale
            sc = StandardScaler()
            self.full[self.scale_cols] = sc.fit_transform(
                self.full[self.scale_cols])

        with timer('\nPlotting scatter graphs'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.scatter()

        with timer('\nPlotting scatter graphs with convex hull'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.convex_hull()

        with timer('\nPlotting linear separability with classifiers'):
            self.sample_dataset(self.minimal_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.classifiers()

        self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under ``logs/``.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (i.e. when running under a debugger). Otherwise the
        first call opens the log file and redirects stdout to it; the next
        call restores the original stdout and closes the file.
        """
        import os  # local import: the file's top-level imports are not visible here

        if self.gettrace is None or self.gettrace():
            return
        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return
        # Create the target directory first so a fresh checkout does not crash.
        os.makedirs('logs', exist_ok=True)
        self.logfile = open(
            'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                           self.timestr), 'w')
        sys.stdout = self.logfile

    def scatter(self):
        """Draw a scatter plot of ``self.X`` for every feature pair, coloured by ``target``."""
        for cola, colb in self._PLOT_PAIRS:
            self.visualize.scatter(self.X, cola=cola, colb=colb, hue='target')

    def convex_hull(self):
        """Draw a convex-hull plot of ``self.X`` for every feature pair, one hull per value of ``self.y``."""
        buckets = self.y.unique()
        for cola, colb in self._PLOT_PAIRS:
            self.visualize.convex_hull(self.X,
                                       buckets,
                                       cola=cola,
                                       colb=colb,
                                       target='target')

    def load_data(self):
        """Read the ``_processed`` and ``_target`` CSVs and join them column-wise into ``self.full``."""
        path = self.ds.config['path']
        base = self.ds.config['file']
        self.ds.dataset = self.filehandler.read_csv(path, base + '_processed')
        self.ds.target = self.filehandler.read_csv(path, base + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def set_attack_category_count(self):
        """Record the row count of each ``attack_category`` value in ``self.ac_count``."""
        self.ac_count.update(self.full['attack_category'].value_counts().items())

    def set_X_y(self, target):
        """Use the current sample as ``X`` and its ``target`` column as ``y``.

        Note that ``X`` is the whole sample, so it still contains the target
        column (the plotting helpers colour by it).
        """
        print('Setting X, with y as {}'.format(target))
        self.X = self.sample
        self.y = self.sample[target]

    def sample_dataset(self, weights):
        """Draw a seeded random sample of each attack category into ``self.sample``.

        Args:
            weights: Mapping of attack category to the fraction of that
                category's rows (as counted in ``self.ac_count``) to keep.
        """
        print('Sampling dataset with weights {}'.format(weights))
        frames = [
            self.full[self.full.attack_category == key].sample(
                int(value * weights[key]), random_state=self.random_state)
            for key, value in self.ac_count.items()
        ]
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids re-copying the accumulated frame on every iteration.
        self.sample = pd.concat(frames) if frames else pd.DataFrame()

    def classifiers(self):
        """Fit three classifiers on selected feature pairs and plot boundaries and confusion matrices.

        Each model is fitted and evaluated on the same rows, so the confusion
        matrices describe training fit rather than held-out performance.
        """
        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(self.y)
        _y = self.y

        models = (Perceptron(max_iter=100,
                             tol=1e-3,
                             random_state=self.random_state),
                  LinearSVC(max_iter=500,
                            random_state=self.random_state,
                            tol=1e-5),
                  SVC(kernel='rbf',
                      gamma=5,
                      C=10.0,
                      random_state=self.random_state))
        titles = ('Perceptron', 'LinearSVC (linear kernel)',
                  'SVC with RBF kernel')
        columns = [('srv_diff_host_rate', 'srv_count'),
                   ('dst_host_srv_count', 'count'),
                   ('dst_host_srv_count', 'dst_bytes')]

        for clf, title in zip(models, titles):
            for cola, colb in columns:
                _x = self.X.loc[:, [cola, colb]]
                clf.fit(_x, _y)
                _y_pred = clf.predict(_x)
                self.visualize.boundary(_x, _y, clf, title, cola, colb)
                self.visualize.confusion_matrix(
                    _y, _y_pred, title + ' - ' + cola + ' vs ' + colb)
def __init__(self):
    """Train and score every enabled model/target-type combination.

    Each model is paired with the 'Binary' and 'Multi' classification
    types; combinations the model does not enable are skipped. For the
    rest, the model is scored, its test/predicted labels are mapped to
    strings, and a confusion matrix and scores are produced.
    """
    # Ignore low level instruction warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    # Set tensorflow verbosity
    tf.logging.set_verbosity(tf.logging.ERROR)

    # Redirecting stdout to a log file is disabled for this run.
    print(__doc__)

    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.visualize = Visualize()
    self.full = None
    self.X = None
    self.y = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.n_features = None
    self.random_state = 20
    # Integer (or digit-string) class id -> readable label.
    self.label_multi = {
        0: 'normal', '0': 'normal',
        1: 'dos', '1': 'dos',
        2: 'u2r', '2': 'u2r',
        3: 'r2l', '3': 'r2l',
        4: 'probe', '4': 'probe'
    }
    self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

    with timer('\nLoading dataset'):
        self.load_data()

    with timer('\nSetting X and y'):
        self.set_X()
        self.n_features = self.X.shape[1]

    candidates = (RandomForestClf(),
                  AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features),
                  AnnMLPMulti(self.n_features))
    target_kinds = ('Binary', 'Multi')

    for m, ctype in itertools.product(candidates, target_kinds):
        # Pick the matching y and label mapper, skipping disabled pairings.
        if ctype == 'Binary':
            if not m.binary_enabled:
                continue
            self.set_y_binary()
            to_label = self.series_map_ac_binary_to_label
        else:
            if not m.multi_enabled:
                continue
            self.set_y_multi()
            to_label = self.series_map_ac_multi_to_label

        with timer('\nTraining and scoring {} - {} target'.format(
                m.__class__.__name__, ctype)):
            m.base['model'] = m.get_model()
            m.score(self.X, self.y, ctype)

            # Normalise both label collections to integer Series ...
            m.y_test[ctype] = pd.Series(m.y_test[ctype])
            m.y_pred[ctype] = pd.Series(m.y_pred[ctype])
            m.y_test[ctype] = m.y_test[ctype].astype(int)
            m.y_pred[ctype] = m.y_pred[ctype].astype(int)

            # ... then translate the integers to their string labels.
            m.y_test[ctype] = to_label(m.y_test[ctype])
            m.y_pred[ctype] = to_label(m.y_pred[ctype])

            title = '{} - {} - {} '.format('CM', m.__class__.__name__, ctype)
            self.visualize.confusion_matrix(m.y_test[ctype],
                                            m.y_pred[ctype], title)
            self.scores(m.y_test[ctype], m.y_pred[ctype])

    # TODO: append the scores to a scores array so np.mean(scores) gives the
    # average over all k-folds; if possible also record the epoch and fold
    # number to get a per-epoch score.

    print('Finished')
def __init__(self):
    """Run the linearity exploration end to end.

    Loads the dataset, label-encodes the categorical columns,
    standard-scales ``scale_cols``, then runs three plotting stages, each on
    a freshly drawn per-category sample with ``target`` as y.
    """
    # Logging state; log_file() toggles stdout redirection on each call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.visualize = Visualize()
    self.random_state = 20
    self.X = None
    self.y = None
    self.sample = None
    self.full = None
    self.ac_count = {}
    # Columns passed to StandardScaler below.
    self.scale_cols = [
        'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
        'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'is_guest_login', 'count',
        'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate'
    ]
    # Weight per attack category handed to sample_dataset().
    self.full_weights = {
        'normal': 1, 'dos': 1, 'probe': 1, 'u2r': 1, 'r2l': 1
    }
    self.minimal_weights = {
        'normal': 0.01, 'dos': 0.01, 'probe': 0.2, 'u2r': 0.5, 'r2l': 0.5
    }

    with timer('\nLoading dataset'):
        self.load_data()
        self.set_attack_category_count()
        self.ds.shape()

    with timer('\nEncode and Scale dataset'):
        # Encode categoricals; the single encoder is refitted per column.
        encoder = preprocessing.LabelEncoder()
        for column in ('protocol_type', 'service', 'flag'):
            self.full[column] = encoder.fit_transform(self.full[column])

        # Scale
        scaler = StandardScaler()
        scaled = scaler.fit_transform(self.full[self.scale_cols])
        self.full[self.scale_cols] = scaled

    # Each stage: (timer message, sampling weights, plotting routine).
    stages = (
        ('\nPlotting scatter graphs', self.full_weights, self.scatter),
        ('\nPlotting scatter graphs with convex hull', self.full_weights,
         self.convex_hull),
        ('\nPlotting linear separability with classifiers',
         self.minimal_weights, self.classifiers),
    )
    for message, weights, plot in stages:
        with timer(message):
            self.sample_dataset(weights)
            print(self.sample.shape)
            self.set_X_y('target')
            plot()

    self.log_file()
    print('Finished')
def __init__(self):
    """Train, cross-validate, test and visualise the binary MLP model.

    Runs stratified k-fold training on the train split, collecting per-epoch
    metrics for every fold, then retrains on the full train split with the
    test split as validation data, and finally saves/shows the confusion
    matrix and the loss, accuracy, detection-rate and false-alarm-rate
    curves.
    """
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
    tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity

    self.g = tf.Graph()
    self.tf_sess = tf.Session(
        config=tf.ConfigProto(log_device_placement=True), graph=self.g)

    # Logging state; log_file() toggles stdout redirection on each call.
    self.logfile = None
    self.gettrace = getattr(sys, 'gettrace', None)
    self.original_stdout = sys.stdout
    self.timestr = time.strftime("%Y%m%d-%H%M%S")
    self.log_file()
    print(__doc__)

    self.random_state = 20
    self.filehandler = Filehandler()
    self.ds = KDDCup1999()
    self.visualize = Visualize()
    self.folder = 'viz'

    # Datasets
    self.X = None
    self.y = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.n_features = None
    self.label_map_int_2_string = {
        0: 'good',
        1: 'bad',
        '0': 'good',
        '1': 'bad'
    }
    # Every attack category collapses to class 1 for binary classification.
    self.label_map_string_2_int = {
        'normal': 0,
        'dos': 1,
        'u2r': 1,
        'r2l': 1,
        'probe': 1
    }

    # K-fold validation
    self.splits = 5
    self.kfold = StratifiedKFold(n_splits=self.splits,
                                 shuffle=True,
                                 random_state=self.random_state)

    # Network parameters
    self.epochs = 20
    self.batch_size = 100
    self.verbose = 0

    # Scores: one list of per-epoch values is appended per fold.
    self.metric_loss = []
    self.metric_acc = []
    self.metric_dr = []
    self.metric_far = []
    self.metric_val_loss = []
    self.metric_val_acc = []
    self.metric_val_dr = []
    self.metric_val_far = []

    def plot_metric(epochs, key, label, train_runs, val_runs, legend_loc,
                    clear=True):
        # Plot fold-averaged training/validation curves plus the final-fit
        # curve for one metric, then save and show the figure.
        if clear:
            plt.clf()
        fig, ax = plt.subplots(figsize=(15, 8))
        plt.style.use('ggplot')
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.plot(epochs, np.mean(train_runs, axis=0), 'g', label='Training')
        ax.plot(epochs, np.mean(val_runs, axis=0), 'b', label='Validation')
        # NOTE(review): the curve labelled 'Test' uses the training-side
        # history key of the final fit, not its 'val_' counterpart measured
        # on the test split. Left unchanged -- confirm the intent.
        ax.plot(epochs, self.history.history[key], 'r', label='Test')
        self.title = '{} - {}'.format(self.__class__.__name__, label)
        plt.title(self.title, fontsize=18)
        plt.xlabel('Epochs', fontsize=14)
        plt.ylabel(label, fontsize=14)
        plt.legend(loc=legend_loc, prop={'size': 14})
        plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
        plt.show()

    with timer('\nPreparing dataset'):
        self.load_data()
        self.set_y()
        self.remove_target_from_X()
        self.n_features = self.X.shape[1]
        self.train_test_split()

    with timer('\nTraining & validating model with kfold'):
        # NOTE(review): as_default() is normally entered via `with`; called
        # bare like this it presumably has no effect -- confirm.
        self.g.as_default()  # Reset graph for tensorboard display
        K.clear_session()

        # Train model on K-1 and validate using remaining fold
        # (no TensorBoard callback is attached during cross-validation).
        for train, val in self.kfold.split(self.X_train, self.y_train):
            self.model = self.get_model()
            self.history = self.model.fit(
                self.X_train.iloc[train],
                self.y_train.iloc[train],
                validation_data=(self.X_train.iloc[val],
                                 self.y_train.iloc[val]),
                epochs=self.epochs,
                batch_size=self.batch_size,
                verbose=self.verbose)

            fold_history = self.history.history
            for key in ('loss', 'acc', 'dr', 'far',
                        'val_loss', 'val_acc', 'val_dr', 'val_far'):
                getattr(self, 'metric_' + key).append(fold_history[key])

        print('\nTraining mean loss', np.mean(self.metric_loss))
        print('Training mean acc', np.mean(self.metric_acc))
        print('Training mean dr', np.mean(self.metric_dr))
        print('Training mean far', np.mean(self.metric_far))
        print('\nValidation mean loss', np.mean(self.metric_val_loss))
        print('Validation mean acc', np.mean(self.metric_val_acc))
        print('Validation mean dr', np.mean(self.metric_val_dr))
        print('Validation mean far', np.mean(self.metric_val_far))

    with timer('\nTesting model on unseen test set'):
        self.g.as_default()  # Reset graph for tensorboard display
        K.clear_session()

        self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
        self.model = self.get_model()

        # Train model on complete train set and validate with unseen test set
        self.history = self.model.fit(self.X_train,
                                      self.y_train,
                                      validation_data=(self.X_test,
                                                       self.y_test),
                                      epochs=self.epochs,
                                      batch_size=self.batch_size,
                                      verbose=self.verbose,
                                      callbacks=[self.tensorboard])

    with timer('\nVisualising results'):
        # Plot model
        plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

        # Get single class prediction (rather than multi class probability summing to 1)
        y_pred = self.model.predict_classes(self.X_test)

        # NOTE(review): these 'Test' figures read the training-side history
        # keys of the final fit -- confirm whether 'val_*' was intended.
        print('Test loss', np.mean(self.history.history['loss']))
        print('Test acc', np.mean(self.history.history['acc']))
        print('Test dr', np.mean(self.history.history['dr']))
        print('Test far', np.mean(self.history.history['far']))

        # Remap to string class targets
        self.y_pred = self.map_target_to_label(y_pred)
        self.y_pred = self.y_pred.ravel()
        self.y_test = self.map_target_to_label(self.y_test)

        self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                        self.__class__.__name__)

        epochs = range(1, len(self.history.history['loss']) + 1)

        # The first plot is drawn without clearing the current figure first.
        plot_metric(epochs, 'loss', 'Loss',
                    self.metric_loss, self.metric_val_loss,
                    legend_loc=1, clear=False)
        plot_metric(epochs, 'acc', 'Accuracy',
                    self.metric_acc, self.metric_val_acc, legend_loc=4)
        plot_metric(epochs, 'dr', 'Detection Rate',
                    self.metric_dr, self.metric_val_dr, legend_loc=4)
        plot_metric(epochs, 'far', 'False Alarm Rate',
                    self.metric_far, self.metric_val_far, legend_loc=1)

    self.log_file()
    print('Finished')
class Preprocessing:
    """Run the dataset preprocessing pipeline when instantiated.

    Constructing an instance executes every stage in order: load the raw
    CSV, transform it, explore it, set the target, evaluate sparse
    features, plot, drop columns, and finally persist the processed
    dataset and target back to CSV. Unless a trace function (debugger) is
    active, everything printed during the run is redirected to a
    timestamped file under ``logs/``.
    """

    def __init__(self):
        """Execute the full preprocessing pipeline.

        stdout redirection is started before the first stage and is always
        undone afterwards, even if a stage raises, so a failure never
        leaves ``sys.stdout`` pointing at a log file.
        """
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

        self.log_file()
        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.visualize = Visualize()
            self.ds = KDDCup1999()

            with timer('\nLoading dataset'):
                self.ds.dataset = self.filehandler.read_csv(
                    self.ds.config['path'], self.ds.config['file'])
                self.ds.set_columns()

            with timer('\nTransforming dataset'):
                self.ds.transform()

            with timer('\nInitial dataset discovery'):
                self.ds.shape()
                self.ds.show_duplicates(self.ds.config['level_01'])
                self.ds.drop_duplicates()
                self.show_zeros()
                self.ds.drop_outliers()
                self.ds.shape()
                self.ds.discovery()

            with timer('\nSetting target'):
                self.ds.set_target()

            with timer('\nEvaluating sparse features'):
                self.ds.evaluate_sparse_features(engineer=False)

            with timer('\nVisualising pairplot for selected columns'):
                self.visualize.pairplot(self.ds.dataset,
                                        self.ds.config['pairplot_cols'],
                                        self.ds.config['pairplot_target'])

            with timer('\nDropping columns'):
                self.ds.drop_cols(self.ds.config['drop_cols_01'])

            with timer('\nEvaluating correlation'):
                self.visualize.correlation_heatmap(
                    self.ds.dataset,
                    title='Correlation Heatmap Before Column Drop')
                self.ds.drop_highly_correlated()
                self.visualize.correlation_heatmap(
                    self.ds.dataset,
                    title='Correlation Heatmap After Column Drop')

            with timer('\nPersisting transformed dataset and target'):
                self.filehandler.write_csv(
                    self.ds.config['path'],
                    self.ds.config['file'] + '_processed',
                    self.ds.dataset)
                self.filehandler.write_csv(
                    self.ds.config['path'],
                    self.ds.config['file'] + '_target',
                    self.ds.target)
                self.ds.shape()
        finally:
            # Second toggle: restore the original stdout and close the log
            # file (a no-op when redirection was never started).
            self.log_file()

        print('Finished')

    def log_file(self):
        """Toggle redirection of ``sys.stdout`` to a log file.

        Does nothing when ``sys.gettrace`` is unavailable or reports an
        active trace function (i.e. when running under a debugger).
        Otherwise the first call opens
        ``logs/<ClassName>_<timestamp>_stdout.txt`` and points
        ``sys.stdout`` at it; the next call restores the original stdout
        and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return

        # Redirect stdout to file for logging if not in debug mode.
        # open(..., 'w') does not create parent directories, so make sure
        # the log directory exists before opening the file.
        import os
        os.makedirs('logs', exist_ok=True)
        self.logfile = open(
            'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                           self.timestr), 'w')
        sys.stdout = self.logfile

    def show_zeros(self):
        """Visualise how sparse (zero-valued) the features are.

        Takes every column of ``self.ds.dataset`` except the last three,
        replaces zeros with NaN and passes the result to the missing-data
        plots. The replacement is done on a new frame so the dataset used
        by the rest of the pipeline is never modified.
        """
        features = self.ds.dataset.iloc[:, :-3]
        # Transform 0's to NaN for visualisation of sparseness with
        # missingno; mask() returns a new frame rather than writing into a
        # slice of the original dataset.
        df = features.mask(features == 0)
        self.visualize.matrix_missing(
            df, 'Nullity matrix of features with 0 values')
        self.visualize.bar_missing(df, 'Bar plot of features with 0 values')
        self.visualize.heat_missing(df,
                                    'Heatmap of features with missing values')