Example No. 1
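These listings are class-level excerpts and omit their module-level imports. A plausible set for this first example is sketched below, assuming TensorFlow 1.x and scikit-learn; the project-specific helpers (Filehandler, KDDCup1999, Visualize, timer and the model wrapper classes) are assumed to come from the surrounding project and their import paths are not shown.

# Assumed module-level imports for this excerpt (not part of the original listing)
import itertools
import os
import sys
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# Filehandler, KDDCup1999, Visualize, timer, RandomForestClf, AnnSLPBinary,
# AnnMLPBinary and AnnMLPMulti are project modules and are assumed importable.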
class Modelling:
    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity

        # self.logfile = None
        # self.gettrace = getattr(sys, 'gettrace', None)
        # self.original_stdout = sys.stdout
        # self.timestr = time.strftime("%Y%m%d-%H%M%S")
        # self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.random_state = 20
        self.label_multi = {
            0: 'normal',
            '0': 'normal',
            1: 'dos',
            '1': 'dos',
            2: 'u2r',
            '2': 'u2r',
            3: 'r2l',
            '3': 'r2l',
            4: 'probe',
            '4': 'probe'
        }
        self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSetting X and y'):
            self.set_X()
            self.n_features = self.X.shape[1]

        models = (RandomForestClf(), AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features), AnnMLPMulti(self.n_features))
        classification_type = ('Binary', 'Multi')

        for m, ctype in itertools.product(models, classification_type):
            score = False
            if ctype == 'Binary' and m.binary_enabled:
                self.set_y_binary()
                score = True
            elif ctype == 'Multi' and m.multi_enabled:
                self.set_y_multi()
                score = True

            if not score:
                continue

            with timer('\nTraining and scoring {} - {} target'.format(
                    m.__class__.__name__, ctype)):
                m.base['model'] = m.get_model()
                #self.train_test_split()
                m.score(self.X, self.y, ctype)

            m.y_test[ctype] = pd.Series(m.y_test[ctype])
            m.y_pred[ctype] = pd.Series(m.y_pred[ctype])
            m.y_test[ctype] = m.y_test[ctype].astype(int)
            m.y_pred[ctype] = m.y_pred[ctype].astype(int)

            if ctype == 'Binary':
                m.y_test[ctype] = self.series_map_ac_binary_to_label(
                    m.y_test[ctype])
                m.y_pred[ctype] = self.series_map_ac_binary_to_label(
                    m.y_pred[ctype])
            else:
                m.y_test[ctype] = self.series_map_ac_multi_to_label(
                    m.y_test[ctype])
                m.y_pred[ctype] = self.series_map_ac_multi_to_label(
                    m.y_pred[ctype])

            title = '{} - {} - {} '.format('CM', m.__class__.__name__, ctype)
            self.visualize.confusion_matrix(m.y_test[ctype], m.y_pred[ctype],
                                            title)
            self.scores(m.y_test[ctype], m.y_pred[ctype])

        # TODO: Append the fold scores to a scores array so that np.mean(scores)
        # gives the k-fold average; if possible also record the epoch and fold
        # number to get a per-epoch score.

        # self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.full = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')

    def set_X(self):
        self.X = self.full.loc[:, self.full.columns != 'attack_category']

    def set_y_binary(self):
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_binary()
        self.y = self.y.values.ravel()

    def set_y_multi(self):
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_multi()
        self.y = self.y.values.ravel()

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def df_map_ac_label_to_binary(self):
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos') |
                      (self.y['attack_category'] == 'u2r') |
                      (self.y['attack_category'] == 'r2l') |
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(conditions, [0, 1])

    def df_map_ac_label_to_multi(self):
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos'),
                      (self.y['attack_category'] == 'u2r'),
                      (self.y['attack_category'] == 'r2l'),
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(
            conditions,
            ['0', '1', '2', '3', '4'])  # string for get_dummies encoding

    def series_map_ac_multi_to_label(self, s):
        return s.map(self.label_multi)

    def series_map_ac_binary_to_label(self, s):
        return s.map(self.label_binary)

    def scores(self, y_test, y_pred):
        print('Accuracy {}'.format(accuracy_score(y_test, y_pred)))
        print('Classification report\n{}'.format(classification_report(y_test, y_pred, digits=10)))
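As an illustration only (toy data, not from the original project), this is the np.select and Series.map behaviour that df_map_ac_label_to_binary, df_map_ac_label_to_multi and the series_map_* helpers rely on:

import numpy as np
import pandas as pd

# Hypothetical mini-frame standing in for self.y
y = pd.DataFrame({'attack_category': ['normal', 'dos', 'u2r', 'r2l', 'probe']})

# Binary mapping: 'normal' -> 0, any attack -> 1 (mirrors df_map_ac_label_to_binary)
conditions = [y['attack_category'] == 'normal',
              y['attack_category'].isin(['dos', 'u2r', 'r2l', 'probe'])]
print(np.select(conditions, [0, 1]))                   # [0 1 1 1 1]

# Reverse mapping back to display labels (mirrors series_map_ac_multi_to_label)
label_multi = {0: 'normal', 1: 'dos', 2: 'u2r', 3: 'r2l', 4: 'probe'}
print(pd.Series([0, 1, 4]).map(label_multi).tolist())  # ['normal', 'dos', 'probe']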
Example No. 2
class AnnMLPBinary:
    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        self.g = tf.Graph()
        self.tf_sess = tf.Session(
            config=tf.ConfigProto(log_device_placement=True), graph=self.g)

        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.folder = 'viz'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.label_map_int_2_string = {
            0: 'good',
            1: 'bad',
            '0': 'good',
            '1': 'bad'
        }
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 1,
            'r2l': 1,
            'probe': 1
        }

        # K-fold validation
        self.splits = 5
        self.kfold = StratifiedKFold(n_splits=self.splits,
                                     shuffle=True,
                                     random_state=self.random_state)

        # Network parameters
        self.epochs = 20
        self.batch_size = 100
        self.verbose = 0

        # Scores
        self.metric_loss = []
        self.metric_acc = []
        self.metric_dr = []
        self.metric_far = []

        self.metric_val_loss = []
        self.metric_val_acc = []
        self.metric_val_dr = []
        self.metric_val_far = []

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features = self.X.shape[1]
            self.train_test_split()

        with timer('\nTraining & validating model with kfold'):
            self.g.as_default()  # Intended to reset the graph for TensorBoard; note as_default() returns a context manager and only takes effect inside a 'with' block
            K.clear_session()

            # Train model on K-1 folds and validate on the remaining fold
            for train, val in self.kfold.split(self.X_train, self.y_train):
                #self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_cv')
                self.model = self.get_model()

                self.history = self.model.fit(
                    self.X_train.iloc[train],
                    self.y_train.iloc[train],
                    validation_data=(self.X_train.iloc[val],
                                     self.y_train.iloc[val]),
                    epochs=self.epochs,
                    batch_size=self.batch_size,
                    verbose=self.verbose)
                #callbacks=[self.tensorboard])

                self.metric_loss.append(self.history.history['loss'])
                self.metric_acc.append(self.history.history['acc'])
                self.metric_dr.append(self.history.history['dr'])
                self.metric_far.append(self.history.history['far'])
                self.metric_val_loss.append(self.history.history['val_loss'])
                self.metric_val_acc.append(self.history.history['val_acc'])
                self.metric_val_dr.append(self.history.history['val_dr'])
                self.metric_val_far.append(self.history.history['val_far'])

            print('\nTraining mean loss', np.mean(self.metric_loss))
            print('Training mean acc', np.mean(self.metric_acc))
            print('Training mean dr', np.mean(self.metric_dr))
            print('Training mean far', np.mean(self.metric_far))
            print('\nValidation mean loss', np.mean(self.metric_val_loss))
            print('Validation mean acc', np.mean(self.metric_val_acc))
            print('Validation mean dr', np.mean(self.metric_val_dr))
            print('Validation mean far', np.mean(self.metric_val_far))

        with timer('\nTesting model on unseen test set'):
            self.g.as_default()  # Intended to reset the graph for TensorBoard; note as_default() returns a context manager and only takes effect inside a 'with' block
            K.clear_session()

            self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
            self.model = self.get_model()

            # Train model on complete train set and validate with unseen test set
            self.history = self.model.fit(self.X_train,
                                          self.y_train,
                                          validation_data=(self.X_test,
                                                           self.y_test),
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose,
                                          callbacks=[self.tensorboard])

        with timer('\nVisualising results'):
            # Plot model
            plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

            # Get single class prediction (rather than multi class probability summing to 1)
            y_pred = self.model.predict_classes(self.X_test)

            print('Test loss', np.mean(self.history.history['loss']))
            print('Test acc', np.mean(self.history.history['acc']))
            print('Test dr', np.mean(self.history.history['dr']))
            print('Test far', np.mean(self.history.history['far']))

            # Remap to string class targets
            self.y_pred = self.map_target_to_label(y_pred)
            self.y_pred = self.y_pred.ravel()
            self.y_test = self.map_target_to_label(self.y_test)

            self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                            self.__class__.__name__)

            epochs = range(1, len(self.history.history['loss']) + 1)

            # Plot loss
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_loss, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_loss, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['loss'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Loss')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Loss', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot accuracy
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_acc, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_acc, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['acc'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Accuracy')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Accuracy', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot detection rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_dr, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_dr, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['dr'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'Detection Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Detection Rate', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot false alarm rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_far, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_far, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['far'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'False Alarm Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('False Alarm Rate', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

        self.log_file()
        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        tp = K.sum(y_pos * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        return tp / (tp + fn + K.epsilon())

    @staticmethod
    def far(y_true, y_pred):
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        return fp / (tn + fp + K.epsilon())

    def get_model(self):
        model = models.Sequential()
        model.add(
            layers.Dense(25,
                         activation='relu',
                         input_shape=(self.n_features, )))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(25, activation='relu'))
        model.add(layers.Dropout(0.08))
        model.add(layers.Dense(1, activation='sigmoid'))
        model.compile(optimizer=optimizers.RMSprop(lr=0.0023),
                      loss='binary_crossentropy',
                      metrics=['accuracy', self.dr, self.far])
        return model

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_2')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def map_target_to_label(self, t):
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        return '{}/{}.png'.format(self.folder, title)
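A plain-NumPy sanity check of what the custom dr and far Keras metrics above compute; this is an illustrative sketch only, with dr as the detection rate TP / (TP + FN) and far as the false alarm rate FP / (TN + FP), evaluated on rounded predictions:

import numpy as np

def dr_np(y_true, y_pred):
    y_pred_pos = np.round(np.clip(y_pred, 0, 1))
    y_pos = np.round(np.clip(y_true, 0, 1))
    tp = np.sum(y_pos * y_pred_pos)
    fn = np.sum(y_pos * (1 - y_pred_pos))
    return tp / (tp + fn + 1e-7)

def far_np(y_true, y_pred):
    y_pred_pos = np.round(np.clip(y_pred, 0, 1))
    y_neg = 1 - np.round(np.clip(y_true, 0, 1))
    tn = np.sum(y_neg * (1 - y_pred_pos))
    fp = np.sum(y_neg * y_pred_pos)
    return fp / (tn + fp + 1e-7)

y_true = np.array([1, 1, 0, 0, 1, 0])
y_prob = np.array([0.9, 0.2, 0.1, 0.7, 0.8, 0.3])
print(dr_np(y_true, y_prob))   # 2 of 3 positives detected -> ~0.667
print(far_np(y_true, y_prob))  # 1 false alarm in 3 negatives -> ~0.333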
Example No. 3
class XGBoostBinary:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.n_classes = 2

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.label_map_int_2_string = {0: 'good', 1: 'bad', '0': 'good', '1': 'bad'}
        self.label_map_string_2_int = {'normal': 0, 'dos': 1, 'u2r': 1, 'r2l': 1, 'probe': 1}
        self.max_iters = 100

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.train_test_split()

        with timer('\nTesting model on unseen test set'):
            clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
            clf.fit(self.X_train, self.y_train)
            self.y_pred = clf.predict(self.X_test)
            cm = confusion_matrix(self.y_test, self.y_pred)
            self.tp = cm[1, 1]
            self.tn = cm[0, 0]
            self.fp = cm[0, 1]
            self.fn = cm[1, 0]

            print('True positive (TP)', self.tp)
            print('True negative (TN)', self.tn)
            print('False positive (FP)', self.fp)
            print('False negative (FN)', self.fn)

            self.dr = self.tp / (self.tp + self.fn)  # detection rate (recall) = TP / (TP + FN)
            self.far = self.fp / (self.tn + self.fp)
            self.acc = (self.tp + self.tn) / (self.tp + self.tn + self.fp + self.fn)
            print('Detection rate: ', self.dr)
            print('False alarm rate: ', self.far)
            print('Accuracy: ', self.acc)

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.X = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.30,
                                                                                random_state=self.random_state)

    def map_target_to_label(self, t):
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        # Note: self.folder is never set in this class's __init__; assign it (e.g. 'viz') before calling this helper
        return '{}/{}.png'.format(self.folder, title)
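A quick stand-alone check (illustration only, toy labels) of the confusion-matrix indexing used above: with sklearn's convention, rows are true labels and columns are predicted labels, so cm[1, 1] is TP and cm[0, 0] is TN:

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([1, 1, 1, 0, 0, 0])
y_pred = np.array([1, 1, 0, 0, 0, 1])
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()    # binary ravel order: TN, FP, FN, TP
print(tp, tn, fp, fn)          # 2 2 1 1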
Example No. 4
class Preprocessor:
    def __init__(self, envparm):
        self.dataset_raw = None
        self.numerical_features_raw = None
        self.categorical_features_raw = None
        self.filehandler = None
        self.preprocess(envparm)

    @staticmethod
    def drop_features_min_unique(dataset, min_threshold):
        features_dropped_str = ''
        for col in dataset:
            if len(dataset[col].unique()) <= min_threshold:
                features_dropped_str += str(col) + ' '
                dataset.drop(col, inplace=True, axis=1)

        logging.info(
            'Features dropped with unique value count <= {} - {}'.format(
                min_threshold, features_dropped_str))
        return dataset

    @staticmethod
    def drop_features_max_unique(dataset, max_threshold):
        features_dropped_str = ''
        for col in dataset:
            if len(dataset[col].unique()) >= max_threshold:
                features_dropped_str += str(col) + ' '
                dataset.drop(col, inplace=True, axis=1)

        logging.info(
            'Features dropped with unique value count >= {} - {}'.format(
                max_threshold, features_dropped_str))
        return dataset

    @staticmethod
    def drop_features_max_null(dataset, max_threshold):
        features_dropped_str = ''
        for col in dataset:
            if sum(dataset[col].isnull()) >= max_threshold:
                features_dropped_str += str(col) + ' '
                dataset.drop(col, inplace=True, axis=1)

        logging.info(
            'Features dropped with null value count >= {} - {}'.format(
                max_threshold, features_dropped_str))
        return dataset

    @staticmethod
    def plot_num_obs_missing_values(dataset):
        df = pd.DataFrame(data=dataset.isnull().sum(), columns=['Count'])
        df['bin'] = pd.cut(df['Count'], [
            -1, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000,
            4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000
        ],
                           labels=[
                               '0-10', '10-20', '20-30', '30-40', '40-50',
                               '50-100', '100-200', '200-300', '300-400',
                               '400-500', '500-1K', '1K-2K', '2K-3K', '3K-4K',
                               '4K-5K', '5K-6K', '6K-7K', '7K-8K', '8K-9K',
                               '9K-10K', '10K-50K'
                           ])
        countplot = sns.countplot(y="bin", data=df)
        countplot.set(ylabel="Observations With Null Values",
                      xlabel="Feature Count",
                      title="Observations With Null Values Per Feature")
        plt.show()

    @staticmethod
    def impute_numeric_feature_with_zero(dataset):
        dataset.fillna(0, inplace=True)

    @staticmethod
    def impute_categorical_feature_with_blank(dataset):
        dataset.fillna('', inplace=True)

    def prepare_output_variant_01(self):
        logging.info('Preparing output variant 01')
        numerical_features = self.numerical_features_raw.copy()

        logging.info('Validating numerical features')
        numerical_features = self.drop_features_min_unique(
            numerical_features, 1)

        logging.info('Validating categorical features')
        categorical_features = self.categorical_features_raw.copy()
        categorical_features = self.drop_features_min_unique(
            categorical_features, 1)

        logging.info('Imputing numerical features with mean')
        numerical_features.fillna(numerical_features.mean(), inplace=True)

        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)

        # Random Forest needs the categorical features encoded, otherwise a string-to-float conversion error is raised
        logging.info('Label encoding categorical features')
        labelencoder_categorical = LabelEncoder()
        labelencoder_categorical = categorical_features.apply(
            labelencoder_categorical.fit_transform)

        dataset_output = self.filehandler.output_prep_dataset(
            self.filehandler.dataset_prep_path_01, numerical_features,
            labelencoder_categorical)

        logging.info('Dataset size after feature transformation - {}'.format(
            dataset_output.shape))
        logging.info('Completed Preparing output variant 01')

    def prepare_output_variant_02(self):
        logging.info('Preparing output variant 02')
        numerical_features = self.numerical_features_raw.copy()

        logging.info('Validating numerical features')
        numerical_features = self.drop_features_min_unique(
            numerical_features, 1)

        logging.info('Validating categorical features')
        categorical_features = self.categorical_features_raw.copy()
        categorical_features = self.drop_features_min_unique(
            categorical_features, 1)

        logging.info('Imputing numerical features with zero')
        numerical_features.fillna(0, inplace=True)

        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)

        # Random Forest needs the categorical features encoded, otherwise a string-to-float conversion error is raised
        logging.info('Label encoding categorical features')
        labelencoder_categorical = LabelEncoder()
        labelencoder_categorical = categorical_features.apply(
            labelencoder_categorical.fit_transform)

        dataset_output = self.filehandler.output_prep_dataset(
            self.filehandler.dataset_prep_path_02, numerical_features,
            labelencoder_categorical)

        logging.info('Dataset size after feature transformation - {}'.format(
            dataset_output.shape))
        logging.info('Completed Preparing output variant 02')

    def prepare_output_variant_03(self):
        logging.info('Preparing output variant 03')
        numerical_features = self.numerical_features_raw.copy()

        logging.info('Validating numerical features')
        numerical_features = self.drop_features_min_unique(
            numerical_features, 1)

        logging.info('Validating categorical features')
        categorical_features = self.categorical_features_raw.copy()
        categorical_features = self.drop_features_min_unique(
            categorical_features, 1)
        categorical_features = self.drop_features_max_unique(
            categorical_features, 11)

        logging.info('Imputing numerical features with -9876')
        numerical_features.fillna(-9876, inplace=True)

        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)

        # Random Forest needs the categorical features encoded, otherwise a string-to-float conversion error is raised
        logging.info('Label encoding categorical features')
        labelencoder_categorical = LabelEncoder()
        labelencoder_categorical = categorical_features.apply(
            labelencoder_categorical.fit_transform)

        # One-hot encode the remaining low-cardinality categoricals (encoding the full categorical set caused memory errors due to dataset width)
        onehotencoder = OneHotEncoder()
        labelencoder_categorical = onehotencoder.fit_transform(
            labelencoder_categorical).toarray()
        labelencoder_categorical_df = pd.DataFrame(labelencoder_categorical)

        dataset_output = self.filehandler.output_prep_dataset(
            self.filehandler.dataset_prep_path_03, numerical_features,
            labelencoder_categorical_df)

        logging.info('Dataset size after feature transformation - {}'.format(
            dataset_output.shape))
        logging.info('Completed Preparing output variant 03')

    def preprocess(self, envparm):
        self.filehandler = Filehandler()
        self.dataset_raw = self.filehandler.read_csv(
            self.filehandler.data_raw_path)
        logging.info('Original raw dataset loaded - dataset size {}'.format(
            self.dataset_raw.shape))

        logging.info('Partitioning numerical features')
        self.numerical_features_raw = self.dataset_raw.iloc[:, 0:190].copy()

        logging.info('Partitioning categorical features')
        self.categorical_features_raw = self.dataset_raw.iloc[:, 190:].copy()

        if envparm['PlotGraphs']:
            num_size = 28
            sample_df = self.dataset_raw.iloc[:, :num_size].copy()
            visualizer.matrix_missing(
                sample_df,
                'Data Completion First ' + str(num_size) + ' Numeric Features')
            visualizer.bar_missing(
                sample_df,
                'Nullity Count First ' + str(num_size) + ' Numeric Features')
            visualizer.heat_missing(
                sample_df, 'Nullity Correlation Of First ' + str(num_size) +
                ' Numeric Features')

            sample_df = self.dataset_raw.iloc[:, 190:].copy()
            visualizer.matrix_missing(sample_df,
                                      'Data Completion Categorical Features')
            visualizer.bar_missing(sample_df,
                                   'Nullity Count Categorical Features')
            visualizer.heat_missing(
                sample_df, 'Nullity Correlation Of Categorical Features')

        if envparm['ProcessDS01']:
            self.prepare_output_variant_01()

        if envparm['ProcessDS02']:
            self.prepare_output_variant_02()

        if envparm['ProcessDS03']:
            self.prepare_output_variant_03()
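A toy check of the static feature-dropping helpers, assuming the Preprocessor class above is importable; the DataFrame and thresholds here are made up for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'constant':    [1, 1, 1, 1, 1],                  # 1 unique value
    'id':          [1, 2, 3, 4, 5],                  # unique value per row
    'mostly_null': [np.nan, np.nan, np.nan, np.nan, 5.0],
    'useful':      [0.1, 0.2, 0.1, 0.2, 0.3],        # 3 unique values
})

df = Preprocessor.drop_features_min_unique(df, 1)    # drops 'constant' (<= 1 unique value)
df = Preprocessor.drop_features_max_unique(df, 5)    # drops 'id' (>= 5 unique values)
df = Preprocessor.drop_features_max_null(df, 4)      # drops 'mostly_null' (>= 4 nulls)
print(df.columns.tolist())                           # ['useful']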
Example No. 5
class AnnMLPMultiOptimize:
    def __init__(self):
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.folder = 'tuning'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 2,
            'r2l': 3,
            'probe': 4
        }

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features_all = self.X.shape[1]
            self.n_features_50pct = int(self.n_features_all * 0.5)
            self.n_features_80pct = int(self.n_features_all * 0.8)
            self.y = pd.get_dummies(self.y)
            self.X = self.X.values
            self.y = self.y.values

        with timer('\nSearching parameter space'):
            # self.p = {'lr': (0.5, 5, 10),
            #      'first_neuron': [self.n_features_70pct, self.n_features_all],
            #      'hidden_layers': [0, 1, 2],
            #      'hidden_neuron': [self.n_features_70pct, self.n_features_all],
            #      'batch_size': [100, 200],
            #      'epochs': [30],
            #      'dropout': (0, 0.2, 0.5),
            #      'weight_regulizer': [None],
            #      'emb_output_dims': [None],
            #      'shape': ['brick', 'long_funnel'],
            #      'optimizer': [Adam, RMSprop],
            #      'losses': [binary_crossentropy],
            #      'activation': [relu],
            #      'last_activation': [sigmoid]}

            self.ptest = {
                'lr': [10],
                'first_neuron': [self.n_features_all],
                'hidden_layers': [1],
                'hidden_neuron': [self.n_features_all],
                'batch_size': [100],
                'epochs': [5],
                'dropout': [0.2],
                'optimizer': [SGD],
                'activation': [relu],
                'last_activation': [softmax]
            }

            self.p1 = {
                'lr': (0.5, 5, 10),
                'first_neuron': [
                    self.n_features_50pct, self.n_features_80pct,
                    self.n_features_all
                ],
                'hidden_layers': [1, 2, 3],
                'hidden_neuron': [
                    self.n_features_50pct, self.n_features_80pct,
                    self.n_features_all
                ],
                'batch_size': [100, 500, 1000],
                'epochs': [20],
                'dropout': (0, 0.2, 5),
                'optimizer': [SGD, RMSprop],
                'activation': [relu],
                'last_activation': [softmax]
            }

            dataset_name = self.folder + '/Hyperparameter tuning - ' + self.__class__.__name__
            scan = ta.Scan(x=self.X,
                           y=self.y,
                           model=self.get_model,
                           params=self.p1,
                           grid_downsample=0.01,
                           dataset_name=dataset_name,
                           experiment_no='1')

            with timer('\nEvaluating Scan'):
                r = ta.Reporting(scan)

                # get the number of rounds in the Scan
                print('\nNumber of rounds in scan ', r.rounds())

                # get highest results
                print('\nHighest validation accuracy', r.high('val_acc'))
                print('\nHighest validation detection rate', r.high('val_dr'))
                print('\nHighest validation false alarm rate',
                      r.high('val_far'))

                # get the highest result for any metric
                print(r.high('val_dr'))

                # get the round with the best result
                print('Best round', r.rounds2high())

                # get the best parameters
                print(r.best_params())

                #r.plot_corr()
                #plt.show()

                # a four dimensional bar grid
                #r.plot_bars('batch_size', 'val_dr', 'hidden_layers', 'lr')
                #plt.show()

        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        tp = K.sum(y_pos * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        return tp / (tp + fn + K.epsilon())

    @staticmethod
    def far(y_true, y_pred):
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        return fp / (tn + fp + K.epsilon())

    def get_model(self, x_train, y_train, x_val, y_val, params):

        model = models.Sequential()

        # Input layer with dropout
        model.add(
            layers.Dense(params['first_neuron'],
                         activation=params['activation'],
                         input_shape=(self.n_features_all, )))
        model.add(layers.Dropout(params['dropout']))

        # Hidden layers with dropout
        for i in range(params['hidden_layers']):
            model.add(
                layers.Dense(params['hidden_neuron'],
                             activation=params['activation']))
            model.add(layers.Dropout(params['dropout']))

        # Output layer
        model.add(layers.Dense(5, activation=params['last_activation']))

        # Build model
        model.compile(params['optimizer'](
            lr=lr_normalizer(params['lr'], params['optimizer'])),
                      loss='categorical_crossentropy',
                      metrics=['accuracy', self.dr, self.far])

        history = model.fit(x_train,
                            y_train,
                            validation_data=(x_val, y_val),
                            batch_size=params['batch_size'],
                            epochs=params['epochs'],
                            verbose=0)

        return history, model

    def load_data(self):
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)
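For reference, a tiny illustration (toy values, not from the project) of the pd.get_dummies one-hot encoding applied to the 5-class target in the dataset-preparation block above:

import pandas as pd

y = pd.Series([0, 1, 4, 2, 3, 0], name='attack_category')
# One indicator column per class code (0..4); each row has a single 1 in its
# class column, matching the 5-unit softmax output layer built in get_model.
print(pd.get_dummies(y))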
Example No. 6
class Clustering:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.clusters_stop = 11
        self.x = None
        self.y = None
        self.full = None
        self.ac_count = {}
        self.feature_idx = {0: 0, 1: 0, 2: 0}
        self.pca_idx = {0: 0, 1: 1, 2: 2, 'pca': True}
        self.kernelpca_idx = {0: 0, 1: 1, 2: 2, 'kpca': True}
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                           'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']
        self.cluster_cols = [('count', 'diff_srv_rate', 'src_bytes'),
                             ('src_bytes', 'dst_host_srv_count', 'dst_bytes'),
                             ('srv_diff_host_rate', 'srv_count', 'serror_rate'),
                             ('serror_rate', 'dst_host_diff_srv_rate', 'flag')]

        with timer('\nLoading dataset'):
            self.load_data()
            self.ds.shape()
        with timer('\nEncode and Scale dataset'):
            self.encode_scale()
        with timer('\nSetting X and y'):
            self.set_x_y()
        with timer('\nPlotting clusters for specific columns'):
            for cola, colb, colc in self.cluster_cols:
                for c in range(2, self.clusters_stop):
                    self.set_indexes(cola, colb, colc)
                    with timer('\n2D clustering without PCA'):
                        self.cluster(idx=self.feature_idx, n_clusters=c)
                    with timer('\n3D clustering without PCA'):
                        self.cluster(idx=self.feature_idx, n_clusters=c, projection='3d')
        with timer('\nPlotting clusters applying PCA'):
            for c in range(2, self.clusters_stop):
                with timer('\n2D clustering with PCA'):
                    self.cluster(idx=self.pca_idx, n_clusters=c)
                with timer('\n3D clustering with PCA'):
                    self.cluster(idx=self.pca_idx, n_clusters=c, projection='3d')
        # Commented out due to memory error
        #with timer('\nPlotting clusters Kernel applying PCA'):
        #    for c in range(2, 7):
        #        with timer('\n2D clustering with Kernel PCA'):
        #            self.cluster(idx=self.kernelpca_idx, n_clusters=c)
        #        with timer('\n3D clustering with Kernel PCA'):
        #            self.cluster(idx=self.kernelpca_idx, n_clusters=c, projection='3d')

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def encode_scale(self):
        # Encode categoricals
        le = preprocessing.LabelEncoder()
        self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
        self.full['service'] = le.fit_transform(self.full['service'])
        self.full['flag'] = le.fit_transform(self.full['flag'])

        # Scale
        sc = StandardScaler()
        self.full[self.scale_cols] = sc.fit_transform(self.full[self.scale_cols])

    def set_x_y(self):
        self.x = self.full.iloc[:, :-2]
        self.y = self.full['target']

    def set_indexes(self, cola, colb, colc):
        self.feature_idx[0] = self.x.columns.get_loc(cola)
        self.feature_idx[1] = self.x.columns.get_loc(colb)
        self.feature_idx[2] = self.x.columns.get_loc(colc)

    def cluster(self, idx, n_clusters, projection=None):
        df_x = self.x
        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state)
        y_km = kmeans.fit_predict(df_x)  # fit_predict fits the model and returns cluster labels in one pass
        self.visualize.scatter_clusters(self.x, n_clusters, y_km, idx, projection)
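An illustrative stand-alone sweep over the same n_clusters range, on made-up blob data, showing how KMeans inertia can be used to eyeball a sensible cluster count before producing the scatter plots:

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(20)
# Three well-separated toy blobs in 2D
toy = np.vstack([rng.normal(loc=c, scale=0.3, size=(50, 2)) for c in (0, 3, 6)])

for c in range(2, 11):
    km = KMeans(n_clusters=c, random_state=20)
    labels = km.fit_predict(toy)       # fit and label in one pass
    print(c, round(km.inertia_, 1))    # inertia flattens once c passes the true cluster count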
Example No. 7
class AnnMLPOptimiseEvaluateMulti:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.n_classes = 5
        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.folder = 'viz'
        self.fprefix_multi = 'Hyper - annMLPMulti - '

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.hyp = None
        self.lr = None
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 2,
            'r2l': 3,
            'probe': 4
        }
        self.max_iters = 100

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.train_test_split()

        with timer('\nPreparing base logistic regression'):
            self.lr = LogisticRegression(max_iter=self.max_iters)
            self.lr.fit(self.X_train, self.y_train)

        with timer('\nPreparing confusion matrix and base DR'):
            self.y_pred = self.lr.predict(self.X_test)
            cm = confusion_matrix(self.y_test, self.y_pred)
            self.tp = self.get_tp_from_cm(cm)
            self.tn = self.get_tn_from_cm(cm)
            self.fp = self.get_fp_from_cm(cm)
            self.fn = self.get_fn_from_cm(cm)
            self.dr = self.tp / (self.tp + self.fn)  # detection rate (recall) = TP / (TP + FN)
            print('log reg dr', self.dr)

        with timer('\nVisualising optimisation search'):
            self.load_hyp()
            self.hyp['lr'] = round(self.hyp['lr'] / 1000, 3)

            # Hyperparameter correlation with val DR
            self.hyp_val_dr = self.hyp.copy()  # copy so the inplace drop below does not also mutate self.hyp
            self.hyp_val_dr.drop([
                'round_epochs', 'epochs', 'loss', 'dr', 'far', 'acc',
                'val_loss', 'val_acc', 'val_far'
            ],
                                 axis=1,
                                 inplace=True)
            self.dr_corr = self.hyp_val_dr.corr()
            plt.clf()
            fig, ax = plt.subplots(figsize=(10, 10))
            title = 'Validation DR Hyperparameter Correlation'
            ax.set_title(title, size=16)
            colormap = sns.diverging_palette(220, 10, as_cmap=True)
            sns.heatmap(self.dr_corr,
                        cmap=colormap,
                        annot=True,
                        fmt=".2f",
                        cbar=False,
                        vmin=-0.4,
                        vmax=0.4)
            plt.xticks(range(len(self.dr_corr.columns)), self.dr_corr.columns)
            plt.yticks(range(len(self.dr_corr.columns)), self.dr_corr.columns)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            self.hyp['val_dr_change'] = round(self.hyp.val_dr - self.dr, 3)
            pd.set_option('display.max_columns', 100)
            print(self.hyp.sort_values(by='val_dr', ascending=False).head())

            self.color = 'cornflowerblue'

            metric = 'lr'
            plt.clf()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax = sns.boxplot(x=metric,
                             y='val_dr_change',
                             data=self.hyp.reset_index(),
                             color=self.color)
            title = 'Validation DR Change Over Baseline As Fn Of Learning Rate'
            plt.title(title, fontsize=16)
            plt.xlabel('Learning Rate', fontsize=12)
            plt.ylabel('Validation DR Change', fontsize=12)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            metric = 'first_neuron'
            plt.clf()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax = sns.boxplot(x=metric,
                             y='val_dr_change',
                             data=self.hyp.reset_index(),
                             color=self.color)
            title = 'Validation DR Change Over Baseline As Fn Of # Neurons First Layer'
            plt.title(title, fontsize=16)
            plt.xlabel('First Neuron', fontsize=12)
            plt.ylabel('Validation DR Change', fontsize=12)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            metric = 'hidden_layers'
            plt.clf()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax = sns.boxplot(x=metric,
                             y='val_dr_change',
                             data=self.hyp.reset_index(),
                             color=self.color)
            title = 'Validation DR Change Over Baseline As Fn Of # Hidden Layers'
            plt.title(title, fontsize=16)
            plt.xlabel('Hidden Layers', fontsize=12)
            plt.ylabel('Validation DR Change', fontsize=12)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            metric = 'hidden_neuron'
            plt.clf()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax = sns.boxplot(x=metric,
                             y='val_dr_change',
                             data=self.hyp.reset_index(),
                             color=self.color)
            title = 'Validation DR Change Over Baseline As Fn Of # Hidden Layer Neurons'
            plt.title(title, fontsize=16)
            plt.xlabel('Hidden Neurons', fontsize=12)
            plt.ylabel('Validation DR Change', fontsize=12)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            metric = 'batch_size'
            plt.clf()
            fig, ax = plt.subplots(figsize=(10, 6))
            ax = sns.boxplot(x=metric,
                             y='val_dr_change',
                             data=self.hyp.reset_index(),
                             color=self.color)
            title = 'Validation DR Change Over Baseline As Fn Of Batch Size'
            plt.title(title, fontsize=16)
            plt.xlabel('Batch Size', fontsize=12)
            plt.ylabel('Validation DR Change', fontsize=12)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            metric = 'dropout'
            plt.clf()
            fig, ax = plt.subplots(figsize=(12, 8))
            ax = sns.boxplot(x=metric,
                             y='val_dr_change',
                             data=self.hyp.reset_index(),
                             color=self.color)
            title = 'Validation DR Change Over Baseline As Fn Of Dropout'
            plt.title(title, fontsize=16)
            plt.xlabel('Dropout', fontsize=12)
            plt.ylabel('Validation DR Change', fontsize=12)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            plt.clf()
            fig, ax = plt.subplots(figsize=(9, 7))
            df_grid = self.hyp.reset_index().groupby(
                ['first_neuron',
                 'hidden_neuron']).val_dr_change.mean().unstack()
            ax = sns.heatmap(data=df_grid,
                             cmap=(sns.diverging_palette(10, 220, sep=80,
                                                         n=7)),
                             annot=True,
                             cbar=False)
            title = 'Validation DR Change Over Baseline As Fn Of First Neuron & Hidden Neuron'
            plt.title(title, fontsize=12)
            plt.xlabel('Hidden Neuron', fontsize=10)
            plt.ylabel('First Neuron', fontsize=10)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            plt.clf()
            fig, ax = plt.subplots(figsize=(9, 7))
            df_grid = self.hyp.reset_index().groupby(
                ['hidden_layers',
                 'hidden_neuron']).val_dr_change.mean().unstack()
            ax = sns.heatmap(data=df_grid,
                             cmap=(sns.diverging_palette(10, 220, sep=80,
                                                         n=7)),
                             annot=True,
                             cbar=False)
            title = 'Validation DR Change Over Baseline As Fn Of Hidden Layers & Hidden Neuron'
            plt.title(title, fontsize=16)
            plt.xlabel('Hidden Neuron', fontsize=10)
            plt.ylabel('Hidden Layers', fontsize=10)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            plt.clf()
            fig, ax = plt.subplots(figsize=(9, 7))
            df_grid = self.hyp.reset_index().groupby(
                ['batch_size', 'dropout']).val_dr_change.mean().unstack()
            ax = sns.heatmap(data=df_grid,
                             cmap=(sns.diverging_palette(10, 220, sep=80,
                                                         n=7)),
                             annot=True,
                             cbar=False)
            title = 'Validation DR Change Over Baseline As Fn Of Batch Size & Dropout'
            plt.xlabel('Dropout', fontsize=10)
            plt.ylabel('Batch Size', fontsize=10)
            plt.title(title, fontsize=16)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

            plt.clf()
            fig, ax = plt.subplots(figsize=(9, 7))
            df_grid = self.hyp.reset_index().groupby(
                ['lr', 'dropout']).val_dr_change.mean().unstack()
            ax = sns.heatmap(data=df_grid,
                             cmap=(sns.diverging_palette(10, 220, sep=80,
                                                         n=7)),
                             annot=True,
                             cbar=False)
            title = 'Validation DR Change Over Baseline As Fn Of Learning Rate & Dropout'
            plt.xlabel('Dropout', fontsize=10)
            plt.ylabel('Learning Rate', fontsize=10)
            plt.title(title, fontsize=16)
            plt.savefig(fname=self.fname(title), dpi=300, format='png')
            plt.show()

        self.log_file()
        print('Finished')

    def get_base_dr(self):
        y_pred = pd.Series(0.5, index=self.y_train.index)
        cm = confusion_matrix(self.y_train, y_pred)
        tp = self.get_tp_from_cm(cm)
        fn = self.get_fn_from_cm(cm)
        dr = tp / (tp + fn)
        print('dr ', dr)
        return dr

    # True positives are the diagonal elements
    def get_tp_from_cm(self, cm):
        tp = np.diag(cm)
        print('tp', np.sum(np.diag(cm)))
        return np.sum(tp)

    def get_tn_from_cm(self, cm):
        tn = []
        for i in range(self.n_classes):
            temp = np.delete(cm, i, 0)  # delete ith row
            temp = np.delete(temp, i, 1)  # delete ith column
            tn.append(sum(sum(temp)))
        print('tn ', np.sum(tn))
        return np.sum(tn)

    # Sum of columns minus diagonal
    def get_fp_from_cm(self, cm):
        fp = []
        for i in range(self.n_classes):
            fp.append(sum(cm[:, i]) - cm[i, i])
        print('fp ', np.sum(fp))
        return np.sum(fp)

    # Sum of rows minus diagonal
    def get_fn_from_cm(self, cm):
        fn = []
        for i in range(self.n_classes):
            fn.append(sum(cm[i, :]) - cm[i, i])
        print('fn', np.sum(fn))
        return np.sum(fn)

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def load_hyp(self):
        self.hyp = pd.read_csv(
            'tuning/Hyperparameter tuning - AnnMLPMultiOptimize_1.csv')

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def fname(self, title):
        return '{}/{}.png'.format(self.folder, self.fprefix_multi + title)
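
The heatmap blocks at the top of this class all follow the same pivot-then-plot pattern. Below is a minimal, standalone sketch of that pattern with made-up tuning results; the column names mirror the hyperparameter CSV, but the values and figure size are illustrative only.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Synthetic tuning results standing in for the hyperparameter CSV
hyp = pd.DataFrame({
    'batch_size': [32, 32, 64, 64, 128, 128],
    'dropout': [0.2, 0.5, 0.2, 0.5, 0.2, 0.5],
    'val_dr_change': [0.01, -0.02, 0.03, 0.00, 0.02, -0.01],
})

# Average the metric per (batch_size, dropout) cell and unstack into a grid
grid = hyp.groupby(['batch_size', 'dropout']).val_dr_change.mean().unstack()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(grid, cmap=sns.diverging_palette(10, 220, sep=80, n=7),
            annot=True, cbar=False, ax=ax)
ax.set_xlabel('Dropout')
ax.set_ylabel('Batch Size')
ax.set_title('Validation DR Change Over Baseline (toy data)')
plt.show()
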
Exemplo n.º 8
class Linearity:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20

        self.X = None
        self.y = None
        self.sample = None
        self.full = None
        self.ac_count = {}
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
            'urgent', 'hot', 'num_failed_logins', 'logged_in',
            'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate'
        ]
        self.full_weights = {
            'normal': 1,
            'dos': 1,
            'probe': 1,
            'u2r': 1,
            'r2l': 1
        }
        self.minimal_weights = {
            'normal': 0.01,
            'dos': 0.01,
            'probe': 0.2,
            'u2r': 0.5,
            'r2l': 0.5
        }

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()
            self.ds.shape()
        with timer('\nEncode and Scale dataset'):
            # Encode categoricals
            le = preprocessing.LabelEncoder()
            self.full['protocol_type'] = le.fit_transform(
                self.full['protocol_type'])
            self.full['service'] = le.fit_transform(self.full['service'])
            self.full['flag'] = le.fit_transform(self.full['flag'])

            # Scale
            sc = StandardScaler()
            self.full[self.scale_cols] = sc.fit_transform(
                self.full[self.scale_cols])
        with timer('\nPlotting scatter graphs'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.scatter()
        with timer('\nPlotting scatter graphs with convex hull'):
            self.sample_dataset(self.full_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.convex_hull()
        with timer('\nPlotting linear separability with classifiers'):
            self.sample_dataset(self.minimal_weights)
            print(self.sample.shape)
            self.set_X_y('target')
            self.classifiers()

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def scatter(self):
        self.visualize.scatter(self.X,
                               cola='src_bytes',
                               colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='count',
                               colb='diff_srv_rate',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='duration',
                               colb='src_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='dst_host_srv_count',
                               colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='serror_rate',
                               colb='rerror_rate',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='dst_host_srv_count',
                               colb='dst_bytes',
                               hue='target')
        self.visualize.scatter(self.X,
                               cola='srv_diff_host_rate',
                               colb='srv_count',
                               hue='target')

    def convex_hull(self):
        buckets = self.y.unique()
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='src_bytes',
                                   colb='dst_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='count',
                                   colb='diff_srv_rate',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='duration',
                                   colb='src_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='dst_host_srv_count',
                                   colb='dst_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='serror_rate',
                                   colb='rerror_rate',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='dst_host_srv_count',
                                   colb='dst_bytes',
                                   target='target')
        self.visualize.convex_hull(self.X,
                                   buckets,
                                   cola='srv_diff_host_rate',
                                   colb='srv_count',
                                   target='target')

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def set_attack_category_count(self):
        ac = self.full['attack_category'].value_counts()
        for key, value in ac.items():
            self.ac_count[key] = value

    def set_X_y(self, target):
        print('Setting X, with y as {}'.format(target))
        self.X = self.sample
        self.y = self.sample[target]

    def sample_dataset(self, weights):
        print('Sampling dataset with weights {}'.format(weights))
        self.sample = pd.DataFrame()
        for key, value in self.ac_count.items():
            samples = int(value * weights[key])
            df = self.full[self.full.attack_category == key].sample(
                samples, random_state=self.random_state)
            self.sample = pd.concat([self.sample, df])  # DataFrame.append was removed in pandas 2.x

    def classifiers(self):
        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(self.y)
        _y = self.y

        models = (Perceptron(max_iter=100,
                             tol=1e-3,
                             random_state=self.random_state),
                  LinearSVC(max_iter=500,
                            random_state=self.random_state,
                            tol=1e-5),
                  SVC(kernel='rbf',
                      gamma=5,
                      C=10.0,
                      random_state=self.random_state))

        titles = ('Perceptron', 'LinearSVC (linear kernel)',
                  'SVC with RBF kernel')
        columns = [('srv_diff_host_rate', 'srv_count'),
                   ('dst_host_srv_count', 'count'),
                   ('dst_host_srv_count', 'dst_bytes')]
        for clf, title in zip(models, titles):
            for cola, colb in columns:
                _x = self.X.loc[:, [cola, colb]]
                clf.fit(_x, _y)
                _y_pred = clf.predict(_x)
                self.visualize.boundary(_x, _y, clf, title, cola, colb)
                self.visualize.confusion_matrix(
                    _y, _y_pred, title + ' - ' + cola + ' vs ' + colb)
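
The classifier loop above relies on the project-specific Visualize helpers for boundary plots. A self-contained sketch of the same linear-separability check on a toy two-feature problem (synthetic blobs, not the KDD features; the Perceptron parameters mirror those used above) might look like this:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import Perceptron

# Two well-separated blobs stand in for a feature pair such as
# ('srv_diff_host_rate', 'srv_count')
X_demo, y_demo = make_blobs(n_samples=300, centers=2, random_state=20)

clf = Perceptron(max_iter=100, tol=1e-3, random_state=20)
clf.fit(X_demo, y_demo)
print('Training accuracy:', clf.score(X_demo, y_demo))

# Evaluate the classifier on a grid to draw its decision regions
xx, yy = np.meshgrid(
    np.linspace(X_demo[:, 0].min() - 1, X_demo[:, 0].max() + 1, 200),
    np.linspace(X_demo[:, 1].min() - 1, X_demo[:, 1].max() + 1, 200))
zz = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.contourf(xx, yy, zz, alpha=0.2)
plt.scatter(X_demo[:, 0], X_demo[:, 1], c=y_demo, s=10)
plt.title('Perceptron decision regions on a linearly separable toy set')
plt.show()
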
Exemplo n.º 9
class FeatureSelection:
    def __init__(self):
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None
        self.random_state = 20
        self.num_features = 15
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                           'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']

        with timer('\nLoading dataset'):
            self.load_data()
            self.encode_scale()
            self.set_X()
        with timer('\nFeature selection'):
            for selector in (Original(),
                             UnivariateSelector(),
                             RecursiveSelector(),
                             PCASelector(),
                             #KernelPCASelector(),
                             ExtraTreesSelector(),
                             RandomForestSelector()):
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    with timer('\nFitting selector ' + selector.__class__.__name__):
                        selector.fit_model(self.X, self.y)
                        x = selector.get_top_features(self.X, label)
                    with timer('\nXGBoost scoring of features selected by ' + selector.__class__.__name__):
                        self.score_with_xgboost(x, self.y, selector.title)

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        print(self.ds.dataset.columns)
        self.ds.row_count_by_target('attack_category')

    def encode_scale(self):
        # Encode categoricals
        le = preprocessing.LabelEncoder()
        self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
        self.full['service'] = le.fit_transform(self.full['service'])
        self.full['flag'] = le.fit_transform(self.full['flag'])

        # Scale
        sc = MinMaxScaler()
        self.full[self.scale_cols] = sc.fit_transform(self.full[self.scale_cols])

    def set_X(self):
        self.X = self.full.iloc[:, :-2]

    def set_y(self, label):
        self.y = self.full[label]

    def score_with_xgboost(self, x, y, title):
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        # shuffle=True is required for random_state to take effect; reuse the same folds for the predictions
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_state)
        results = cross_val_score(clf, x, y, cv=kfold)
        print("XGBoost Accuracy: %.2f%% (+/- %.2f%%)" % (results.mean() * 100, results.std() * 100))
        y_pred = cross_val_predict(clf, x, y, cv=kfold)
        self.visualize.confusion_matrix(y, y_pred, title)
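
The selector wrappers used above (UnivariateSelector, RecursiveSelector, PCASelector, ...) are project classes whose internals are not shown here. A minimal sketch of the same select-then-score idea, using scikit-learn's SelectKBest as an assumed stand-in for a univariate selector and synthetic data:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

X_demo, y_demo = make_classification(n_samples=500, n_features=30,
                                     n_informative=10, random_state=20)

# Keep the 15 highest-scoring features (mirrors num_features = 15 above)
selector = SelectKBest(score_func=f_classif, k=15)
X_top = selector.fit_transform(X_demo, y_demo)

clf = XGBClassifier(n_estimators=100, random_state=20)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
scores = cross_val_score(clf, X_top, y_demo, cv=kfold)
print('XGBoost Accuracy: %.2f%% (+/- %.2f%%)' % (scores.mean() * 100,
                                                 scores.std() * 100))
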
Exemplo n.º 10
class Sampling:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None

        # RF Feature selected plus sparse cols
        self.cols = [
            'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count',
            'flag', 'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate',
            'service', 'dst_host_count', 'dst_host_srv_diff_host_rate',
            'logged_in', 'protocol_type', 'dst_host_same_src_port_rate', 'hot',
            'srv_count', 'wrong_fragment', 'num_compromised', 'rerror_rate',
            'srv_diff_host_rate', 'urgent', 'num_failed_logins', 'root_shell',
            'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'is_guest_login'
        ]

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nScaling'):
            # Sampling options
            for sampler in (Original(), RandomOverSampler(),
                            SMOTE(random_state=self.random_state),
                            ADASYN(random_state=self.random_state),
                            BorderlineSMOTE(random_state=self.random_state,
                                            kind='borderline-1')):

                # Copy to avoid SettingWithCopy issues when adding the target column
                self.X = self.full.loc[:, self.cols].copy()
                self.X['target'] = self.full['target']
                print('X shape with selected features and binary - ',
                      self.X.shape)

                self.X = pd.get_dummies(
                    data=self.X, columns=['protocol_type', 'service', 'flag'])
                print('X shape after encoding categoricals - ', self.X.shape)

                # Re-sample based on attack_category labels
                res_x, res_y_attack_category, title = self.sample(
                    sampler, self.X, self.full['attack_category'])

                # Grab target as y from the resampled X set, then drop it from X
                res_y_target = res_x['target']
                res_x.drop(columns=['target'], inplace=True)
                print('X shape after sampling and removing target - ',
                      res_x.shape)
                print('y shape with attack_category after resample - ',
                      res_y_attack_category.shape)
                print(res_y_attack_category.value_counts())
                res_y_attack_category.value_counts().plot(
                    kind='bar',
                    title=title + ' - Resampled Count (attack_category)')
                plt.show()
                print('y shape with target after resample - ',
                      res_y_target.shape)

                # Scale after resampling
                qt = QuantileTransformer(output_distribution='normal')
                res_x = qt.fit_transform(res_x)
                print('X shape after scaling - ', res_x.shape)

                # Score on attack_category multi-class
                self.model_and_score(res_x, res_y_attack_category, title,
                                     'attack_category')

                # Score on binary target
                self.model_and_score(res_x, res_y_target, title, 'target')

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_y(self, label):
        self.y = self.full[label]

    def sample(self, sampler, X, y):
        title = sampler.__class__.__name__
        res_x, res_y = sampler.fit_resample(X, y)
        if isinstance(res_x, np.ndarray):
            res_x = pd.DataFrame(res_x, columns=X.columns)

        if isinstance(res_y, np.ndarray):
            res_y = pd.Series(res_y)

        print('Shape after sampling with {} - x {},  y {}'.format(
            title, res_x.shape, res_y.shape))
        return res_x, res_y, title

    def model_and_score(self, X, y, title, label):
        clf = XGBClassifier(n_estimators=50, random_state=self.random_state)
        # shuffle=True is required for random_state to take effect; reuse the same folds for the predictions
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        results = cross_val_score(clf, X, y, cv=kfold)
        y_pred = cross_val_predict(clf, X, y, cv=kfold)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
            title, label,
            results.mean() * 100,
            results.std() * 100))
        self.visualize.confusion_matrix(
            y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))
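
The sample() helper above simply delegates to an imbalanced-learn sampler's fit_resample. A minimal standalone sketch of that call on a deliberately imbalanced toy dataset (synthetic data and class weights, not the KDD attack_category distribution):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# A skewed three-class toy problem standing in for attack_category
X_demo, y_demo = make_classification(n_samples=2000, n_features=10,
                                     n_informative=5, n_classes=3,
                                     weights=[0.9, 0.08, 0.02],
                                     random_state=20)
print('Before resampling:', Counter(y_demo))

sampler = SMOTE(random_state=20)
X_res, y_res = sampler.fit_resample(X_demo, y_demo)
print('After resampling: ', Counter(y_res))
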
Exemplo n.º 11
class Scaling:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None
        self.ac_count = {}
        self.scores = OrderedDict()
        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                           'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()
        with timer('\nEncoding categoricals'):
            le = preprocessing.LabelEncoder()
            self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
            self.full['service'] = le.fit_transform(self.full['service'])
            self.full['flag'] = le.fit_transform(self.full['flag'])
        with timer('\nSetting X'):
            self.set_X()
            self.ds.shape()
        with timer('\nDistribution Before Scaling'):
            self.dist_before_scaling()
        with timer('\nScaling'):
            for scaler in (StandardScaler(),
                           Normalizer(),
                           MinMaxScaler(feature_range=(0, 1)),
                           Binarizer(threshold=0.0),
                           RobustScaler(quantile_range=(25, 75)),
                           PowerTransformer(method='yeo-johnson'),
                           QuantileTransformer(output_distribution='normal')):
                title, res_x = self.scale(scaler)

                label = 'attack_category'
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)

                label = 'target'
                self.set_y(label)
                self.model_and_score(scaler, res_x, title, label)

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_attack_category_count(self):
        ac = self.full['attack_category'].value_counts()
        for key, value in ac.items():
            self.ac_count[key] = value

    def set_X(self):
        self.X = self.full.loc[:, self.scale_cols]

    def set_y(self, label):
        self.y = self.full[label]

    def dist_before_scaling(self):
        self.visualize.kdeplot('Distribution Before Scaling', self.X, self.scale_cols)

    def scale(self, scaler):
        x = self.X[self.scale_cols]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res_x = scaler.fit_transform(x)

        res_x = pd.DataFrame(res_x, columns=self.scale_cols)
        title = 'Distribution After ' + scaler.__class__.__name__
        self.visualize.kdeplot(title, res_x, self.scale_cols)
        return title, res_x

    def model_and_score(self, scaler, res_x, title, label):
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        # shuffle=True is required for random_state to take effect; reuse the same folds for the predictions
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_state)
        results = cross_val_score(clf, res_x, self.y, cv=kfold)
        y_pred = cross_val_predict(clf, res_x, self.y, cv=kfold)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(title, label, results.mean() * 100,
                                                                         results.std() * 100))
        self.visualize.confusion_matrix(self.y, y_pred, '{} - {} - Label {}'.format(title, clf.__class__.__name__,
                                                                                    label))
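
The scale() method above compares several scikit-learn scalers via the project's kdeplot helper. A compact sketch of the same before/after comparison on a single skewed synthetic feature (made-up lognormal values, plain histograms instead of the kdeplot wrapper):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, QuantileTransformer

# A heavily skewed synthetic feature, similar in spirit to src_bytes
rng = np.random.RandomState(20)
x = pd.DataFrame({'src_bytes': rng.lognormal(mean=3, sigma=2, size=5000)})

scalers = {'StandardScaler': StandardScaler(),
           'QuantileTransformer': QuantileTransformer(output_distribution='normal')}

fig, axes = plt.subplots(1, len(scalers) + 1, figsize=(12, 3))
axes[0].hist(x['src_bytes'], bins=50)
axes[0].set_title('Raw')
for ax, (name, scaler) in zip(axes[1:], scalers.items()):
    ax.hist(scaler.fit_transform(x).ravel(), bins=50)
    ax.set_title(name)
plt.tight_layout()
plt.show()
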
Exemplo n.º 12
class Preptensorinputs:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nPreparing Tensor Input Files'):
            for t2d in (Tensor2d_type_1(), Tensor2d_type_2()):
                with timer('\nBuilding 2d tensor - ' + t2d.__class__.__name__):
                    t2d.set_X(self.full)
                    t2d.encode_categoricals()
                    t2d.set_y(self.full)
                    t2d.sample()
                    t2d.scale()
                    t2d.pca()
                    t2d.add_target()
                    self.filehandler.write_csv(
                        self.ds.config['path'],
                        self.ds.config['file'] + '_' + t2d.__class__.__name__,
                        t2d.X)
                    print('Shape of ' + self.ds.config['file'] + '_' +
                          t2d.__class__.__name__ + ' : ' + str(t2d.X.shape))

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
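
Tensor2d_type_1 and Tensor2d_type_2 are project classes whose encode/scale/PCA steps are not reproduced here. As a rough, assumed approximation of that kind of pipeline, a scikit-learn ColumnTransformer followed by PCA over a toy frame could look like this (the rows and column subset are invented for illustration):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Toy frame with the same kind of mix as the KDD data: categoricals + numerics
df = pd.DataFrame({
    'protocol_type': ['tcp', 'udp', 'tcp', 'icmp'],
    'service': ['http', 'dns', 'smtp', 'echo'],
    'src_bytes': [181.0, 239.0, 235.0, 0.0],
    'dst_bytes': [5450.0, 486.0, 1337.0, 0.0],
})

pre = ColumnTransformer([
    ('cat', OrdinalEncoder(), ['protocol_type', 'service']),
    ('num', StandardScaler(), ['src_bytes', 'dst_bytes']),
])
pipeline = Pipeline([('prep', pre), ('pca', PCA(n_components=2))])

X_2d = pipeline.fit_transform(df)
print('2d tensor input shape:', X_2d.shape)
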
Exemplo n.º 13
class Preprocessing:
    def __init__(self):
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()

        with timer('\nLoading dataset'):
            self.ds.dataset = self.filehandler.read_csv(
                self.ds.config['path'], self.ds.config['file'])
            self.ds.set_columns()
        with timer('\nTransforming dataset'):
            self.ds.transform()
        with timer('\nInitial dataset discovery'):
            self.ds.shape()
            self.ds.show_duplicates(self.ds.config['level_01'])
            self.ds.drop_duplicates()
            self.show_zeros()
            self.ds.drop_outliers()
            self.ds.shape()
            self.ds.discovery()
        with timer('\nSetting target'):
            self.ds.set_target()
        with timer('\nEvaluating sparse features'):
            self.ds.evaluate_sparse_features(engineer=False)
        with timer('\nVisualising pairplot for selected columns'):
            self.visualize.pairplot(self.ds.dataset,
                                    self.ds.config['pairplot_cols'],
                                    self.ds.config['pairplot_target'])
        with timer('\nDropping columns'):
            self.ds.drop_cols(self.ds.config['drop_cols_01'])
        with timer('\nEvaluating correlation'):
            self.visualize.correlation_heatmap(
                self.ds.dataset,
                title='Correlation Heatmap Before Column Drop')
            self.ds.drop_highly_correlated()
            self.visualize.correlation_heatmap(
                self.ds.dataset, title='Correlation Heatmap After Column Drop')
        with timer('\nPersisting transformed dataset and target'):
            self.filehandler.write_csv(self.ds.config['path'],
                                       self.ds.config['file'] + '_processed',
                                       self.ds.dataset)
            self.filehandler.write_csv(self.ds.config['path'],
                                       self.ds.config['file'] + '_target',
                                       self.ds.target)
            self.ds.shape()

        self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def show_zeros(self):
        df = self.ds.dataset.iloc[:, :-3]
        # Transform 0s to NaN to visualise sparseness with missingno
        df[df == 0] = np.nan
        self.visualize.matrix_missing(
            df, 'Nullity matrix of features with 0 values')
        self.visualize.bar_missing(df, 'Bar plot of features with 0 values')
        self.visualize.heat_missing(df,
                                    'Heatmap of features with missing values')
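
matrix_missing, bar_missing and heat_missing are project-specific Visualize wrappers. A minimal sketch of the same zero-as-missing visualisation, using the missingno library directly (assuming that is roughly what the wrappers build on; the toy frame is invented):

import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt

# Toy frame where zeros denote "feature not observed", as in the KDD counters
df = pd.DataFrame({
    'duration': [0.0, 12.0, 0.0, 0.0, 7.0],
    'hot': [0.0, 0.0, 3.0, 0.0, 0.0],
    'num_failed_logins': [0.0, 0.0, 0.0, 1.0, 0.0],
})

df[df == 0] = np.nan   # treat zeros as missing for the sparsity plots
msno.matrix(df)        # nullity matrix
msno.bar(df)           # per-column non-null counts
plt.show()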