Python Filehandler примеры использования

Язык программирования: Python

Пространство имен/Пакет: filehandler

Класс/Тип: Filehandler

Примеров на hotexamples.com: 27

Python Filehandler - 27 примеров найдено. Это лучшие примеры Python кода для filehandler.Filehandler, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Filehandler(14)

read_csv(13)

write_csv(2)

Основные методы

Filehandler (14)

read_csv (13)

write_csv (2)

Пример #1

Показать файл

Файл: scaling.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Drive the scaling experiment from start to finish.

        Initialises the stdout-logging state, builds the helper objects, loads
        and label-encodes the data, then passes every candidate scaler through
        ``self.scale`` and ``self.model_and_score`` for both label columns.
        """
        # State read by log_file(); it has to exist before the first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.X = None
        self.y = None
        self.full = None
        self.ac_count = {}
        self.scores = OrderedDict()
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot',
            'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted',
            'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'is_guest_login',
            'count', 'srv_count', 'serror_rate', 'rerror_rate',
            'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        ]

        with timer('\nLoading dataset'):
            self.load_data()
            self.set_attack_category_count()

        with timer('\nEncoding categoricals'):
            # One encoder instance is re-fitted for each categorical column.
            encoder = preprocessing.LabelEncoder()
            for column in ('protocol_type', 'service', 'flag'):
                self.full[column] = encoder.fit_transform(self.full[column])

        with timer('\nSetting X'):
            self.set_X()
            self.ds.shape()

        with timer('\nDistribution Before Scaling'):
            self.dist_before_scaling()

        with timer('\nScaling'):
            scalers = (
                StandardScaler(),
                Normalizer(),
                MinMaxScaler(feature_range=(0, 1)),
                Binarizer(threshold=0.0),
                RobustScaler(quantile_range=(25, 75)),
                PowerTransformer(method='yeo-johnson'),
                QuantileTransformer(output_distribution='normal'),
            )
            for scaler in scalers:
                title, res_x = self.scale(scaler)
                # Score the same scaled features against each label in turn.
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    self.model_and_score(scaler, res_x, title, label)

        # Second log_file() call; presumably toggles the stdout redirect back
        # off (log_file is not visible in this snippet - confirm).
        self.log_file()
        print('Finished')

Пример #2

Показать файл

    def __init__(self):
        """Run the dataset preparation pipeline.

        Reads the raw CSV through the file handler, then calls the dataset
        object's transform / discovery / target / correlation steps in order
        and finally writes the processed dataset and target back out as CSV.
        """
        # State read by log_file(); it has to exist before the first call.
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()

        with timer('\nLoading dataset'):
            src_path = self.ds.config['path']
            src_file = self.ds.config['file']
            self.ds.dataset = self.filehandler.read_csv(src_path, src_file)
            self.ds.set_columns()

        with timer('\nTransforming dataset'):
            self.ds.transform()

        with timer('\nInitial dataset discovery'):
            self.ds.shape()
            duplicate_level = self.ds.config['level_01']
            self.ds.show_duplicates(duplicate_level)
            self.ds.drop_duplicates()
            self.show_zeros()
            self.ds.drop_outliers()
            self.ds.shape()
            self.ds.discovery()

        with timer('\nSetting target'):
            self.ds.set_target()

        with timer('\nEvaluating sparse features'):
            self.ds.evaluate_sparse_features(engineer=False)

        with timer('\nVisualising pairplot for selected columns'):
            self.visualize.pairplot(
                self.ds.dataset,
                self.ds.config['pairplot_cols'],
                self.ds.config['pairplot_target'],
            )

        with timer('\nDropping columns'):
            columns_to_drop = self.ds.config['drop_cols_01']
            self.ds.drop_cols(columns_to_drop)

        with timer('\nEvaluating correlation'):
            # Heatmap is drawn before and after the correlated columns go.
            title_before = 'Correlation Heatmap Before Column Drop'
            self.visualize.correlation_heatmap(self.ds.dataset, title=title_before)
            self.ds.drop_highly_correlated()
            title_after = 'Correlation Heatmap After Column Drop'
            self.visualize.correlation_heatmap(self.ds.dataset, title=title_after)

        with timer('\nPersisting transformed dataset and target'):
            processed_name = self.ds.config['file'] + '_processed'
            self.filehandler.write_csv(self.ds.config['path'], processed_name, self.ds.dataset)

            target_name = self.ds.config['file'] + '_target'
            self.filehandler.write_csv(self.ds.config['path'], target_name, self.ds.target)

            self.ds.shape()

        # Second log_file() call; presumably toggles the stdout redirect back
        # off (log_file is not visible in this snippet - confirm).
        self.log_file()
        print('Finished')

Пример #3

Показать файл

Файл: xgboostBinary.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Train an XGBoost binary classifier and print its test-set metrics.

        Prepares the dataset through the instance's own helper methods, fits
        an ``XGBClassifier`` on the training split, and derives the detection
        rate, false alarm rate and accuracy from the 2x2 confusion matrix of
        the test split. The results are stored on the instance and printed.
        """
        # State read by log_file(); it has to exist before the first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.n_classes = 2

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.label_map_int_2_string = {0: 'good', 1: 'bad', '0': 'good', '1': 'bad'}
        # Every attack category collapses to the single positive class 1.
        self.label_map_string_2_int = {'normal': 0, 'dos': 1, 'u2r': 1, 'r2l': 1, 'probe': 1}
        self.max_iters = 100

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.train_test_split()

        with timer('\nTesting model on unseen test set'):
            clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
            clf.fit(self.X_train, self.y_train)
            self.y_pred = clf.predict(self.X_test)

            # Rows of the confusion matrix are true labels, columns are
            # predicted labels; class 1 is the positive ("bad") class.
            cm = confusion_matrix(self.y_test, self.y_pred)
            self.tp = cm[1, 1]
            self.tn = cm[0, 0]
            self.fp = cm[0, 1]
            self.fn = cm[1, 0]

            print('True positive (TP)', self.tp)
            print('True negative (TN)', self.tn)
            print('False positive (FP)', self.fp)
            print('false negative (FN)', self.fn)

            # Detection rate is recall: the share of real attacks that were
            # caught, TP / (TP + FN). Dividing by TP + FP would be precision.
            self.dr = self.tp / (self.tp + self.fn)
            self.far = self.fp / (self.tn + self.fp)
            self.acc = (self.tp + self.tn) / (self.tp + self.tn + self.fp + self.fn)
            print('Detection rate: ', self.dr)
            print('False alarm rate: ', self.far)
            print('Accuracy: ', self.acc)

        self.log_file()
        print('Finished')

Пример #4

Показать файл

Файл: featureselection.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Run the feature-selection comparison.

        Loads, encodes and scales the data through the instance's helpers,
        then, for every selector and for each of the two label columns, fits
        the selector, takes its top features and hands them to
        ``self.score_with_xgboost``.
        """
        # State read by log_file(); it has to exist before the first call.
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.visualize = Visualize()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None
        self.random_state = 20
        self.num_features = 15
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot',
            'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted',
            'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'is_guest_login',
            'count', 'srv_count', 'serror_rate', 'rerror_rate',
            'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        ]

        with timer('\nLoading dataset'):
            self.load_data()
            self.encode_scale()
            self.set_X()

        with timer('\nFeature selection'):
            selectors = (
                Original(),
                UnivariateSelector(),
                RecursiveSelector(),
                PCASelector(),
                # KernelPCASelector() is commented out of this list.
                ExtraTreesSelector(),
                RandomForestSelector(),
            )
            for selector in selectors:
                selector_name = selector.__class__.__name__
                for label in ('attack_category', 'target'):
                    self.set_y(label)
                    with timer('\nFitting selector ' + selector_name):
                        selector.fit_model(self.X, self.y)
                        x = selector.get_top_features(self.X, label)
                    with timer('\nXGBoost scoring of features selected by ' + selector_name):
                        self.score_with_xgboost(x, self.y, selector.title)

        # Second log_file() call; presumably toggles the stdout redirect back
        # off (log_file is not visible in this snippet - confirm).
        self.log_file()
        print('Finished')

Пример #5

Показать файл

    def preprocess(self, envparm):
        """Load the raw dataset, split its columns and build enabled variants.

        Args:
            envparm: mapping of run switches. The keys read here are
                'PlotGraphs', 'ProcessDS01', 'ProcessDS02' and 'ProcessDS03'.
        """
        self.filehandler = Filehandler()
        raw_path = self.filehandler.data_raw_path
        self.dataset_raw = self.filehandler.read_csv(raw_path)
        logging.info('Original raw dataset loaded - dataset size {}'.format(
            self.dataset_raw.shape))

        # Columns 0..189 are treated as numerical, the remainder as categorical.
        logging.info('Partitioning numerical features')
        self.numerical_features_raw = self.dataset_raw.iloc[:, 0:190].copy()

        logging.info('Partitioning categorical features')
        self.categorical_features_raw = self.dataset_raw.iloc[:, 190:].copy()

        if envparm['PlotGraphs']:
            # Missing-value plots for a sample of the leading numeric columns.
            num_size = 28
            sample_df = self.dataset_raw.iloc[:, :num_size].copy()
            numeric_suffix = str(num_size) + ' Numeric Features'
            numeric_plots = (
                (visualizer.matrix_missing, 'Data Completion First ' + numeric_suffix),
                (visualizer.bar_missing, 'Nullity Count First ' + numeric_suffix),
                (visualizer.heat_missing, 'Nullity Correlation Of First ' + numeric_suffix),
            )
            for draw, caption in numeric_plots:
                draw(sample_df, caption)

            # The same three plots for all categorical columns.
            sample_df = self.dataset_raw.iloc[:, 190:].copy()
            categorical_plots = (
                (visualizer.matrix_missing, 'Data Completion Categorical Features'),
                (visualizer.bar_missing, 'Nullity Count Categorical Features'),
                (visualizer.heat_missing, 'Nullity Correlation Of Categorical Features'),
            )
            for draw, caption in categorical_plots:
                draw(sample_df, caption)

        # Build each output variant whose switch is on, in numeric order. The
        # method is looked up only when its switch is set.
        for variant in ('01', '02', '03'):
            if envparm['ProcessDS' + variant]:
                getattr(self, 'prepare_output_variant_' + variant)()

Пример #6

Показать файл

Файл: preptensorinput.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Build the 2d tensor input files and write each one out as CSV.

        For every tensor builder the same sequence of builder methods is
        called on the loaded dataset; the resulting ``X`` is written through
        the file handler and its shape is printed.
        """
        # State read by log_file(); it has to exist before the first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.X = None
        self.y = None
        self.full = None

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nPreparing Tensor Input Files'):
            builders = (Tensor2d_type_1(), Tensor2d_type_2())
            for t2d in builders:
                builder_name = t2d.__class__.__name__
                with timer('\nBuilding 2d tensor - ' + builder_name):
                    t2d.set_X(self.full)
                    t2d.encode_categoricals()
                    t2d.set_y(self.full)
                    t2d.sample()
                    t2d.scale()
                    t2d.pca()
                    t2d.add_target()

                    # Output file is named after the dataset file plus the
                    # builder's class name.
                    out_name = self.ds.config['file'] + '_' + builder_name
                    self.filehandler.write_csv(self.ds.config['path'], out_name, t2d.X)
                    print('Shape of ' + out_name + ' : ' + str(t2d.X.shape))

        # Second log_file() call; presumably toggles the stdout redirect back
        # off (log_file is not visible in this snippet - confirm).
        self.log_file()
        print('Finished')

Пример #7

Показать файл

Файл: main.py Проект: corticalstack/KDDCup2009

def main():
    """Run preprocessing and modelling as selected by the ``envparm`` switches.

    Reads the module-level ``envparm`` mapping: 'PrepEnabled' triggers the
    preprocessor, and each 'ProcessDS0N' switch sends the matching
    ``dataset_prep_path_0N`` of the file handler through ``process_dataset``.
    Scores are written out only if the modeller collected any.
    """
    filehandler = Filehandler()
    modeller = Modeller()

    if envparm['PrepEnabled']:
        logging.info("Executing preprocessor")
        Preprocessor(envparm)

    # Handle the dataset variants in numeric order; the path attribute is
    # looked up only for variants that are switched on.
    for variant in ('01', '02', '03'):
        if not envparm['ProcessDS' + variant]:
            continue
        prep_path = getattr(filehandler, 'dataset_prep_path_' + variant)
        process_dataset(filehandler, modeller, prep_path)

    if modeller.scores:
        modeller.output_scores(filehandler)

Пример #8

Показать файл

class AnnMLPOptimiseEvaluateMulti:
    """Visualise a multi-class hyperparameter search against a baseline.

    Constructing an instance runs the whole pipeline: it loads the dataset,
    fits a logistic-regression baseline, derives a baseline detection rate
    (DR) from its confusion matrix, loads the hyperparameter tuning results
    and saves/shows a series of plots of the validation DR change over that
    baseline. Unless a trace function is active (debug mode), stdout is
    redirected to a log file for the duration of the run.
    """

    def __init__(self):
        # State read by log_file(); it has to exist before the first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.n_classes = 5
            self.random_state = 20
            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.folder = 'viz'
            self.fprefix_multi = 'Hyper - annMLPMulti - '

            # Datasets
            self.X = None
            self.y = None
            self.X_train = None
            self.X_test = None
            self.y_train = None
            self.y_test = None
            self.hyp = None
            self.lr = None
            self.label_map_string_2_int = {
                'normal': 0,
                'dos': 1,
                'u2r': 2,
                'r2l': 3,
                'probe': 4
            }
            self.max_iters = 100

            with timer('\nPreparing dataset'):
                self.load_data()
                self.set_y()
                self.remove_target_from_X()
                self.train_test_split()

            with timer('\nPreparing base logistic regression'):
                self.lr = LogisticRegression(max_iter=self.max_iters)
                self.lr.fit(self.X_train, self.y_train)

            with timer('\nPreparing confusion matrix and base DR'):
                self._compute_baseline_dr()

            with timer('\nVisualising optimisation search'):
                self._visualise_search()
        finally:
            # Restore stdout and close the log file even if the pipeline
            # raised; otherwise the redirect would outlive the failed run.
            self.log_file()
        print('Finished')

    def _compute_baseline_dr(self):
        """Score the baseline model on the test split and store its DR."""
        self.y_pred = self.lr.predict(self.X_test)
        cm = confusion_matrix(self.y_test, self.y_pred)
        self.tp = self.get_tp_from_cm(cm)
        self.tn = self.get_tn_from_cm(cm)
        self.fp = self.get_fp_from_cm(cm)
        self.fn = self.get_fn_from_cm(cm)
        # DR = TP / (TP + FN), the same formula get_base_dr uses. Summed over
        # all classes, FP and FN both equal the off-diagonal total, so this
        # yields the same number as dividing by TP + FP.
        self.dr = self.tp / (self.tp + self.fn)
        print('log reg dr', self.dr)

    def _visualise_search(self):
        """Load the tuning results and produce every plot of the search."""
        self.load_hyp()
        self.hyp['lr'] = round(self.hyp['lr'] / 1000, 3)

        # Hyperparameter correlation with val DR
        # NOTE(review): hyp_val_dr is the same object as self.hyp, so the
        # in-place drop below removes these columns from self.hyp as well.
        # Kept as-is because the later print of self.hyp reflects it -
        # confirm whether a copy was intended.
        self.hyp_val_dr = self.hyp
        self.hyp_val_dr.drop([
            'round_epochs', 'epochs', 'loss', 'dr', 'far', 'acc',
            'val_loss', 'val_acc', 'val_far'
        ],
                             axis=1,
                             inplace=True)
        self.dr_corr = self.hyp_val_dr.corr()
        self._plot_correlation('Validation DR Hyperparameter Correlation')

        self.hyp['val_dr_change'] = round(self.hyp.val_dr - self.dr, 3)
        pd.set_option('display.max_columns', 100)
        print(self.hyp.sort_values(by='val_dr', ascending=False).head())

        self.color = 'cornflowerblue'

        # (metric column, plot title, x-axis label, figure size)
        boxplots = (
            ('lr',
             'Validation DR Change Over Baseline As Fn Of Learning Rate',
             'Learning Rate', (10, 6)),
            ('first_neuron',
             'Validation DR Change Over Baseline As Fn Of # Neurons First Layer',
             'First Neuron', (10, 6)),
            ('hidden_layers',
             'Validation DR Change Over Baseline As Fn Of # Hidden Layers',
             'Hidden Layers', (10, 6)),
            ('hidden_neuron',
             'Validation DR Change Over Baseline As Fn Of # Hidden Layer Neurons',
             'Hidden Neurons', (10, 6)),
            ('batch_size',
             'Validation DR Change Over Baseline As Fn Of Batch Size',
             'Batch Size', (10, 6)),
            ('dropout',
             'Validation DR Change Over Baseline As Fn Of Dropout',
             'Dropout', (12, 8)),
        )
        for metric, title, xlabel, figsize in boxplots:
            self._plot_change_boxplot(metric, title, xlabel, figsize)

        # (group-by columns [rows, columns], title, x label, y label,
        #  title font size)
        heatmaps = (
            (['first_neuron', 'hidden_neuron'],
             'Validation DR Change Over Baseline As Fn Of First Neuron & Hidden Neuron',
             'Hidden Neuron', 'First Neuron', 12),
            (['hidden_layers', 'hidden_neuron'],
             'Validation DR Change Over Baseline As Fn Of Hidden Layers & Hidden Neuron',
             'Hidden Neuron', 'Hidden Layers', 16),
            (['batch_size', 'dropout'],
             'Validation DR Change Over Baseline As Fn Of Batch Size & Dropout',
             'Dropout', 'Batch Size', 16),
            (['lr', 'dropout'],
             'Validation DR Change Over Baseline As Fn Of Learning Rate & Dropout',
             'Dropout', 'Learning Rate', 16),
        )
        for group_cols, title, xlabel, ylabel, title_size in heatmaps:
            self._plot_change_heatmap(group_cols, title, xlabel, ylabel,
                                      title_size)

    def _plot_correlation(self, title):
        """Save and show the annotated heatmap of ``self.dr_corr``."""
        plt.clf()
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.set_title(title, size=16)
        colormap = sns.diverging_palette(220, 10, as_cmap=True)
        sns.heatmap(self.dr_corr,
                    cmap=colormap,
                    annot=True,
                    fmt=".2f",
                    cbar=False,
                    vmin=-0.4,
                    vmax=0.4)
        plt.xticks(range(len(self.dr_corr.columns)), self.dr_corr.columns)
        plt.yticks(range(len(self.dr_corr.columns)), self.dr_corr.columns)
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def _plot_change_boxplot(self, metric, title, xlabel, figsize=(10, 6)):
        """Save and show a boxplot of val_dr_change grouped by ``metric``."""
        plt.clf()
        plt.subplots(figsize=figsize)
        sns.boxplot(x=metric,
                    y='val_dr_change',
                    data=self.hyp.reset_index(),
                    color=self.color)
        plt.title(title, fontsize=16)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Validation DR Change', fontsize=12)
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def _plot_change_heatmap(self, group_cols, title, xlabel, ylabel,
                             title_size=16):
        """Save and show a heatmap of mean val_dr_change per column pair."""
        plt.clf()
        plt.subplots(figsize=(9, 7))
        df_grid = self.hyp.reset_index().groupby(
            group_cols).val_dr_change.mean().unstack()
        sns.heatmap(data=df_grid,
                    cmap=(sns.diverging_palette(10, 220, sep=80, n=7)),
                    annot=True,
                    cbar=False)
        plt.title(title, fontsize=title_size)
        plt.xlabel(xlabel, fontsize=10)
        plt.ylabel(ylabel, fontsize=10)
        plt.savefig(fname=self.fname(title), dpi=300, format='png')
        plt.show()

    def get_base_dr(self):
        """Return TP / (TP + FN) for a constant 0.5 prediction on y_train.

        NOTE(review): nothing in this class calls this method, and the
        constant prediction 0.5 is not one of the integer class labels -
        confirm it is still needed and behaves as intended.
        """
        y_pred = pd.Series(0.5, index=self.y_train.index)
        cm = confusion_matrix(self.y_train, y_pred)
        tp = self.get_tp_from_cm(cm)
        fn = self.get_fn_from_cm(cm)
        dr = tp / (tp + fn)
        print('dr ', dr)
        return dr

    # True positives are the diagonal elements
    def get_tp_from_cm(self, cm):
        """Return (and print) the sum of the confusion matrix diagonal."""
        tp = np.sum(np.diag(cm))
        print('tp', tp)
        return tp

    def get_tn_from_cm(self, cm):
        """Return (and print) the true negatives summed over all classes.

        For class i the true negatives are every cell outside row i and
        column i.
        """
        tn = []
        for i in range(self.n_classes):
            remainder = np.delete(cm, i, 0)  # delete ith row
            remainder = np.delete(remainder, i, 1)  # delete ith column
            tn.append(remainder.sum())
        print('tn ', np.sum(tn))
        return np.sum(tn)

    # Sum of columns minus diagonal
    def get_fp_from_cm(self, cm):
        """Return (and print) the false positives summed over all classes."""
        fp = [sum(cm[:, i]) - cm[i, i] for i in range(self.n_classes)]
        print('fp ', np.sum(fp))
        return np.sum(fp)

    # Sum of rows minus diagonal
    def get_fn_from_cm(self, cm):
        """Return (and print) the false negatives summed over all classes."""
        fn = [sum(cm[i, :]) - cm[i, i] for i in range(self.n_classes)]
        print('fn', np.sum(fn))
        return np.sum(fn)

    def log_file(self):
        """Toggle redirection of stdout to a timestamped log file.

        Does nothing when ``sys.gettrace`` is unavailable or returns a trace
        function (debug mode). Otherwise the first call opens the log file
        and points stdout at it; the next call restores the original stdout
        and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return

        # Redirect stdout to file for logging if not in debug mode
        self.logfile = open(
            'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                           self.timestr), 'w')
        sys.stdout = self.logfile

    def load_data(self):
        """Read the '_Tensor2d_type_1' CSV into self.X and print its shape."""
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def load_hyp(self):
        """Read the hyperparameter tuning results CSV into self.hyp."""
        self.hyp = pd.read_csv(
            'tuning/Hyperparameter tuning - AnnMLPMultiOptimize_1.csv')

    def set_y(self):
        """Set self.y to the 'attack_category' column mapped to int codes."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the 'attack_category' column from self.X in place."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split self.X / self.y 70/30 into the train and test attributes."""
        # Inside the method body the bare name resolves to the module-level
        # train_test_split function, not to this method.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def fname(self, title):
        """Return the PNG path for a plot title: folder/prefix + title."""
        return '{}/{}.png'.format(self.folder, self.fprefix_multi + title)

Пример #9

Показать файл

Файл: xgboostBinary.py Проект: corticalstack/KDDCup1999

class XGBoostBinary:
    """Train an XGBoost binary classifier and report its test-set metrics.

    Constructing an instance runs the whole pipeline: it loads the dataset,
    maps the attack categories onto a binary label, splits the data, fits
    the classifier and prints the detection rate, false alarm rate and
    accuracy. Unless a trace function is active (debug mode), stdout is
    redirected to a log file for the duration of the run.
    """

    # Output folder read by fname(). The original never defined it, so
    # fname() raised AttributeError; 'viz' mirrors the folder used by
    # AnnMLPOptimiseEvaluateMulti - TODO confirm the intended location.
    folder = 'viz'

    def __init__(self):
        # State read by log_file(); it has to exist before the first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.random_state = 20
            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.n_classes = 2

            # Datasets
            self.X = None
            self.y = None
            self.X_train = None
            self.X_test = None
            self.y_train = None
            self.y_test = None
            self.label_map_int_2_string = {0: 'good', 1: 'bad', '0': 'good', '1': 'bad'}
            # Every attack category collapses to the single positive class 1.
            self.label_map_string_2_int = {'normal': 0, 'dos': 1, 'u2r': 1, 'r2l': 1, 'probe': 1}
            self.max_iters = 100

            with timer('\nPreparing dataset'):
                self.load_data()
                self.set_y()
                self.remove_target_from_X()
                self.train_test_split()

            with timer('\nTesting model on unseen test set'):
                clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
                clf.fit(self.X_train, self.y_train)
                self.y_pred = clf.predict(self.X_test)

                # Rows of the confusion matrix are true labels, columns are
                # predicted labels; class 1 is the positive ("bad") class.
                cm = confusion_matrix(self.y_test, self.y_pred)
                self.tp = cm[1, 1]
                self.tn = cm[0, 0]
                self.fp = cm[0, 1]
                self.fn = cm[1, 0]

                print('True positive (TP)', self.tp)
                print('True negative (TN)', self.tn)
                print('False positive (FP)', self.fp)
                print('false negative (FN)', self.fn)

                self.dr, self.far, self.acc = self._rates(self.tp, self.tn, self.fp, self.fn)
                print('Detection rate: ', self.dr)
                print('False alarm rate: ', self.far)
                print('Accuracy: ', self.acc)
        finally:
            # Restore stdout and close the log file even if the pipeline
            # raised; otherwise the redirect would outlive the failed run.
            self.log_file()
        print('Finished')

    @staticmethod
    def _rates(tp, tn, fp, fn):
        """Return (detection rate, false alarm rate, accuracy).

        Detection rate is recall, TP / (TP + FN): the share of real attacks
        that were caught. The original divided by TP + FP, which is
        precision. False alarm rate is FP / (TN + FP).
        """
        dr = tp / (tp + fn)
        far = fp / (tn + fp)
        acc = (tp + tn) / (tp + tn + fp + fn)
        return dr, far, acc

    def log_file(self):
        """Toggle redirection of stdout to a timestamped log file.

        Does nothing when ``sys.gettrace`` is unavailable or returns a trace
        function (debug mode). Otherwise the first call opens the log file
        and points stdout at it; the next call restores the original stdout
        and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return

        # Redirect stdout to file for logging if not in debug mode
        self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
        sys.stdout = self.logfile

    def load_data(self):
        """Read the '_Tensor2d_type_1' CSV into self.X and print its shape."""
        self.X = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        """Set self.y to the 'attack_category' column mapped to 0/1."""
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        """Drop the 'attack_category' column from self.X in place."""
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split self.X / self.y 70/30 into the train and test attributes."""
        # Inside the method body the bare name resolves to the module-level
        # train_test_split function, not to this method.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.30,
                                                                                random_state=self.random_state)

    def map_target_to_label(self, t):
        """Map each element of ``t`` through label_map_int_2_string."""
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        """Return the PNG path for a plot title inside ``self.folder``."""
        return '{}/{}.png'.format(self.folder, title)

Пример #10

Показать файл

Файл: clustering.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Run the clustering experiment.

        Loads, encodes and scales the dataset through the instance's helpers,
        then calls ``self.cluster`` in 2D and 3D for every configured column
        triple and again on the PCA indexes, for each cluster count from 2 up
        to (but excluding) ``self.clusters_stop``.
        """
        # State read by log_file(); it has to exist before the first call.
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.clusters_stop = 11
        self.x = None
        self.y = None
        self.full = None
        self.ac_count = {}
        self.feature_idx = {0: 0, 1: 0, 2: 0}
        self.pca_idx = {0: 0, 1: 1, 2: 2, 'pca': True}
        self.kernelpca_idx = {0: 0, 1: 1, 2: 2, 'kpca': True}
        self.scale_cols = [
            'duration', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot',
            'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted',
            'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'is_guest_login',
            'count', 'srv_count', 'serror_rate', 'rerror_rate',
            'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        ]
        self.cluster_cols = [
            ('count', 'diff_srv_rate', 'src_bytes'),
            ('src_bytes', 'dst_host_srv_count', 'dst_bytes'),
            ('srv_diff_host_rate', 'srv_count', 'serror_rate'),
            ('serror_rate', 'dst_host_diff_srv_rate', 'flag'),
        ]

        # Each clustering pass runs once flat and once with a 3d projection.
        views = (('2D', {}), ('3D', {'projection': '3d'}))

        with timer('\nLoading dataset'):
            self.load_data()
            self.ds.shape()

        with timer('\nEncode and Scale dataset'):
            self.encode_scale()

        with timer('\nSetting X and y'):
            self.set_x_y()

        with timer('\nPlotting clusters for specific columns'):
            for cola, colb, colc in self.cluster_cols:
                for n_clusters in range(2, self.clusters_stop):
                    self.set_indexes(cola, colb, colc)
                    for dims, extra in views:
                        with timer('\n' + dims + ' clustering without PCA'):
                            self.cluster(idx=self.feature_idx, n_clusters=n_clusters, **extra)

        with timer('\nPlotting clusters applying PCA'):
            for n_clusters in range(2, self.clusters_stop):
                for dims, extra in views:
                    with timer('\n' + dims + ' clustering with PCA'):
                        self.cluster(idx=self.pca_idx, n_clusters=n_clusters, **extra)

        # Kernel PCA clustering stays disabled: commented out due to memory error
        #with timer('\nPlotting clusters Kernel applying PCA'):
        #    for c in range(2, 7):
        #        with timer('\n2D clustering with Kernel PCA'):
        #            self.cluster(idx=self.kernelpca_idx, n_clusters=c)
        #        with timer('\n3D clustering with Kernel PCA'):
        #            self.cluster(idx=self.kernelpca_idx, n_clusters=c, projection='3d')

        # Second log_file() call; presumably toggles the stdout redirect back
        # off (log_file is not visible in this snippet - confirm).
        self.log_file()
        print('Finished')

Пример #11

Показать файл

Файл: clustering.py Проект: corticalstack/KDDCup1999

class Clustering:
    def __init__(self):
        """Run the k-means clustering experiment as a side effect of construction.

        Steps, in order: load the dataset, encode and scale it, split it into
        ``self.x`` / ``self.y``, then fit and plot k-means for every cluster
        count in ``range(2, self.clusters_stop)`` - first for each column
        triple in ``self.cluster_cols`` (2D and 3D), then with
        ``self.pca_idx`` (2D and 3D).

        Unless ``log_file`` detects an active trace function, stdout is
        redirected to a log file while the experiment runs and restored
        afterwards, including when the run fails with an exception.
        """
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        # Whatever happens below, undo the stdout redirection and close the
        # log file so a failure does not leave stdout pointing at the file.
        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.visualize = Visualize()
            self.random_state = 20
            self.clusters_stop = 11  # exclusive upper bound of the cluster counts tried
            self.x = None
            self.y = None
            self.full = None
            self.ac_count = {}
            # Axis number -> column position in self.x; filled by set_indexes().
            self.feature_idx = {0: 0, 1: 0, 2: 0}
            # NOTE(review): the 'pca' / 'kpca' flags are interpreted by
            # Visualize.scatter_clusters, which is not visible here.
            self.pca_idx = {0: 0, 1: 1, 2: 2, 'pca': True}
            self.kernelpca_idx = {0: 0, 1: 1, 2: 2, 'kpca': True}
            self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                               'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                               'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                               'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                               'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                               'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']
            self.cluster_cols = [('count', 'diff_srv_rate', 'src_bytes'),
                                 ('src_bytes', 'dst_host_srv_count', 'dst_bytes'),
                                 ('srv_diff_host_rate', 'srv_count', 'serror_rate'),
                                 ('serror_rate', 'dst_host_diff_srv_rate', 'flag')]

            with timer('\nLoading dataset'):
                self.load_data()
                self.ds.shape()
            with timer('\nEncode and Scale dataset'):
                self.encode_scale()
            with timer('\nSetting X and y'):
                self.set_x_y()
            with timer('\nPlotting clusters for specific columns'):
                for cola, colb, colc in self.cluster_cols:
                    # The column positions depend only on the triple, not on
                    # the cluster count, so resolve them once per triple.
                    self.set_indexes(cola, colb, colc)
                    for c in range(2, self.clusters_stop):
                        with timer('\n2D clustering without PCA'):
                            self.cluster(idx=self.feature_idx, n_clusters=c)
                        with timer('\n3D clustering without PCA'):
                            self.cluster(idx=self.feature_idx, n_clusters=c, projection='3d')
            with timer('\nPlotting clusters applying PCA'):
                for c in range(2, self.clusters_stop):
                    with timer('\n2D clustering with PCA'):
                        self.cluster(idx=self.pca_idx, n_clusters=c)
                    with timer('\n3D clustering with PCA'):
                        self.cluster(idx=self.pca_idx, n_clusters=c, projection='3d')
            # Commented out due to memory error
            #with timer('\nPlotting clusters Kernel applying PCA'):
            #    for c in range(2, 7):
            #        with timer('\n2D clustering with Kernel PCA'):
            #            self.cluster(idx=self.kernelpca_idx, n_clusters=c)
            #        with timer('\n3D clustering with Kernel PCA'):
            #            self.cluster(idx=self.kernelpca_idx, n_clusters=c, projection='3d')
        finally:
            self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def encode_scale(self):
        """Label-encode the categorical columns and standardise ``scale_cols``.

        ``self.full`` is modified in place: 'protocol_type', 'service' and
        'flag' are replaced by their label-encoded values, and the columns in
        ``self.scale_cols`` by the output of ``StandardScaler``.
        """
        # One encoder object is reused; fit_transform refits it per column.
        encoder = preprocessing.LabelEncoder()
        for categorical in ('protocol_type', 'service', 'flag'):
            self.full[categorical] = encoder.fit_transform(self.full[categorical])

        # Scale
        scaler = StandardScaler()
        numeric_cols = self.scale_cols
        self.full[numeric_cols] = scaler.fit_transform(self.full[numeric_cols])

    def set_x_y(self):
        self.x = self.full.iloc[:, :-2]
        self.y = self.full['target']

    def set_indexes(self, cola, colb, colc):
        self.feature_idx[0] = self.x.columns.get_loc(cola)
        self.feature_idx[1] = self.x.columns.get_loc(colb)
        self.feature_idx[2] = self.x.columns.get_loc(colc)

    def cluster(self, idx, n_clusters, projection=None):
        """Fit k-means on ``self.x`` and plot the resulting cluster labels.

        Args:
            idx: index mapping forwarded unchanged to
                ``self.visualize.scatter_clusters``.
            n_clusters: number of clusters to fit.
            projection: forwarded to the plot helper; callers in this class
                pass ``'3d'`` or leave it as ``None``.
        """
        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state)
        # fit_predict() fits the model itself, so a separate fit() beforehand
        # only repeats the same (seeded, hence identical) computation.
        y_km = kmeans.fit_predict(self.x)
        self.visualize.scatter_clusters(self.x, n_clusters, y_km, idx, projection)

Пример #12

Показать файл

    def __init__(self):
        """Run a Talos hyperparameter scan at construction time and report it.

        Loads the dataset, derives ``self.y`` and one-hot encodes it, converts
        ``self.X`` / ``self.y`` to arrays, scans the ``self.p1`` parameter
        space with ``ta.Scan`` (``self.get_model`` builds each model) and
        prints a summary taken from ``ta.Reporting``.
        """
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        # NOTE(review): the session is never referenced again; kept because it
        # is created with log_device_placement=True - confirm it is needed.
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.folder = 'tuning'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 2,
            'r2l': 3,
            'probe': 4
        }

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            # Layer-width candidates: all, 50% and 80% of the feature count.
            self.n_features_all = self.X.shape[1]
            self.n_features_50pct = int(self.n_features_all * 0.5)
            self.n_features_80pct = int(self.n_features_all * 0.8)
            self.y = pd.get_dummies(self.y)
            self.X = self.X.values
            self.y = self.y.values

        with timer('\nSearching parameter space'):
            # self.p = {'lr': (0.5, 5, 10),
            #      'first_neuron': [self.n_features_70pct, self.n_features_all],
            #      'hidden_layers': [0, 1, 2],
            #      'hidden_neuron': [self.n_features_70pct, self.n_features_all],
            #      'batch_size': [100, 200],
            #      'epochs': [30],
            #      'dropout': (0, 0.2, 0.5),
            #      'weight_regulizer': [None],
            #      'emb_output_dims': [None],
            #      'shape': ['brick', 'long_funnel'],
            #      'optimizer': [Adam, RMSprop],
            #      'losses': [binary_crossentropy],
            #      'activation': [relu],
            #      'last_activation': [sigmoid]}

            # Single-combination space; defined but not passed to the scan below.
            self.ptest = {
                'lr': [10],
                'first_neuron': [self.n_features_all],
                'hidden_layers': [1],
                'hidden_neuron': [self.n_features_all],
                'batch_size': [100],
                'epochs': [5],
                'dropout': [0.2],
                'optimizer': [SGD],
                'activation': [relu],
                'last_activation': [softmax]
            }

            # Space actually scanned.  NOTE(review): the tuple values are
            # presumably Talos (min, max, steps) ranges - confirm.
            self.p1 = {
                'lr': (0.5, 5, 10),
                'first_neuron': [
                    self.n_features_50pct, self.n_features_80pct,
                    self.n_features_all
                ],
                'hidden_layers': [1, 2, 3],
                'hidden_neuron': [
                    self.n_features_50pct, self.n_features_80pct,
                    self.n_features_all
                ],
                'batch_size': [100, 500, 1000],
                'epochs': [20],
                'dropout': (0, 0.2, 5),
                'optimizer': [SGD, RMSprop],
                'activation': [relu],
                'last_activation': [softmax]
            }

            dataset_name = self.folder + '/Hyperparameter tuning - ' + self.__class__.__name__
            scan = ta.Scan(x=self.X,
                           y=self.y,
                           model=self.get_model,
                           params=self.p1,
                           grid_downsample=0.01,
                           dataset_name=dataset_name,
                           experiment_no='1')

            with timer('\nEvaluating Scan'):
                r = ta.Reporting(scan)

                # get the number of rounds in the Scan
                print('\nNumber of rounds in scan ', r.rounds())

                # get highest results
                # The accuracy line used to query 'val_dr', duplicating the
                # detection-rate line below; it now reads validation accuracy.
                print('\nHighest validation accuracy', r.high('val_acc'))
                print('\nHighest validation detection rate', r.high('val_dr'))
                print('\nHighest validation false alarm rate',
                      r.high('val_far'))

                # get the highest result for any metric
                print(r.high('val_dr'))

                # get the round with the best result
                print('Best round', r.rounds2high())

                # get the best parameters
                print(r.best_params())

                #r.plot_corr()
                #plt.show()

                # a four dimensional bar grid
                #r.plot_bars('batch_size', 'val_dr', 'hidden_layers', 'lr')
                #plt.show()

        print('Finished')

Пример #13

Показать файл

class AnnMLPMultiOptimize:
    def __init__(self):
        """Run the Talos hyperparameter scan for the multi-class MLP.

        Construction does all the work: it loads the dataset, maps and
        one-hot encodes the target, converts ``self.X`` / ``self.y`` to
        arrays, scans the ``self.p1`` parameter space with ``ta.Scan`` using
        ``self.get_model`` as model builder, and prints a summary taken from
        ``ta.Reporting``.
        """
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        # NOTE(review): the session is never referenced again; kept because it
        # is created with log_device_placement=True - confirm it is needed.
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.folder = 'tuning'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        # Attack category name -> integer class used by set_y().
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 2,
            'r2l': 3,
            'probe': 4
        }

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            # Layer-width candidates: all, 50% and 80% of the feature count.
            self.n_features_all = self.X.shape[1]
            self.n_features_50pct = int(self.n_features_all * 0.5)
            self.n_features_80pct = int(self.n_features_all * 0.8)
            # One column per class, matching the 5-unit output layer of get_model().
            self.y = pd.get_dummies(self.y)
            self.X = self.X.values
            self.y = self.y.values

        with timer('\nSearching parameter space'):
            # self.p = {'lr': (0.5, 5, 10),
            #      'first_neuron': [self.n_features_70pct, self.n_features_all],
            #      'hidden_layers': [0, 1, 2],
            #      'hidden_neuron': [self.n_features_70pct, self.n_features_all],
            #      'batch_size': [100, 200],
            #      'epochs': [30],
            #      'dropout': (0, 0.2, 0.5),
            #      'weight_regulizer': [None],
            #      'emb_output_dims': [None],
            #      'shape': ['brick', 'long_funnel'],
            #      'optimizer': [Adam, RMSprop],
            #      'losses': [binary_crossentropy],
            #      'activation': [relu],
            #      'last_activation': [sigmoid]}

            # Single-combination space; defined but not passed to the scan below.
            self.ptest = {
                'lr': [10],
                'first_neuron': [self.n_features_all],
                'hidden_layers': [1],
                'hidden_neuron': [self.n_features_all],
                'batch_size': [100],
                'epochs': [5],
                'dropout': [0.2],
                'optimizer': [SGD],
                'activation': [relu],
                'last_activation': [softmax]
            }

            # Space actually scanned.  NOTE(review): the tuple values are
            # presumably Talos (min, max, steps) ranges - confirm.
            self.p1 = {
                'lr': (0.5, 5, 10),
                'first_neuron': [
                    self.n_features_50pct, self.n_features_80pct,
                    self.n_features_all
                ],
                'hidden_layers': [1, 2, 3],
                'hidden_neuron': [
                    self.n_features_50pct, self.n_features_80pct,
                    self.n_features_all
                ],
                'batch_size': [100, 500, 1000],
                'epochs': [20],
                'dropout': (0, 0.2, 5),
                'optimizer': [SGD, RMSprop],
                'activation': [relu],
                'last_activation': [softmax]
            }

            dataset_name = self.folder + '/Hyperparameter tuning - ' + self.__class__.__name__
            scan = ta.Scan(x=self.X,
                           y=self.y,
                           model=self.get_model,
                           params=self.p1,
                           grid_downsample=0.01,
                           dataset_name=dataset_name,
                           experiment_no='1')

            with timer('\nEvaluating Scan'):
                r = ta.Reporting(scan)

                # get the number of rounds in the Scan
                print('\nNumber of rounds in scan ', r.rounds())

                # get highest results
                # The accuracy line used to query 'val_dr', duplicating the
                # detection-rate line below; it now reads validation accuracy.
                print('\nHighest validation accuracy', r.high('val_acc'))
                print('\nHighest validation detection rate', r.high('val_dr'))
                print('\nHighest validation false alarm rate',
                      r.high('val_far'))

                # get the highest result for any metric
                print(r.high('val_dr'))

                # get the round with the best result
                print('Best round', r.rounds2high())

                # get the best parameters
                print(r.best_params())

                #r.plot_corr()
                #plt.show()

                # a four dimensional bar grid
                #r.plot_bars('batch_size', 'val_dr', 'hidden_layers', 'lr')
                #plt.show()

        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        """Keras metric: detection rate, TP / (TP + FN).

        Both tensors are clipped to [0, 1] and rounded before counting; the
        counts are sums over all elements, and ``K.epsilon()`` keeps the
        division defined when there are no positives.
        """
        predicted_pos = K.round(K.clip(y_pred, 0, 1))
        predicted_neg = 1 - predicted_pos
        actual_pos = K.round(K.clip(y_true, 0, 1))
        true_pos = K.sum(actual_pos * predicted_pos)
        false_neg = K.sum(actual_pos * predicted_neg)
        denominator = true_pos + false_neg + K.epsilon()
        return true_pos / denominator

    @staticmethod
    def far(y_true, y_pred):
        """Keras metric: false alarm rate, FP / (TN + FP).

        Both tensors are clipped to [0, 1] and rounded before counting; the
        counts are sums over all elements, and ``K.epsilon()`` keeps the
        division defined when there are no negatives.
        """
        predicted_pos = K.round(K.clip(y_pred, 0, 1))
        predicted_neg = 1 - predicted_pos
        actual_neg = 1 - K.round(K.clip(y_true, 0, 1))
        true_neg = K.sum(actual_neg * predicted_neg)
        false_pos = K.sum(actual_neg * predicted_pos)
        denominator = true_neg + false_pos + K.epsilon()
        return false_pos / denominator

    def get_model(self, x_train, y_train, x_val, y_val, params):
        """Build, compile and train one MLP for a single parameter combination.

        Passed to ``ta.Scan`` as its ``model`` argument.

        Args:
            x_train, y_train: training data handed to ``fit``.
            x_val, y_val: validation data handed to ``fit``.
            params: dict supplying 'first_neuron', 'hidden_layers',
                'hidden_neuron', 'dropout', 'activation', 'last_activation',
                'optimizer', 'lr', 'batch_size' and 'epochs'.

        Returns:
            The ``(history, model)`` pair: the object returned by ``fit`` and
            the trained model.
        """
        net = models.Sequential()

        # Input layer with dropout
        input_layer = layers.Dense(params['first_neuron'],
                                   activation=params['activation'],
                                   input_shape=(self.n_features_all, ))
        net.add(input_layer)
        net.add(layers.Dropout(params['dropout']))

        # Hidden layers with dropout
        for _ in range(params['hidden_layers']):
            hidden_layer = layers.Dense(params['hidden_neuron'],
                                        activation=params['activation'])
            net.add(hidden_layer)
            net.add(layers.Dropout(params['dropout']))

        # Output layer: five units, one per one-hot target column
        net.add(layers.Dense(5, activation=params['last_activation']))

        # Build model; lr_normalizer converts the scanned 'lr' value for the
        # chosen optimizer class.
        learning_rate = lr_normalizer(params['lr'], params['optimizer'])
        optimizer = params['optimizer'](lr=learning_rate)
        net.compile(optimizer,
                    loss='categorical_crossentropy',
                    metrics=['accuracy', self.dr, self.far])

        fit_history = net.fit(x_train,
                              y_train,
                              validation_data=(x_val, y_val),
                              batch_size=params['batch_size'],
                              epochs=params['epochs'],
                              verbose=0)

        return fit_history, net

    def load_data(self):
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split ``self.X`` / ``self.y`` 70/30 into the train and test attributes.

        Delegates to the module-level ``train_test_split`` function, seeded
        with ``self.random_state``.
        """
        parts = train_test_split(self.X,
                                 self.y,
                                 test_size=0.30,
                                 random_state=self.random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = parts

Пример #14

Показать файл

Файл: preptensorinput.py Проект: corticalstack/KDDCup1999

class Preptensorinputs:
    def __init__(self):
        """Build and write the 2D tensor input files at construction time.

        Loads the dataset, then runs each tensor builder
        (``Tensor2d_type_1``, ``Tensor2d_type_2``) through its preparation
        steps and writes the resulting ``X`` to
        ``<file>_<builder class name>`` via the file handler.

        Unless ``log_file`` detects an active trace function, stdout is
        redirected to a log file while this runs and restored afterwards,
        including when a step fails with an exception.
        """
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        # Whatever happens below, undo the stdout redirection and close the
        # log file so a failure does not leave stdout pointing at the file.
        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.X = None
            self.y = None
            self.full = None

            with timer('\nLoading dataset'):
                self.load_data()

            with timer('\nPreparing Tensor Input Files'):
                for t2d in (Tensor2d_type_1(), Tensor2d_type_2()):
                    builder_name = t2d.__class__.__name__
                    with timer('\nBuilding 2d tensor - ' + builder_name):
                        # The call order below is kept exactly as in the
                        # original pipeline.
                        t2d.set_X(self.full)
                        t2d.encode_categoricals()
                        t2d.set_y(self.full)
                        t2d.sample()
                        t2d.scale()
                        t2d.pca()
                        t2d.add_target()
                        # The same name is used for the output file and the
                        # shape message, so compute it once.
                        out_name = self.ds.config['file'] + '_' + builder_name
                        self.filehandler.write_csv(self.ds.config['path'],
                                                   out_name, t2d.X)
                        print('Shape of ' + out_name + ' : ' + str(t2d.X.shape))
        finally:
            self.log_file()
        print('Finished')

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()

Пример #15

Показать файл

class AnnMLPBinary:
    def __init__(self):
        """Train, evaluate and plot the binary MLP as a side effect of construction.

        Steps, in order: configure TensorFlow logging and create a
        graph/session; start stdout logging via ``log_file``; load the dataset
        and split it 70/30; train a fresh model per stratified k-fold split
        and collect the per-epoch metrics; train once more on the whole
        training set with the test set as validation data; then print metrics,
        draw the confusion matrix and save four metric-vs-epoch plots.
        """
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        self.g = tf.Graph()
        self.tf_sess = tf.Session(
            config=tf.ConfigProto(log_device_placement=True), graph=self.g)

        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.folder = 'viz'  # output folder used by fname() for the saved plots

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        # Class id (int or str form) -> display label, used by map_target_to_label().
        self.label_map_int_2_string = {
            0: 'good',
            1: 'bad',
            '0': 'good',
            '1': 'bad'
        }
        # Attack category -> binary target: 'normal' is 0, every attack is 1.
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 1,
            'r2l': 1,
            'probe': 1
        }

        # K-fold validation
        self.splits = 5
        self.kfold = StratifiedKFold(n_splits=self.splits,
                                     shuffle=True,
                                     random_state=self.random_state)

        # Network parameters
        self.epochs = 20
        self.batch_size = 100
        self.verbose = 0

        # Scores: one entry (a per-epoch list) is appended per fold
        self.metric_loss = []
        self.metric_acc = []
        self.metric_dr = []
        self.metric_far = []

        self.metric_val_loss = []
        self.metric_val_acc = []
        self.metric_val_dr = []
        self.metric_val_far = []

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features = self.X.shape[1]
            self.train_test_split()

        with timer('\nTraining & validating model with kfold'):
            # NOTE(review): as_default() returns a context manager; called
            # without `with` it presumably has no lasting effect - confirm.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            # Train model on K-1 and validate using remaining fold
            for train, val in self.kfold.split(self.X_train, self.y_train):
                #self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_cv')
                # A new, untrained model is built for every fold.
                self.model = self.get_model()

                self.history = self.model.fit(
                    self.X_train.iloc[train],
                    self.y_train.iloc[train],
                    validation_data=(self.X_train.iloc[val],
                                     self.y_train.iloc[val]),
                    epochs=self.epochs,
                    batch_size=self.batch_size,
                    verbose=self.verbose)
                #callbacks=[self.tensorboard])

                self.metric_loss.append(self.history.history['loss'])
                self.metric_acc.append(self.history.history['acc'])
                self.metric_dr.append(self.history.history['dr'])
                self.metric_far.append(self.history.history['far'])
                self.metric_val_loss.append(self.history.history['val_loss'])
                self.metric_val_acc.append(self.history.history['val_acc'])
                self.metric_val_dr.append(self.history.history['val_dr'])
                self.metric_val_far.append(self.history.history['val_far'])

            # np.mean without an axis averages over all folds and all epochs.
            print('\nTraining mean loss', np.mean(self.metric_loss))
            print('Training mean acc', np.mean(self.metric_acc))
            print('Training mean dr', np.mean(self.metric_dr))
            print('Training mean far', np.mean(self.metric_far))
            print('\nValidation mean loss', np.mean(self.metric_val_loss))
            print('Validation mean acc', np.mean(self.metric_val_acc))
            print('Validation mean dr', np.mean(self.metric_val_dr))
            print('Validation mean far', np.mean(self.metric_val_far))

        with timer('\nTesting model on unseen test set'):
            # NOTE(review): same bare as_default() call as above - confirm intent.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
            self.model = self.get_model()

            # Train model on complete train set and validate with unseen test set
            self.history = self.model.fit(self.X_train,
                                          self.y_train,
                                          validation_data=(self.X_test,
                                                           self.y_test),
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose,
                                          callbacks=[self.tensorboard])

        with timer('\nVisualising results'):
            # Plot model
            plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

            # Get single class prediction (rather than multi class probability summing to 1)
            y_pred = self.model.predict_classes(self.X_test)

            # NOTE(review): these read the 'loss'/'acc'/'dr'/'far' keys of the
            # final fit, not the 'val_*' keys computed on the test set, even
            # though they are labelled "Test" - confirm intent.
            print('Test loss', np.mean(self.history.history['loss']))
            print('Test acc', np.mean(self.history.history['acc']))
            print('Test dr', np.mean(self.history.history['dr']))
            print('Test far', np.mean(self.history.history['far']))

            # Remap to string class targets
            self.y_pred = self.map_target_to_label(y_pred)
            self.y_pred = self.y_pred.ravel()
            self.y_test = self.map_target_to_label(self.y_test)

            self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                            self.__class__.__name__)

            # 1-based epoch numbers for the x axis of every plot below.
            epochs = range(1, len(self.history.history['loss']) + 1)

            # Each plot below shows the fold-averaged training curve (green),
            # the fold-averaged validation curve (blue) and the curve of the
            # final fit (red, labelled 'Test'; see the note above on its keys).

            # Plot loss
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_loss, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_loss, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['loss'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Loss')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Loss', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot accuracy
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_acc, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_acc, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['acc'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__, 'Accuracy')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Accuracy', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot detection rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_dr, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_dr, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['dr'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'Detection Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('Detection Rate', fontsize=14)
            plt.legend(loc=4, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

            # Plot false alarm rate
            plt.clf()
            fig, ax = plt.subplots(figsize=(15, 8))
            plt.style.use('ggplot')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.tick_params(axis='both', which='major', labelsize=12)
            ax.plot(epochs,
                    np.mean(self.metric_far, axis=0),
                    'g',
                    label='Training')
            ax.plot(epochs,
                    np.mean(self.metric_val_far, axis=0),
                    'b',
                    label='Validation')
            ax.plot(epochs, self.history.history['far'], 'r', label='Test')
            self.title = '{} - {}'.format(self.__class__.__name__,
                                          'False Alarm Rate')
            plt.title(self.title, fontsize=18)
            plt.xlabel('Epochs', fontsize=14)
            plt.ylabel('False Alarm Rate', fontsize=14)
            plt.legend(loc=1, prop={'size': 14})
            plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
            plt.show()

        # Second log_file() call: restores stdout so 'Finished' is not written to the log.
        self.log_file()
        print('Finished')

    @staticmethod
    def dr(y_true, y_pred):
        """Keras metric: detection rate, TP / (TP + FN).

        Predictions and targets are clipped to [0, 1] and rounded to 0/1
        before the true positives and false negatives are summed;
        ``K.epsilon()`` guards the division.
        """
        pred_is_pos = K.round(K.clip(y_pred, 0, 1))
        pred_is_neg = 1 - pred_is_pos
        truth_is_pos = K.round(K.clip(y_true, 0, 1))
        hits = K.sum(truth_is_pos * pred_is_pos)
        misses = K.sum(truth_is_pos * pred_is_neg)
        return hits / (hits + misses + K.epsilon())

    @staticmethod
    def far(y_true, y_pred):
        """Keras metric: false alarm rate, FP / (TN + FP).

        Predictions and targets are clipped to [0, 1] and rounded to 0/1
        before the true negatives and false positives are summed;
        ``K.epsilon()`` guards the division.
        """
        pred_is_pos = K.round(K.clip(y_pred, 0, 1))
        pred_is_neg = 1 - pred_is_pos
        truth_is_neg = 1 - K.round(K.clip(y_true, 0, 1))
        correct_rejections = K.sum(truth_is_neg * pred_is_neg)
        false_alarms = K.sum(truth_is_neg * pred_is_pos)
        return false_alarms / (correct_rejections + false_alarms + K.epsilon())

    def get_model(self):
        """Build and compile the binary classifier network.

        Architecture: four Dense(25, relu) layers, each followed by
        Dropout(0.08), then a single sigmoid output unit. Compiled with
        RMSprop (lr=0.0023), binary cross-entropy, and the accuracy,
        ``dr`` and ``far`` metrics.

        Returns:
            The compiled, untrained model.
        """
        width = 25
        drop_rate = 0.08

        net = models.Sequential()
        # The first layer also declares the input width.
        net.add(layers.Dense(width,
                             activation='relu',
                             input_shape=(self.n_features, )))
        net.add(layers.Dropout(drop_rate))
        # Three more identical Dense + Dropout pairs.
        for _ in range(3):
            net.add(layers.Dense(width, activation='relu'))
            net.add(layers.Dropout(drop_rate))
        net.add(layers.Dense(1, activation='sigmoid'))

        net.compile(optimizer=optimizers.RMSprop(lr=0.0023),
                    loss='binary_crossentropy',
                    metrics=['accuracy', self.dr, self.far])
        return net

    def log_file(self):
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        self.X = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_2')
        print('\tRow count:\t', '{}'.format(self.X.shape[0]))
        print('\tColumn count:\t', '{}'.format(self.X.shape[1]))

    def set_y(self):
        self.y = self.X['attack_category']
        self.y = self.y.map(self.label_map_string_2_int)

    def remove_target_from_X(self):
        self.X.drop('attack_category', axis=1, inplace=True)

    def train_test_split(self):
        """Split ``self.X`` / ``self.y`` into 70% train and 30% test attributes.

        Delegates to the module-level ``train_test_split`` function, seeded
        with ``self.random_state``.
        """
        split = train_test_split(self.X,
                                 self.y,
                                 test_size=0.30,
                                 random_state=self.random_state)
        (self.X_train, self.X_test, self.y_train, self.y_test) = split

    def map_target_to_label(self, t):
        return np.vectorize(self.label_map_int_2_string.get)(t)

    def fname(self, title):
        return '{}/{}.png'.format(self.folder, title)

Пример #16

Показать файл

class Modelling:
    """Train and score a set of classifiers on the ``_Tensor2d_type_1`` dataset.

    The whole experiment runs from ``__init__``: the CSV is loaded, the feature
    matrix is separated from ``attack_category``, and every model is scored on
    the binary and/or multi-class target it declares support for (its
    ``binary_enabled`` / ``multi_enabled`` flags). For each run a confusion
    matrix is drawn and accuracy plus a classification report are printed.
    """

    def __init__(self):
        """Load the data, then train, score and report each model/target pair."""
        os.environ[
            'TF_CPP_MIN_LOG_LEVEL'] = '2'  # Ignore low level instruction warnings
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity

        # NOTE: stdout redirection is switched off in this class. log_file()
        # reads the four attributes below, so it must not be called unless
        # their initialisation is restored.
        # self.logfile = None
        # self.gettrace = getattr(sys, 'gettrace', None)
        # self.original_stdout = sys.stdout
        # self.timestr = time.strftime("%Y%m%d-%H%M%S")
        # self.log_file()

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.random_state = 20
        # Both int and str keys are present so a prediction maps to its label
        # whichever of the two representations it arrives in.
        self.label_multi = {
            0: 'normal',
            '0': 'normal',
            1: 'dos',
            '1': 'dos',
            2: 'u2r',
            '2': 'u2r',
            3: 'r2l',
            '3': 'r2l',
            4: 'probe',
            '4': 'probe'
        }
        self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSetting X and y'):
            self.set_X()
            self.n_features = self.X.shape[1]

        models = (RandomForestClf(), AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features), AnnMLPMulti(self.n_features))
        classification_type = ('Binary', 'Multi')

        for m, ctype in itertools.product(models, classification_type):
            # Only score a model on the target types it declares support for.
            score = False
            if ctype == 'Binary' and m.binary_enabled:
                self.set_y_binary()
                score = True
            elif ctype == 'Multi' and m.multi_enabled:
                self.set_y_multi()
                score = True

            if not score:
                continue

            with timer('\nTraining and scoring {} - {} target'.format(
                    m.__class__.__name__, ctype)):
                m.base['model'] = m.get_model()
                #self.train_test_split()
                # NOTE(review): m.score is presumably what fills
                # m.y_test[ctype] / m.y_pred[ctype] read below - confirm
                # against the model classes.
                m.score(self.X, self.y, ctype)

            m.y_test[ctype] = pd.Series(m.y_test[ctype])
            m.y_pred[ctype] = pd.Series(m.y_pred[ctype])
            m.y_test[ctype] = m.y_test[ctype].astype(int)
            m.y_pred[ctype] = m.y_pred[ctype].astype(int)

            # Convert the integer codes back to readable labels for reporting.
            if ctype == 'Binary':
                m.y_test[ctype] = self.series_map_ac_binary_to_label(
                    m.y_test[ctype])
                m.y_pred[ctype] = self.series_map_ac_binary_to_label(
                    m.y_pred[ctype])
            else:
                m.y_test[ctype] = self.series_map_ac_multi_to_label(
                    m.y_test[ctype])
                m.y_pred[ctype] = self.series_map_ac_multi_to_label(
                    m.y_pred[ctype])

            title = '{} - {} - {} '.format('CM', m.__class__.__name__, ctype)
            self.visualize.confusion_matrix(m.y_test[ctype], m.y_pred[ctype],
                                            title)
            self.scores(m.y_test[ctype], m.y_pred[ctype])

        # TODO: Append the scores to a scores array so that np.mean(scores)
        # gives the average over all the k-folds; save the epoch number and
        # fold number as well, if possible, to get a per-epoch score.

        # self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of ``sys.stdout`` to a per-run log file.

        NOTE: this reads ``self.gettrace``, ``self.logfile``,
        ``self.original_stdout`` and ``self.timestr``; ``__init__`` currently
        leaves their initialisation commented out, so calling this method
        as-is raises ``AttributeError``.
        """
        if self.gettrace is None:
            pass
        elif self.gettrace():
            pass
        else:
            if self.logfile:
                sys.stdout = self.original_stdout
                self.logfile.close()
                self.logfile = False
            else:
                # Redirect stdout to file for logging if not in debug mode
                self.logfile = open(
                    'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                                   self.timestr), 'w')
                sys.stdout = self.logfile

    def load_data(self):
        """Read the ``_Tensor2d_type_1`` CSV into ``self.full``."""
        self.full = self.filehandler.read_csv(
            self.ds.config['path'],
            self.ds.config['file'] + '_Tensor2d_type_1')

    def set_X(self):
        """Set ``self.X`` to every column of ``self.full`` except ``attack_category``."""
        self.X = self.full.loc[:, self.full.columns != 'attack_category']

    def set_y_binary(self):
        """Set ``self.y`` to a flat array of 0 (normal) / 1 (attack) codes."""
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_binary()
        self.y = self.y.values.ravel()

    def set_y_multi(self):
        """Set ``self.y`` to a flat array of string class codes ``'0'``..``'4'``."""
        self.y = self.full.loc[:, ['attack_category']]
        self.df_map_ac_label_to_multi()
        self.y = self.y.values.ravel()

    def train_test_split(self):
        """Split ``self.X`` / ``self.y`` 70/30 using ``self.random_state``."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.30, random_state=self.random_state)

    def df_map_ac_label_to_binary(self):
        """Recode ``self.y['attack_category']`` in place: normal -> 0, attacks -> 1.

        Labels matching none of the five known categories fall back to
        ``np.select``'s default of 0.
        """
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos') |
                      (self.y['attack_category'] == 'u2r') |
                      (self.y['attack_category'] == 'r2l') |
                      (self.y['attack_category'] == 'probe')]
        self.y['attack_category'] = np.select(conditions, [0, 1])

    def df_map_ac_label_to_multi(self):
        """Recode ``self.y['attack_category']`` in place to ``'0'``..``'4'``.

        The order is normal, dos, u2r, r2l, probe. Labels matching none of
        them become ``'0'``.
        """
        conditions = [(self.y['attack_category'] == 'normal'),
                      (self.y['attack_category'] == 'dos'),
                      (self.y['attack_category'] == 'u2r'),
                      (self.y['attack_category'] == 'r2l'),
                      (self.y['attack_category'] == 'probe')]
        # The default must be a string as well: np.select's implicit integer
        # default 0 cannot be promoted to a common dtype with string choices
        # on recent NumPy releases and raises TypeError. '0' is the value the
        # integer default used to be cast to, so the results are unchanged.
        self.y['attack_category'] = np.select(
            conditions,
            ['0', '1', '2', '3', '4'],  # string for get_dummies encoding
            default='0')

    def series_map_ac_multi_to_label(self, s):
        """Map multi-class codes in Series ``s`` to labels via ``self.label_multi``."""
        return s.map(self.label_multi)

    def series_map_ac_binary_to_label(self, s):
        """Map binary codes in Series ``s`` to labels via ``self.label_binary``."""
        return s.map(self.label_binary)

    def scores(self, y_test, y_pred):
        """Print the accuracy and the classification report for the predictions."""
        print('Accuracy {}'.format(accuracy_score(y_test, y_pred)))
        print('F1 {}'.format(classification_report(y_test, y_pred, digits=10)))

Пример #17

Показать файл

class Preprocessor:
    """Load the raw dataset and write up to three prepared variants of it.

    Construction runs the whole pipeline (see ``preprocess``). The raw frame is
    split by column position: the first ``_NUMERICAL_FEATURE_COUNT`` columns are
    treated as numerical features, the remainder as categorical ones.
    """

    # Column position at which the categorical features start in the raw frame.
    _NUMERICAL_FEATURE_COUNT = 190

    def __init__(self, envparm):
        """Initialise state and immediately run ``preprocess(envparm)``.

        ``envparm`` is indexed with the keys ``'PlotGraphs'``,
        ``'ProcessDS01'``, ``'ProcessDS02'`` and ``'ProcessDS03'``.
        """
        self.dataset_raw = None
        self.numerical_features_raw = None
        self.categorical_features_raw = None
        self.filehandler = None
        self.preprocess(envparm)

    @staticmethod
    def _drop_features(dataset, should_drop, message, threshold):
        """Drop, in place, each column whose Series satisfies ``should_drop``.

        The columns are selected first and removed in a single ``drop`` call,
        so the frame is never modified while it is being iterated. ``message``
        is a format string that receives ``threshold`` and the space-separated
        names of the dropped columns; it is logged at INFO level. Returns
        ``dataset`` itself.
        """
        doomed = [col for col in dataset if should_drop(dataset[col])]
        dataset.drop(columns=doomed, inplace=True)
        logging.info(message.format(
            threshold, ''.join(str(col) + ' ' for col in doomed)))
        return dataset

    @staticmethod
    def drop_features_min_unique(dataset, min_threshold):
        """Drop columns with at most ``min_threshold`` distinct values (in place).

        Returns the same ``dataset`` object.
        """
        return Preprocessor._drop_features(
            dataset,
            lambda column: len(column.unique()) <= min_threshold,
            'Features dropped with unique value count <= {} - {}',
            min_threshold)

    @staticmethod
    def drop_features_max_unique(dataset, max_threshold):
        """Drop columns with at least ``max_threshold`` distinct values (in place).

        Returns the same ``dataset`` object.
        """
        return Preprocessor._drop_features(
            dataset,
            lambda column: len(column.unique()) >= max_threshold,
            'Features dropped with unique value count >= {} - {}',
            max_threshold)

    @staticmethod
    def drop_features_max_null(dataset, max_threshold):
        """Drop columns with at least ``max_threshold`` null values (in place).

        Returns the same ``dataset`` object.
        """
        return Preprocessor._drop_features(
            dataset,
            # Series.sum() counts the nulls in native code instead of
            # iterating the boolean Series with the built-in sum().
            lambda column: column.isnull().sum() >= max_threshold,
            'Features dropped with null value count >= {} - {}',
            max_threshold)

    @staticmethod
    def plot_num_obs_missing_values(dataset):
        """Show a count plot of features bucketed by their number of null values."""
        df = pd.DataFrame(data=dataset.isnull().sum(), columns=['Count'])
        df['bin'] = pd.cut(df['Count'], [
            -1, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000,
            4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000
        ],
                           labels=[
                               '0-10', '10-20', '20-30', '30-40', '40-50',
                               '50-100', '100-200', '200-300', '300-400',
                               '400-500', '500-1K', '1K-2K', '2K-3K', '3K-4K',
                               '4K-5K', '5K-6K', '6K-7K', '7K-8K', '8K-9K',
                               '9K-10K', '10K-50K'
                           ])
        countplot = sns.countplot(y="bin", data=df)
        countplot.set(ylabel="Observations With Null Values",
                      xlabel="Feature Count",
                      title="Observations With Null Values Per Feature")
        plt.show()

    @staticmethod
    def impute_numeric_feature_with_zero(dataset):
        """Replace every null in ``dataset`` with 0, in place."""
        dataset.fillna(0, inplace=True)

    @staticmethod
    def impute_categorical_feature_with_blank(dataset):
        """Replace every null in ``dataset`` with the empty string, in place."""
        dataset.fillna('', inplace=True)

    def _validated_feature_copies(self):
        """Return copies of the raw feature frames without single-valued columns.

        Returns a ``(numerical, categorical)`` pair; the raw frames stored on
        the instance are left untouched.
        """
        numerical_features = self.numerical_features_raw.copy()

        logging.info('Validating numerical features')
        numerical_features = self.drop_features_min_unique(
            numerical_features, 1)

        logging.info('Validating categorical features')
        categorical_features = self.categorical_features_raw.copy()
        categorical_features = self.drop_features_min_unique(
            categorical_features, 1)
        return numerical_features, categorical_features

    @staticmethod
    def _label_encode(categorical_features):
        """Return a frame with every column label-encoded independently."""
        # Random Forest needs the categorical features encoding otherwise string to float error
        logging.info('Label encoding categorical features')
        return categorical_features.apply(LabelEncoder().fit_transform)

    def _write_variant(self, path, numerical_features, categorical_features):
        """Pass both frames to ``output_prep_dataset`` and log the result's shape."""
        dataset_output = self.filehandler.output_prep_dataset(
            path, numerical_features, categorical_features)
        logging.info('Dataset size after feature transformation - {}'.format(
            dataset_output.shape))

    def prepare_output_variant_01(self):
        """Write variant 01: mean-imputed numericals, label-encoded categoricals."""
        logging.info('Preparing output variant 01')
        numerical_features, categorical_features = (
            self._validated_feature_copies())

        logging.info('Imputing numerical features with mean')
        numerical_features.fillna(numerical_features.mean(), inplace=True)

        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)

        encoded_categorical = self._label_encode(categorical_features)
        self._write_variant(self.filehandler.dataset_prep_path_01,
                            numerical_features, encoded_categorical)
        logging.info('Completed Preparing output variant 01')

    def prepare_output_variant_02(self):
        """Write variant 02: zero-imputed numericals, label-encoded categoricals."""
        logging.info('Preparing output variant 02')
        numerical_features, categorical_features = (
            self._validated_feature_copies())

        logging.info('Imputing numerical features with zero')
        numerical_features.fillna(0, inplace=True)

        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)

        encoded_categorical = self._label_encode(categorical_features)
        self._write_variant(self.filehandler.dataset_prep_path_02,
                            numerical_features, encoded_categorical)
        logging.info('Completed Preparing output variant 02')

    def prepare_output_variant_03(self):
        """Write variant 03: sentinel-imputed numericals, one-hot categoricals.

        Categorical columns with 11 or more distinct values are dropped before
        encoding; numerical nulls are replaced with the sentinel -9876.
        """
        logging.info('Preparing output variant 03')
        numerical_features, categorical_features = (
            self._validated_feature_copies())
        categorical_features = self.drop_features_max_unique(
            categorical_features, 11)

        logging.info('Imputing numerical features with -9876')
        numerical_features.fillna(-9876, inplace=True)

        logging.info('Imputing categorical features with "missing"')
        categorical_features.fillna('missing', inplace=True)

        encoded_categorical = self._label_encode(categorical_features)

        # One hot encoding results in memory error due to wide dataset
        onehotencoder = OneHotEncoder()
        onehot_array = onehotencoder.fit_transform(
            encoded_categorical).toarray()
        onehot_df = pd.DataFrame(onehot_array)

        self._write_variant(self.filehandler.dataset_prep_path_03,
                            numerical_features, onehot_df)
        logging.info('Completed Preparing output variant 03')

    def preprocess(self, envparm):
        """Load the raw data, optionally plot nullity charts, and write variants.

        ``envparm['PlotGraphs']`` enables the missing-value plots; each of
        ``envparm['ProcessDS01']`` .. ``envparm['ProcessDS03']`` enables the
        matching ``prepare_output_variant_0N`` step.
        """
        split_at = self._NUMERICAL_FEATURE_COUNT

        self.filehandler = Filehandler()
        self.dataset_raw = self.filehandler.read_csv(
            self.filehandler.data_raw_path)
        logging.info('Original raw dataset loaded - dataset size {}'.format(
            self.dataset_raw.shape))

        logging.info('Partitioning numerical features')
        self.numerical_features_raw = self.dataset_raw.iloc[:, 0:split_at].copy()

        logging.info('Partitioning categorical features')
        self.categorical_features_raw = self.dataset_raw.iloc[:, split_at:].copy()

        if envparm['PlotGraphs']:
            # Only a leading sample of the numerical columns is plotted.
            num_size = 28
            sample_df = self.dataset_raw.iloc[:, :num_size].copy()
            visualizer.matrix_missing(
                sample_df,
                'Data Completion First ' + str(num_size) + ' Numeric Features')
            visualizer.bar_missing(
                sample_df,
                'Nullity Count First ' + str(num_size) + ' Numeric Features')
            visualizer.heat_missing(
                sample_df, 'Nullity Correlation Of First ' + str(num_size) +
                ' Numeric Features')

            sample_df = self.dataset_raw.iloc[:, split_at:].copy()
            visualizer.matrix_missing(sample_df,
                                      'Data Completion Categorical Features')
            visualizer.bar_missing(sample_df,
                                   'Nullity Count Categorical Features')
            visualizer.heat_missing(
                sample_df, 'Nullity Correlation Of Categorical Features')

        if envparm['ProcessDS01']:
            self.prepare_output_variant_01()

        if envparm['ProcessDS02']:
            self.prepare_output_variant_02()

        if envparm['ProcessDS03']:
            self.prepare_output_variant_03()

Пример #18

Показать файл

Файл: featureselection.py Проект: corticalstack/KDDCup1999

class FeatureSelection:
    """Compare feature selectors on the processed dataset, scoring with XGBoost.

    The whole experiment runs from ``__init__``: the processed features and
    targets are loaded and joined, categoricals are label-encoded, the columns
    in ``self.scale_cols`` are min-max scaled, and every selector is fitted
    against both the ``attack_category`` and ``target`` labels. The features
    each selector returns are scored with a cross-validated XGBoost classifier.
    """

    def __init__(self):
        """Run the full feature-selection experiment.

        Unless a trace function is active, stdout is redirected to a log file
        for the duration of the run and restored afterwards, including when a
        step raises.
        """
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.visualize = Visualize()
            self.ds = KDDCup1999()
            self.X = None
            self.y = None
            self.full = None
            self.random_state = 20
            self.num_features = 15
            self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                               'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                               'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                               'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                               'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                               'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']

            with timer('\nLoading dataset'):
                self.load_data()
                self.encode_scale()
                self.set_X()
            with timer('\nFeature selection'):
                for selector in (Original(),
                                 UnivariateSelector(),
                                 RecursiveSelector(),
                                 PCASelector(),
                                 #KernelPCASelector(),
                                 ExtraTreesSelector(),
                                 RandomForestSelector()):
                    for label in ('attack_category', 'target'):
                        self.set_y(label)
                        with timer('\nFitting selector ' + selector.__class__.__name__):
                            selector.fit_model(self.X, self.y)
                            x = selector.get_top_features(self.X, label)
                        with timer('\nXGBoost scoring of features selected by ' + selector.__class__.__name__):
                            self.score_with_xgboost(x, self.y, selector.title)
        finally:
            # Restore stdout and close the log even when a step above raises;
            # otherwise stdout stays pointed at a file that is never closed.
            if self.logfile:
                self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of ``sys.stdout`` to a per-run log file.

        Does nothing when ``self.gettrace`` is ``None`` or returns a truthy
        value (treated as debug mode). Otherwise the first call opens
        ``logs/<ClassName>_<timestr>_stdout.txt`` and installs it as stdout;
        the next call restores the original stdout and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
        else:
            # Redirect stdout to file for logging if not in debug mode
            self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
            sys.stdout = self.logfile

    def load_data(self):
        """Load the ``_processed`` and ``_target`` CSVs and join them column-wise.

        The two frames are stored on ``self.ds`` and their concatenation on
        ``self.full``.
        """
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        print(self.ds.dataset.columns)
        self.ds.row_count_by_target('attack_category')

    def encode_scale(self):
        """Label-encode the three categorical columns and min-max scale ``scale_cols``."""
        # Encode categoricals; the encoder is refitted for every column
        le = preprocessing.LabelEncoder()
        self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
        self.full['service'] = le.fit_transform(self.full['service'])
        self.full['flag'] = le.fit_transform(self.full['flag'])

        # Scale
        sc = MinMaxScaler()
        self.full[self.scale_cols] = sc.fit_transform(self.full[self.scale_cols])

    def set_X(self):
        """Set ``self.X`` to all columns of ``self.full`` except the last two.

        NOTE(review): the last two columns are presumably the target columns
        appended by ``load_data`` - confirm against the ``_target`` file.
        """
        self.X = self.full.iloc[:, :-2]

    def set_y(self, label):
        """Set ``self.y`` to the ``label`` column of ``self.full``."""
        self.y = self.full[label]

    def score_with_xgboost(self, x, y, title):
        """Cross-validate XGBoost on ``x``/``y``; print accuracy, plot a confusion matrix.

        ``title`` is passed through as the title of the confusion matrix.
        """
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        # The folds are not shuffled, so a random_state would have no effect;
        # current scikit-learn raises ValueError when one is given without
        # shuffle=True. Leaving it out keeps the original, unshuffled splits.
        kfold = StratifiedKFold(n_splits=10)
        results = cross_val_score(clf, x, y, cv=kfold)
        print("XGBoost Accuracy: %.2f%% (+/- %.2f%%)" % (results.mean() * 100, results.std() * 100))
        # Use the same splitter for the predictions as for the scores.
        y_pred = cross_val_predict(clf, x, y, cv=kfold)
        self.visualize.confusion_matrix(y, y_pred, title)

Пример #19

Показать файл

Файл: sampling.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Run the resampling experiment: load, resample, scale and score.

        For every sampler the selected feature columns plus ``target`` are
        one-hot encoded, resampled against ``attack_category``, quantile
        transformed and scored twice (multi-class ``attack_category`` and
        binary ``target``). ``log_file``, ``load_data``, ``sample`` and
        ``model_and_score`` are methods of the class that are not part of this
        snippet.

        Unless a trace function is active, stdout is redirected to a log file
        for the duration of the run and restored afterwards, including when a
        step raises.
        """
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.visualize = Visualize()
            self.random_state = 20
            self.X = None
            self.y = None
            self.full = None

            # RF Feature selected plus sparse cols
            self.cols = [
                'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count',
                'flag', 'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate',
                'service', 'dst_host_count', 'dst_host_srv_diff_host_rate',
                'logged_in', 'protocol_type', 'dst_host_same_src_port_rate',
                'hot', 'srv_count', 'wrong_fragment', 'num_compromised',
                'rerror_rate', 'srv_diff_host_rate', 'urgent',
                'num_failed_logins', 'root_shell', 'su_attempted', 'num_root',
                'num_file_creations', 'num_shells', 'num_access_files',
                'is_guest_login'
            ]

            with timer('\nLoading dataset'):
                self.load_data()

            with timer('\nScaling'):
                # Sampling options
                for sampler in (Original(), RandomOverSampler(),
                                SMOTE(random_state=self.random_state),
                                ADASYN(random_state=self.random_state),
                                BorderlineSMOTE(random_state=self.random_state,
                                                kind='borderline-1')):

                    # X is rebuilt for every sampler because 'target' is
                    # dropped in place from the resampled frame further down.
                    self.X = self.full.loc[:, self.cols]
                    self.X['target'] = self.full['target']
                    print('X shape with selected features and binary - ',
                          self.X.shape)

                    self.X = pd.get_dummies(
                        data=self.X,
                        columns=['protocol_type', 'service', 'flag'])
                    print('X shape after encoding categoricals - ',
                          self.X.shape)

                    # Re-sample based on attack_category labels
                    res_x, res_y_attack_category, title = self.sample(
                        sampler, self.X, self.full['attack_category'])

                    # Grab target as y from resampled x set
                    res_y_target = res_x['target']
                    res_x.drop(columns=['target'], inplace=True)
                    print('X shape after sampling and removing target - ',
                          res_x.shape)
                    print('y shape with attack_category after resample - ',
                          res_y_attack_category.shape)
                    print(res_y_attack_category.value_counts())
                    res_y_attack_category.value_counts().plot(
                        kind='bar',
                        title=title + ' - Resampled Count (attack_category)')
                    plt.show()
                    print('y shape with target after resample - ',
                          res_y_target.shape)

                    # Scale after resampling
                    qt = QuantileTransformer(output_distribution='normal')
                    res_x = qt.fit_transform(res_x)
                    print('X shape after scaling - ', res_x.shape)

                    # Score on attack_category multi-class
                    self.model_and_score(res_x, res_y_attack_category, title,
                                         'attack_category')

                    # Score on binary target
                    self.model_and_score(res_x, res_y_target, title, 'target')
        finally:
            # Restore stdout and close the log even when a step above raises;
            # otherwise stdout stays pointed at a file that is never closed.
            if self.logfile:
                self.log_file()
        print('Finished')

Пример #20

Показать файл

Файл: sampling.py Проект: corticalstack/KDDCup1999

class Sampling:
    """Compare resampling strategies on the processed dataset using XGBoost.

    The whole experiment runs from ``__init__``: for each sampler the selected
    features are one-hot encoded, resampled against ``attack_category``,
    quantile transformed and scored with a cross-validated XGBoost classifier
    on both the multi-class and the binary target.
    """

    def __init__(self):
        """Run the full resampling experiment.

        Unless a trace function is active, stdout is redirected to a log file
        for the duration of the run and restored afterwards, including when a
        step raises.
        """
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.visualize = Visualize()
            self.random_state = 20
            self.X = None
            self.y = None
            self.full = None

            # RF Feature selected plus sparse cols
            self.cols = [
                'count', 'diff_srv_rate', 'src_bytes', 'dst_host_srv_count',
                'flag', 'dst_bytes', 'serror_rate', 'dst_host_diff_srv_rate',
                'service', 'dst_host_count', 'dst_host_srv_diff_host_rate',
                'logged_in', 'protocol_type', 'dst_host_same_src_port_rate',
                'hot', 'srv_count', 'wrong_fragment', 'num_compromised',
                'rerror_rate', 'srv_diff_host_rate', 'urgent',
                'num_failed_logins', 'root_shell', 'su_attempted', 'num_root',
                'num_file_creations', 'num_shells', 'num_access_files',
                'is_guest_login'
            ]

            with timer('\nLoading dataset'):
                self.load_data()

            with timer('\nScaling'):
                # Sampling options
                for sampler in (Original(), RandomOverSampler(),
                                SMOTE(random_state=self.random_state),
                                ADASYN(random_state=self.random_state),
                                BorderlineSMOTE(random_state=self.random_state,
                                                kind='borderline-1')):

                    # X is rebuilt for every sampler because 'target' is
                    # dropped in place from the resampled frame further down.
                    self.X = self.full.loc[:, self.cols]
                    self.X['target'] = self.full['target']
                    print('X shape with selected features and binary - ',
                          self.X.shape)

                    self.X = pd.get_dummies(
                        data=self.X,
                        columns=['protocol_type', 'service', 'flag'])
                    print('X shape after encoding categoricals - ',
                          self.X.shape)

                    # Re-sample based on attack_category labels
                    res_x, res_y_attack_category, title = self.sample(
                        sampler, self.X, self.full['attack_category'])

                    # Grab target as y from resampled x set
                    res_y_target = res_x['target']
                    res_x.drop(columns=['target'], inplace=True)
                    print('X shape after sampling and removing target - ',
                          res_x.shape)
                    print('y shape with attack_category after resample - ',
                          res_y_attack_category.shape)
                    print(res_y_attack_category.value_counts())
                    res_y_attack_category.value_counts().plot(
                        kind='bar',
                        title=title + ' - Resampled Count (attack_category)')
                    plt.show()
                    print('y shape with target after resample - ',
                          res_y_target.shape)

                    # Scale after resampling
                    qt = QuantileTransformer(output_distribution='normal')
                    res_x = qt.fit_transform(res_x)
                    print('X shape after scaling - ', res_x.shape)

                    # Score on attack_category multi-class
                    self.model_and_score(res_x, res_y_attack_category, title,
                                         'attack_category')

                    # Score on binary target
                    self.model_and_score(res_x, res_y_target, title, 'target')
        finally:
            # Restore stdout and close the log even when a step above raises;
            # otherwise stdout stays pointed at a file that is never closed.
            if self.logfile:
                self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of ``sys.stdout`` to a per-run log file.

        Does nothing when ``self.gettrace`` is ``None`` or returns a truthy
        value (treated as debug mode). Otherwise the first call opens
        ``logs/<ClassName>_<timestr>_stdout.txt`` and installs it as stdout;
        the next call restores the original stdout and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
        else:
            # Redirect stdout to file for logging if not in debug mode
            self.logfile = open(
                'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                               self.timestr), 'w')
            sys.stdout = self.logfile

    def load_data(self):
        """Load the ``_processed`` and ``_target`` CSVs and join them column-wise.

        The two frames are stored on ``self.ds`` and their concatenation on
        ``self.full``.
        """
        self.ds.dataset = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(
            self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_y(self, label):
        """Set ``self.y`` to the ``label`` column of ``self.full``."""
        self.y = self.full[label]

    def sample(self, sampler, X, y):
        """Resample ``X``/``y`` with ``sampler`` and return pandas objects.

        Returns ``(res_x, res_y, title)`` where ``title`` is the sampler's
        class name. When the sampler returns NumPy arrays, they are wrapped
        into a DataFrame carrying ``X``'s column names and a Series.
        """
        title = sampler.__class__.__name__
        res_x, res_y = sampler.fit_resample(X, y)
        if isinstance(res_x, np.ndarray):
            res_x = pd.DataFrame(res_x, columns=X.columns)

        if isinstance(res_y, np.ndarray):
            res_y = pd.Series(res_y)

        print('Shape after sampling with {} - x {},  y {}'.format(
            title, res_x.shape, res_y.shape))
        return res_x, res_y, title

    def model_and_score(self, X, y, title, label):
        """Cross-validate XGBoost on ``X``/``y``; print accuracy, plot a confusion matrix.

        ``title`` (the sampler name) and ``label`` (the target column name)
        are only used in the printed line and in the plot title.
        """
        clf = XGBClassifier(n_estimators=50, random_state=self.random_state)
        # The folds are not shuffled, so a random_state would have no effect;
        # current scikit-learn raises ValueError when one is given without
        # shuffle=True. Leaving it out keeps the original, unshuffled splits.
        kfold = StratifiedKFold(n_splits=5)
        results = cross_val_score(clf, X, y, cv=kfold)
        # Use the same splitter for the predictions as for the scores.
        y_pred = cross_val_predict(clf, X, y, cv=kfold)
        # The message previously ended after the deviation, without '%)'.
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
            title, label,
            results.mean() * 100,
            results.std() * 100))
        self.visualize.confusion_matrix(
            y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))

Пример #21

Показать файл

Файл: scaling.py Проект: corticalstack/KDDCup1999

class Scaling:
    """Compare feature scalers on the KDDCup1999 dataset.

    Constructing the class runs the whole experiment: the processed dataset is
    loaded, the categorical columns are label-encoded, each scaler is applied
    to ``scale_cols`` and an XGBoost classifier is cross-validated on the
    scaled features for both the ``attack_category`` and ``target`` labels.
    """

    def __init__(self):
        """Run the scaling experiment end to end (see the class docstring)."""
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.visualize = Visualize()
            self.random_state = 20
            self.X = None
            self.y = None
            self.full = None
            self.ac_count = {}
            self.scores = OrderedDict()
            self.scale_cols = [
                'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                'urgent', 'hot', 'num_failed_logins', 'logged_in',
                'num_compromised', 'root_shell', 'su_attempted', 'num_root',
                'num_file_creations', 'num_shells', 'num_access_files',
                'is_guest_login', 'count', 'srv_count', 'serror_rate',
                'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                'dst_host_count', 'dst_host_srv_count',
                'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                'dst_host_srv_diff_host_rate',
            ]

            with timer('\nLoading dataset'):
                self.load_data()
                self.set_attack_category_count()
            with timer('\nEncoding categoricals'):
                le = preprocessing.LabelEncoder()
                for column in ('protocol_type', 'service', 'flag'):
                    self.full[column] = le.fit_transform(self.full[column])
            with timer('\nSetting X'):
                self.set_X()
                self.ds.shape()
            with timer('\nDistribution Before Scaling'):
                self.dist_before_scaling()
            with timer('\nScaling'):
                scalers = (
                    StandardScaler(),
                    Normalizer(),
                    MinMaxScaler(feature_range=(0, 1)),
                    Binarizer(threshold=0.0),
                    RobustScaler(quantile_range=(25, 75)),
                    PowerTransformer(method='yeo-johnson'),
                    QuantileTransformer(output_distribution='normal'),
                )
                for scaler in scalers:
                    title, res_x = self.scale(scaler)
                    for label in ('attack_category', 'target'):
                        self.set_y(label)
                        self.model_and_score(scaler, res_x, title, label)
        finally:
            # Undo the stdout redirection and close the log file even when a
            # step above raises, so later output is not swallowed by the log.
            self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under ``logs/``.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (i.e. when running under a debugger). Otherwise the
        first call opens the log file and points ``sys.stdout`` at it; the next
        call restores the original stdout and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
        else:
            # Redirect stdout to file for logging if not in debug mode
            self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
            sys.stdout = self.logfile

    def load_data(self):
        """Read the processed features and targets and join them in ``self.full``."""
        path = self.ds.config['path']
        stem = self.ds.config['file']
        self.ds.dataset = self.filehandler.read_csv(path, stem + '_processed')
        self.ds.target = self.filehandler.read_csv(path, stem + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)
        self.ds.shape()
        self.ds.row_count_by_target('attack_category')

    def set_attack_category_count(self):
        """Record the number of rows per ``attack_category`` in ``self.ac_count``."""
        counts = self.full['attack_category'].value_counts()
        for category, count in counts.items():
            self.ac_count[category] = count

    def set_X(self):
        """Restrict the feature matrix ``self.X`` to the columns in ``scale_cols``."""
        self.X = self.full.loc[:, self.scale_cols]

    def set_y(self, label):
        """Use column ``label`` of ``self.full`` as the current target ``self.y``."""
        self.y = self.full[label]

    def dist_before_scaling(self):
        """Plot the distribution of the unscaled features."""
        self.visualize.kdeplot('Distribution Before Scaling', self.X, self.scale_cols)

    def scale(self, scaler):
        """Apply ``scaler`` to the features and plot the resulting distribution.

        Args:
            scaler: object exposing ``fit_transform``.

        Returns:
            Tuple ``(title, res_x)``: the plot title and the scaled features as
            a DataFrame with the ``scale_cols`` column names.
        """
        x = self.X[self.scale_cols]

        # Warnings raised while fitting the scaler are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res_x = scaler.fit_transform(x)

        res_x = pd.DataFrame(res_x, columns=self.scale_cols)
        title = 'Distribution After ' + scaler.__class__.__name__
        self.visualize.kdeplot(title, res_x, self.scale_cols)
        return title, res_x

    def model_and_score(self, scaler, res_x, title, label):
        """Cross-validate XGBoost on the scaled features against ``self.y``.

        Args:
            scaler: the scaler that produced ``res_x`` (not used here; kept for
                the existing call signature).
            res_x: scaled feature frame.
            title: text prefixed to the printed score and the plot title.
            label: name of the target currently held in ``self.y``.
        """
        clf = XGBClassifier(n_estimators=100, random_state=self.random_state)
        # The folds are not shuffled, so a seed has no effect on them; passing
        # random_state without shuffle=True raises ValueError on current
        # scikit-learn releases.
        kfold = StratifiedKFold(n_splits=10)
        results = cross_val_score(clf, res_x, self.y, cv=kfold)
        # Reuse the same splitter so scores and predictions share their folds.
        y_pred = cross_val_predict(clf, res_x, self.y, cv=kfold)
        print('{} - {} - XGBoost Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
            title, label, results.mean() * 100, results.std() * 100))
        self.visualize.confusion_matrix(
            self.y, y_pred,
            '{} - {} - Label {}'.format(title, clf.__class__.__name__, label))

Пример #22

Показать файл

    def __init__(self):
        """Visualise how the searched hyperparameters affect the validation DR.

        A logistic regression is fitted as a baseline and ``self.dr`` is
        computed from its confusion matrix. The search results loaded by
        ``self.load_hyp()`` into ``self.hyp`` are then plotted: a correlation
        heat map, one box plot per hyperparameter and one heat map per
        hyperparameter pair, each showing ``val_dr_change`` (``val_dr`` minus
        the baseline). Every figure is saved under ``self.fname(title)`` and
        shown.
        """
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.n_classes = 5
            self.random_state = 20
            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.folder = 'viz'
            self.fprefix_multi = 'Hyper - annMLPMulti - '

            # Datasets
            self.X = None
            self.y = None
            self.X_train = None
            self.X_test = None
            self.y_train = None
            self.y_test = None
            self.hyp = None
            self.lr = None
            self.label_map_string_2_int = {
                'normal': 0,
                'dos': 1,
                'u2r': 2,
                'r2l': 3,
                'probe': 4
            }
            self.max_iters = 100

            with timer('\nPreparing dataset'):
                self.load_data()
                self.set_y()
                self.remove_target_from_X()
                self.train_test_split()

            with timer('\nPreparing base logistic regression'):
                self.lr = LogisticRegression(max_iter=self.max_iters)
                self.lr.fit(self.X_train, self.y_train)

            with timer('\nPreparing confusion matrix and base DR'):
                self.y_pred = self.lr.predict(self.X_test)
                cm = confusion_matrix(self.y_test, self.y_pred)
                self.tp = self.get_tp_from_cm(cm)
                self.tn = self.get_tn_from_cm(cm)
                self.fp = self.get_fp_from_cm(cm)
                self.fn = self.get_fn_from_cm(cm)
                # NOTE(review): the baseline is tp / (tp + fp); confirm this is
                # the intended definition of "DR".
                self.dr = self.tp / (self.tp + self.fp)
                print('log reg dr', self.dr)

            with timer('\nVisualising optimisation search'):
                self.load_hyp()
                self.hyp['lr'] = round(self.hyp['lr'] / 1000, 3)

                # Hyperparameter correlation with val DR.
                # NOTE(review): hyp_val_dr is the same object as self.hyp, not
                # a copy, so the in-place drop also removes these columns from
                # self.hyp. Kept as is to preserve the existing output.
                self.hyp_val_dr = self.hyp
                self.hyp_val_dr.drop([
                    'round_epochs', 'epochs', 'loss', 'dr', 'far', 'acc',
                    'val_loss', 'val_acc', 'val_far'
                ],
                                     axis=1,
                                     inplace=True)
                self.dr_corr = self.hyp_val_dr.corr()
                plt.clf()
                _, ax = plt.subplots(figsize=(10, 10))
                title = 'Validation DR Hyperparameter Correlation'
                ax.set_title(title, size=16)
                colormap = sns.diverging_palette(220, 10, as_cmap=True)
                sns.heatmap(self.dr_corr,
                            cmap=colormap,
                            annot=True,
                            fmt=".2f",
                            cbar=False,
                            vmin=-0.4,
                            vmax=0.4)
                plt.xticks(range(len(self.dr_corr.columns)),
                           self.dr_corr.columns)
                plt.yticks(range(len(self.dr_corr.columns)),
                           self.dr_corr.columns)
                plt.savefig(fname=self.fname(title), dpi=300, format='png')
                plt.show()

                self.hyp['val_dr_change'] = round(self.hyp.val_dr - self.dr, 3)
                pd.set_option('display.max_columns', 100)
                print(self.hyp.sort_values(by='val_dr', ascending=False).head())

                self.color = 'cornflowerblue'
                title_prefix = 'Validation DR Change Over Baseline As Fn Of '

                # One box plot per hyperparameter:
                # (column, title suffix, x-axis label, figure size).
                box_plots = (
                    ('lr', 'Learning Rate', 'Learning Rate', (10, 6)),
                    ('first_neuron', '# Neurons First Layer', 'First Neuron',
                     (10, 6)),
                    ('hidden_layers', '# Hidden Layers', 'Hidden Layers',
                     (10, 6)),
                    ('hidden_neuron', '# Hidden Layer Neurons',
                     'Hidden Neurons', (10, 6)),
                    ('batch_size', 'Batch Size', 'Batch Size', (10, 6)),
                    ('dropout', 'Dropout', 'Dropout', (12, 8)),
                )
                for metric, subject, xlabel, figsize in box_plots:
                    plt.clf()
                    plt.subplots(figsize=figsize)
                    sns.boxplot(x=metric,
                                y='val_dr_change',
                                data=self.hyp.reset_index(),
                                color=self.color)
                    title = title_prefix + subject
                    plt.title(title, fontsize=16)
                    plt.xlabel(xlabel, fontsize=12)
                    plt.ylabel('Validation DR Change', fontsize=12)
                    plt.savefig(fname=self.fname(title), dpi=300, format='png')
                    plt.show()

                # One heat map of the mean change per hyperparameter pair:
                # (row column, column column, y label, x label, title size).
                heat_maps = (
                    ('first_neuron', 'hidden_neuron', 'First Neuron',
                     'Hidden Neuron', 12),
                    ('hidden_layers', 'hidden_neuron', 'Hidden Layers',
                     'Hidden Neuron', 16),
                    ('batch_size', 'dropout', 'Batch Size', 'Dropout', 16),
                    ('lr', 'dropout', 'Learning Rate', 'Dropout', 16),
                )
                for rows, cols, ylabel, xlabel, title_size in heat_maps:
                    plt.clf()
                    plt.subplots(figsize=(9, 7))
                    df_grid = self.hyp.reset_index().groupby(
                        [rows, cols]).val_dr_change.mean().unstack()
                    sns.heatmap(data=df_grid,
                                cmap=(sns.diverging_palette(10, 220, sep=80,
                                                            n=7)),
                                annot=True,
                                cbar=False)
                    title = title_prefix + ylabel + ' & ' + xlabel
                    plt.title(title, fontsize=title_size)
                    plt.xlabel(xlabel, fontsize=10)
                    plt.ylabel(ylabel, fontsize=10)
                    plt.savefig(fname=self.fname(title), dpi=300, format='png')
                    plt.show()
        finally:
            # Undo the stdout redirection and close the log file even when a
            # step above raises, so later output is not swallowed by the log.
            self.log_file()
        print('Finished')

Пример #23

Показать файл

Файл: linearity.py Проект: corticalstack/KDDCup1999

class Linearity:
    """Explore the linear separability of the KDDCup1999 dataset.

    Constructing the class runs the whole experiment: the processed dataset is
    loaded, label-encoded and standard-scaled, then a sample of it is drawn
    per attack category and plotted as scatter graphs, convex hulls and
    classifier decision boundaries via ``self.visualize``.
    """

    # Feature pairs plotted by both scatter() and convex_hull(), in plot order.
    # NOTE(review): ('dst_host_srv_count', 'dst_bytes') is listed twice, as in
    # the original call sequence, so that figure is produced twice.
    _PLOT_PAIRS = (
        ('src_bytes', 'dst_bytes'),
        ('count', 'diff_srv_rate'),
        ('duration', 'src_bytes'),
        ('dst_host_srv_count', 'dst_bytes'),
        ('serror_rate', 'rerror_rate'),
        ('dst_host_srv_count', 'dst_bytes'),
        ('srv_diff_host_rate', 'srv_count'),
    )

    def __init__(self):
        """Run the linearity experiment end to end (see the class docstring)."""
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.visualize = Visualize()
            self.random_state = 20

            self.X = None
            self.y = None
            self.sample = None
            self.full = None
            self.ac_count = {}
            self.scale_cols = [
                'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                'urgent', 'hot', 'num_failed_logins', 'logged_in',
                'num_compromised', 'root_shell', 'su_attempted', 'num_root',
                'num_file_creations', 'num_shells', 'num_access_files',
                'is_guest_login', 'count', 'srv_count', 'serror_rate',
                'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                'dst_host_count', 'dst_host_srv_count',
                'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                'dst_host_srv_diff_host_rate'
            ]
            # Fraction of each attack category kept by sample_dataset().
            self.full_weights = {
                'normal': 1,
                'dos': 1,
                'probe': 1,
                'u2r': 1,
                'r2l': 1
            }
            self.minimal_weights = {
                'normal': 0.01,
                'dos': 0.01,
                'probe': 0.2,
                'u2r': 0.5,
                'r2l': 0.5
            }

            with timer('\nLoading dataset'):
                self.load_data()
                self.set_attack_category_count()
                self.ds.shape()
            with timer('\nEncode and Scale dataset'):
                # Encode categoricals
                le = preprocessing.LabelEncoder()
                for column in ('protocol_type', 'service', 'flag'):
                    self.full[column] = le.fit_transform(self.full[column])

                # Scale
                sc = StandardScaler()
                self.full[self.scale_cols] = sc.fit_transform(
                    self.full[self.scale_cols])

            # Each phase draws a fresh sample and hands it to one plot routine.
            phases = (
                ('\nPlotting scatter graphs', self.full_weights, self.scatter),
                ('\nPlotting scatter graphs with convex hull',
                 self.full_weights, self.convex_hull),
                ('\nPlotting linear separability with classifiers',
                 self.minimal_weights, self.classifiers),
            )
            for message, weights, plot in phases:
                with timer(message):
                    self.sample_dataset(weights)
                    print(self.sample.shape)
                    self.set_X_y('target')
                    plot()
        finally:
            # Undo the stdout redirection and close the log file even when a
            # step above raises, so later output is not swallowed by the log.
            self.log_file()
        print('Finished')

    def log_file(self):
        """Toggle redirection of stdout to a timestamped file under ``logs/``.

        Does nothing when ``sys.gettrace`` is unavailable or reports an active
        trace function (i.e. when running under a debugger). Otherwise the
        first call opens the log file and points ``sys.stdout`` at it; the next
        call restores the original stdout and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
        else:
            # Redirect stdout to file for logging if not in debug mode
            self.logfile = open(
                'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                               self.timestr), 'w')
            sys.stdout = self.logfile

    def scatter(self):
        """Draw a scatter plot of the sample for every pair in ``_PLOT_PAIRS``."""
        for cola, colb in self._PLOT_PAIRS:
            self.visualize.scatter(self.X,
                                   cola=cola,
                                   colb=colb,
                                   hue='target')

    def convex_hull(self):
        """Draw a convex-hull plot per target value for every pair in ``_PLOT_PAIRS``."""
        buckets = self.y.unique()
        for cola, colb in self._PLOT_PAIRS:
            self.visualize.convex_hull(self.X,
                                       buckets,
                                       cola=cola,
                                       colb=colb,
                                       target='target')

    def load_data(self):
        """Read the processed features and targets and join them in ``self.full``."""
        path = self.ds.config['path']
        stem = self.ds.config['file']
        self.ds.dataset = self.filehandler.read_csv(path, stem + '_processed')
        self.ds.target = self.filehandler.read_csv(path, stem + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def set_attack_category_count(self):
        """Record the number of rows per ``attack_category`` in ``self.ac_count``."""
        counts = self.full['attack_category'].value_counts()
        for category, count in counts.items():
            self.ac_count[category] = count

    def set_X_y(self, target):
        """Use the current sample as ``self.X`` and its ``target`` column as ``self.y``.

        ``self.X`` keeps every column of the sample, including ``target``.
        """
        print('Setting X, with y as {}'.format(target))
        self.X = self.sample
        self.y = self.sample[target]

    def sample_dataset(self, weights):
        """Draw a per-category random sample of ``self.full`` into ``self.sample``.

        Args:
            weights: mapping from attack category to the fraction of that
                category's rows (as counted in ``self.ac_count``) to keep.
        """
        print('Sampling dataset with weights {}'.format(weights))
        pieces = []
        for category, count in self.ac_count.items():
            n_rows = int(count * weights[category])
            subset = self.full[self.full.attack_category == category]
            pieces.append(subset.sample(n_rows, random_state=self.random_state))
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        # pd.concat rejects an empty list, so fall back to an empty frame.
        self.sample = pd.concat(pieces) if pieces else pd.DataFrame()

    def classifiers(self):
        """Fit three classifiers on selected feature pairs and plot the results.

        ``self.y`` is label-encoded in place. For every classifier and feature
        pair the model is fitted and evaluated on the same sample, then its
        decision boundary and confusion matrix are drawn.
        """
        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(self.y)
        _y = self.y

        titled_models = (
            ('Perceptron',
             Perceptron(max_iter=100,
                        tol=1e-3,
                        random_state=self.random_state)),
            ('LinearSVC (linear kernel)',
             LinearSVC(max_iter=500,
                       random_state=self.random_state,
                       tol=1e-5)),
            ('SVC with RBF kernel',
             SVC(kernel='rbf',
                 gamma=5,
                 C=10.0,
                 random_state=self.random_state)),
        )
        columns = [('srv_diff_host_rate', 'srv_count'),
                   ('dst_host_srv_count', 'count'),
                   ('dst_host_srv_count', 'dst_bytes')]

        for title, clf in titled_models:
            for cola, colb in columns:
                _x = self.X.loc[:, [cola, colb]]
                clf.fit(_x, _y)
                _y_pred = clf.predict(_x)
                self.visualize.boundary(_x, _y, clf, title, cola, colb)
                self.visualize.confusion_matrix(
                    _y, _y_pred, title + ' - ' + cola + ' vs ' + colb)

Пример #24

Показать файл

    def __init__(self):
        """Train and score each model on every target type it supports.

        After loading the data and building ``self.X``, every model is paired
        with the 'Binary' and 'Multi' classification types. Pairs the model
        does not enable are skipped. For the others the model is scored, its
        test and predicted values are mapped to readable labels, a confusion
        matrix is drawn and ``self.scores`` is called.
        """
        # Ignore low level instruction warnings
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        # Set tensorflow verbosity
        tf.logging.set_verbosity(tf.logging.ERROR)

        # NOTE: redirecting stdout to a log file via self.log_file() is
        # currently disabled in this class.

        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        self.random_state = 20
        # Label lookups accept the class id both as an int and as a string.
        self.label_multi = {
            0: 'normal',
            '0': 'normal',
            1: 'dos',
            '1': 'dos',
            2: 'u2r',
            '2': 'u2r',
            3: 'r2l',
            '3': 'r2l',
            4: 'probe',
            '4': 'probe'
        }
        self.label_binary = {0: 'good', '0': 'good', 1: 'bad', '1': 'bad'}

        with timer('\nLoading dataset'):
            self.load_data()

        with timer('\nSetting X and y'):
            self.set_X()
            self.n_features = self.X.shape[1]

        models = (RandomForestClf(), AnnSLPBinary(self.n_features),
                  AnnMLPBinary(self.n_features), AnnMLPMulti(self.n_features))
        classification_type = ('Binary', 'Multi')

        for m, ctype in itertools.product(models, classification_type):
            # Pick the target and label mapper for this pairing; skip pairings
            # the model does not enable.
            if ctype == 'Binary' and m.binary_enabled:
                self.set_y_binary()
                to_label = self.series_map_ac_binary_to_label
            elif ctype == 'Multi' and m.multi_enabled:
                self.set_y_multi()
                to_label = self.series_map_ac_multi_to_label
            else:
                continue

            model_name = m.__class__.__name__
            with timer('\nTraining and scoring {} - {} target'.format(
                    model_name, ctype)):
                m.base['model'] = m.get_model()
                m.score(self.X, self.y, ctype)

            # Convert the raw values to integer Series, then to label names.
            for values in (m.y_test, m.y_pred):
                as_int = pd.Series(values[ctype]).astype(int)
                values[ctype] = to_label(as_int)

            title = '{} - {} - {} '.format('CM', model_name, ctype)
            self.visualize.confusion_matrix(m.y_test[ctype], m.y_pred[ctype],
                                            title)
            self.scores(m.y_test[ctype], m.y_pred[ctype])

        # TODO: append the scores to a scores array so np.mean(scores) gives
        # the average over all the k-folds; also save the epoch and fold
        # numbers, if possible, to get a per-epoch score.

        print('Finished')

Пример #25

Показать файл

Файл: linearity.py Проект: corticalstack/KDDCup1999

    def __init__(self):
        """Run the linearity experiment end to end.

        The processed dataset is loaded, label-encoded and standard-scaled.
        A sample is then drawn per attack category three times and plotted as
        scatter graphs, convex hulls and classifier decision boundaries.
        Output is written to a log file while the experiment runs, unless a
        debugger is attached.
        """
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.ds = KDDCup1999()
            self.visualize = Visualize()
            self.random_state = 20

            self.X = None
            self.y = None
            self.sample = None
            self.full = None
            self.ac_count = {}
            self.scale_cols = [
                'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                'urgent', 'hot', 'num_failed_logins', 'logged_in',
                'num_compromised', 'root_shell', 'su_attempted', 'num_root',
                'num_file_creations', 'num_shells', 'num_access_files',
                'is_guest_login', 'count', 'srv_count', 'serror_rate',
                'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                'dst_host_count', 'dst_host_srv_count',
                'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                'dst_host_srv_diff_host_rate'
            ]
            # Per-category weights handed to sample_dataset().
            self.full_weights = {
                'normal': 1,
                'dos': 1,
                'probe': 1,
                'u2r': 1,
                'r2l': 1
            }
            self.minimal_weights = {
                'normal': 0.01,
                'dos': 0.01,
                'probe': 0.2,
                'u2r': 0.5,
                'r2l': 0.5
            }

            with timer('\nLoading dataset'):
                self.load_data()
                self.set_attack_category_count()
                self.ds.shape()
            with timer('\nEncode and Scale dataset'):
                # Encode categoricals
                le = preprocessing.LabelEncoder()
                for column in ('protocol_type', 'service', 'flag'):
                    self.full[column] = le.fit_transform(self.full[column])

                # Scale
                sc = StandardScaler()
                self.full[self.scale_cols] = sc.fit_transform(
                    self.full[self.scale_cols])

            # Each phase draws a fresh sample and hands it to one plot routine.
            phases = (
                ('\nPlotting scatter graphs', self.full_weights, self.scatter),
                ('\nPlotting scatter graphs with convex hull',
                 self.full_weights, self.convex_hull),
                ('\nPlotting linear separability with classifiers',
                 self.minimal_weights, self.classifiers),
            )
            for message, weights, plot in phases:
                with timer(message):
                    self.sample_dataset(weights)
                    print(self.sample.shape)
                    self.set_X_y('target')
                    plot()
        finally:
            # Undo the stdout redirection and close the log file even when a
            # step above raises, so later output is not swallowed by the log.
            self.log_file()
        print('Finished')

Пример #26

Показать файл

    def __init__(self):
        """Run the complete binary-classification experiment on construction.

        Steps, in order:

        1. Quieten TensorFlow logging and create a graph/session pair.
        2. Call ``self.log_file()`` (defined elsewhere in the class) and print
           the module docstring.
        3. Load the data, derive ``X``/``y`` and split into train and test.
        4. Fit one model per stratified fold and collect the per-epoch
           training and validation metrics of every fold.
        5. Fit a fresh model on the complete training set, passing the test
           set as ``validation_data``.
        6. Print the test metrics, draw the confusion matrix and save one
           plot per metric (loss, accuracy, detection rate, false alarm rate).
        7. Call ``self.log_file()`` again and print ``'Finished'``.
        """
        # Ignore low level instruction warnings
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        tf.logging.set_verbosity(tf.logging.ERROR)  # Set tensorflow verbosity
        self.g = tf.Graph()
        self.tf_sess = tf.Session(
            config=tf.ConfigProto(log_device_placement=True), graph=self.g)

        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        print(__doc__)

        self.random_state = 20
        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.folder = 'viz'

        # Datasets
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.n_features = None
        # Binary labels: every attack category collapses to class 1
        self.label_map_int_2_string = {
            0: 'good',
            1: 'bad',
            '0': 'good',
            '1': 'bad'
        }
        self.label_map_string_2_int = {
            'normal': 0,
            'dos': 1,
            'u2r': 1,
            'r2l': 1,
            'probe': 1
        }

        # K-fold validation
        self.splits = 5
        self.kfold = StratifiedKFold(n_splits=self.splits,
                                     shuffle=True,
                                     random_state=self.random_state)

        # Network parameters
        self.epochs = 20
        self.batch_size = 100
        self.verbose = 0

        # Scores: one list entry per fold, each entry a per-epoch series
        self.metric_loss = []
        self.metric_acc = []
        self.metric_dr = []
        self.metric_far = []

        self.metric_val_loss = []
        self.metric_val_acc = []
        self.metric_val_dr = []
        self.metric_val_far = []

        with timer('\nPreparing dataset'):
            self.load_data()
            self.set_y()
            self.remove_target_from_X()
            self.n_features = self.X.shape[1]
            self.train_test_split()

        with timer('\nTraining & validating model with kfold'):
            # NOTE(review): the value returned by as_default() is discarded;
            # it is normally used as a context manager, so this call most
            # likely has no effect. It is left untouched because wrapping the
            # section in ``with`` would interact with K.clear_session() below.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            # Train model on K-1 and validate using remaining fold.
            # No TensorBoard callback is attached for the fold runs.
            for train, val in self.kfold.split(self.X_train, self.y_train):
                self.model = self.get_model()

                self.history = self.model.fit(
                    self.X_train.iloc[train],
                    self.y_train.iloc[train],
                    validation_data=(self.X_train.iloc[val],
                                     self.y_train.iloc[val]),
                    epochs=self.epochs,
                    batch_size=self.batch_size,
                    verbose=self.verbose)

                fold = self.history.history
                self.metric_loss.append(fold['loss'])
                self.metric_acc.append(fold['acc'])
                self.metric_dr.append(fold['dr'])
                self.metric_far.append(fold['far'])
                self.metric_val_loss.append(fold['val_loss'])
                self.metric_val_acc.append(fold['val_acc'])
                self.metric_val_dr.append(fold['val_dr'])
                self.metric_val_far.append(fold['val_far'])

            print('\nTraining mean loss', np.mean(self.metric_loss))
            print('Training mean acc', np.mean(self.metric_acc))
            print('Training mean dr', np.mean(self.metric_dr))
            print('Training mean far', np.mean(self.metric_far))
            print('\nValidation mean loss', np.mean(self.metric_val_loss))
            print('Validation mean acc', np.mean(self.metric_val_acc))
            print('Validation mean dr', np.mean(self.metric_val_dr))
            print('Validation mean far', np.mean(self.metric_val_far))

        with timer('\nTesting model on unseen test set'):
            # NOTE(review): same discarded as_default() call as above.
            self.g.as_default()  # Reset graph for tensorboard display
            K.clear_session()

            self.tensorboard = TensorBoard(log_dir='logs/tb/annmlpbinary_test')
            self.model = self.get_model()

            # Train model on complete train set and validate with unseen test set
            self.history = self.model.fit(self.X_train,
                                          self.y_train,
                                          validation_data=(self.X_test,
                                                           self.y_test),
                                          epochs=self.epochs,
                                          batch_size=self.batch_size,
                                          verbose=self.verbose,
                                          callbacks=[self.tensorboard])

        with timer('\nVisualising results'):
            # Plot model
            plot_model(self.model, to_file='viz/annMLPBinary - model plot.png')

            # Get single class prediction (rather than multi class probability summing to 1)
            y_pred = self.model.predict_classes(self.X_test)

            # The test set was supplied as validation_data for the final fit,
            # so its metrics live under the 'val_'-prefixed history keys; the
            # unprefixed keys hold the metrics of the training data.
            final = self.history.history
            print('Test loss', np.mean(final['val_loss']))
            print('Test acc', np.mean(final['val_acc']))
            print('Test dr', np.mean(final['val_dr']))
            print('Test far', np.mean(final['val_far']))

            # Remap to string class targets
            self.y_pred = self.map_target_to_label(y_pred)
            self.y_pred = self.y_pred.ravel()
            self.y_test = self.map_target_to_label(self.y_test)

            self.visualize.confusion_matrix(self.y_test, self.y_pred,
                                            self.__class__.__name__)

            epochs = range(1, len(final['loss']) + 1)

            # One row per figure: history key, axis/title label, legend
            # location, per-fold training series, per-fold validation series.
            plots = (
                ('loss', 'Loss', 1,
                 self.metric_loss, self.metric_val_loss),
                ('acc', 'Accuracy', 4,
                 self.metric_acc, self.metric_val_acc),
                ('dr', 'Detection Rate', 4,
                 self.metric_dr, self.metric_val_dr),
                ('far', 'False Alarm Rate', 1,
                 self.metric_far, self.metric_val_far),
            )

            # Select the style before the first figure exists so that all
            # four figures are rendered with it.
            plt.style.use('ggplot')

            for key, label, legend_loc, fold_train, fold_val in plots:
                fig, ax = plt.subplots(figsize=(15, 8))
                ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                ax.tick_params(axis='both', which='major', labelsize=12)
                # Fold curves are averaged across folds, epoch by epoch
                ax.plot(epochs,
                        np.mean(fold_train, axis=0),
                        'g',
                        label='Training')
                ax.plot(epochs,
                        np.mean(fold_val, axis=0),
                        'b',
                        label='Validation')
                ax.plot(epochs, final['val_' + key], 'r', label='Test')
                self.title = '{} - {}'.format(self.__class__.__name__, label)
                plt.title(self.title, fontsize=18)
                plt.xlabel('Epochs', fontsize=14)
                plt.ylabel(label, fontsize=14)
                plt.legend(loc=legend_loc, prop={'size': 14})
                plt.savefig(fname=self.fname(self.title), dpi=300, format='png')
                plt.show()
                # Release the figure once it has been saved and shown
                plt.close(fig)

        self.log_file()
        print('Finished')

Пример #27

Показать файл

class Preprocessing:
    """Explore, clean and persist the dataset wrapped by ``KDDCup1999``.

    All work happens in ``__init__``: constructing the class runs the whole
    pipeline and writes the processed dataset and its target back to disk.
    """

    def __init__(self):
        """Run every preprocessing stage in order, timing each one.

        Standard output is handed to ``log_file()`` for the duration of the
        run and is always handed back, even when a stage raises.
        """
        self.logfile = False
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()

        try:
            print(__doc__)

            self.filehandler = Filehandler()
            self.visualize = Visualize()
            self.ds = KDDCup1999()

            with timer('\nLoading dataset'):
                self.ds.dataset = self.filehandler.read_csv(
                    self.ds.config['path'], self.ds.config['file'])
                self.ds.set_columns()
            with timer('\nTransforming dataset'):
                self.ds.transform()
            with timer('\nInitial dataset discovery'):
                self.ds.shape()
                self.ds.show_duplicates(self.ds.config['level_01'])
                self.ds.drop_duplicates()
                self.show_zeros()
                self.ds.drop_outliers()
                self.ds.shape()
                self.ds.discovery()
            with timer('\nSetting target'):
                self.ds.set_target()
            with timer('\nEvaluating sparse features'):
                self.ds.evaluate_sparse_features(engineer=False)
            with timer('\nVisualising pairplot for selected columns'):
                self.visualize.pairplot(self.ds.dataset,
                                        self.ds.config['pairplot_cols'],
                                        self.ds.config['pairplot_target'])
            with timer('\nDropping columns'):
                self.ds.drop_cols(self.ds.config['drop_cols_01'])
            with timer('\nEvaluating correlation'):
                self.visualize.correlation_heatmap(
                    self.ds.dataset,
                    title='Correlation Heatmap Before Column Drop')
                self.ds.drop_highly_correlated()
                self.visualize.correlation_heatmap(
                    self.ds.dataset,
                    title='Correlation Heatmap After Column Drop')
            with timer('\nPersisting transformed dataset and target'):
                self.filehandler.write_csv(
                    self.ds.config['path'],
                    self.ds.config['file'] + '_processed',
                    self.ds.dataset)
                self.filehandler.write_csv(
                    self.ds.config['path'],
                    self.ds.config['file'] + '_target',
                    self.ds.target)
                self.ds.shape()
        finally:
            # Second call undoes the redirection set up by the first one, so a
            # failing stage cannot leave sys.stdout pointing at the log file.
            self.log_file()

        print('Finished')

    def log_file(self):
        """Toggle redirection of ``sys.stdout`` to a timestamped log file.

        Nothing happens when ``sys.gettrace`` is unavailable or reports an
        active trace function (debug mode). Otherwise the call either opens
        ``logs/<ClassName>_<timestamp>_stdout.txt`` and points ``sys.stdout``
        at it, or - when a log file is already open - restores the original
        stream and closes the file.
        """
        if self.gettrace is None or self.gettrace():
            return

        if self.logfile:
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
            return

        import os  # local import: only this branch touches the filesystem

        # Redirect stdout to file for logging if not in debug mode.
        # The directory is created on demand so a fresh checkout does not fail.
        os.makedirs('logs', exist_ok=True)
        self.logfile = open(
            'logs/{}_{}_stdout.txt'.format(self.__class__.__name__,
                                           self.timestr), 'w')
        sys.stdout = self.logfile

    def show_zeros(self):
        """Visualise the zero values of every column except the last three.

        Zeros are turned into NaN on a private copy, which is then passed to
        the matrix, bar and heat "missing value" plots of ``self.visualize``.
        ``self.ds.dataset`` itself is left unchanged.
        """
        # Copy first: the iloc slice may share memory with the dataset, and
        # masking it in place could write NaNs into the real data.
        df = self.ds.dataset.iloc[:, :-3].copy()
        # Transform 0's to NaN for visualisation of sparseness with missingno
        df[df == 0] = np.nan
        self.visualize.matrix_missing(
            df, 'Nullity matrix of features with 0 values')
        self.visualize.bar_missing(df, 'Bar plot of features with 0 values')
        self.visualize.heat_missing(df,
                                    'Heatmap of features with missing values')