Example #1
def preprocess_val_dataset(pipeline, val_dataset):
    """
    Pre-processes the validation dataset.

    :param pipeline: Scikit-learn pipeline.
    :param val_dataset: Validation dataset.
    :return: Tuple of (transformed features, binary attack labels).
    """
    X_val, y_val = split_x_y(val_dataset)
    X_val = pipeline.transform(X_val)

    # Only the binary attack indicator is used as the target.
    return X_val, y_val.label_is_attack
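For context, a minimal usage sketch tying this to `fit_pipeline` from Example #2 (the DataFrames `train_df` and `val_df` are hypothetical; they must carry the feature columns plus the label columns, including `label_is_attack`):

pipeline, column_names = fit_pipeline(train_df)          # see Example #2
X_val, y_val = preprocess_val_dataset(pipeline, val_df)
print(X_val.shape, y_val.value_counts())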
Example #2
def fit_pipeline(train_dataset):
    """
    Creates and fits the scikit-learn pre-processing pipeline.

    :param train_dataset: Training dataset.
    :return: Tuple of (fitted scikit-learn pipeline, column names).
    """
    cols_to_impute = train_dataset.columns[train_dataset.isna().any()].tolist()

    X_train, _ = split_x_y(train_dataset)

    pipeline, get_col_names = create_pipeline(X_train,
                                              imputer_strategy='median',
                                              imputer_cols=cols_to_impute,
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})
    pipeline.fit(X_train)
    return pipeline, get_col_names()
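Note that `FunctionTransformer` with no function is the identity transform, so this configuration imputes missing values but applies no actual scaling. `create_pipeline` itself is a project-local helper; the following is a minimal sketch of the kind of pipeline it presumably assembles, using only scikit-learn (an assumption, not the original implementation):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def reference_pipeline(imputer_cols):
    # Impute only the columns that contain NaNs; pass every other column through.
    columns = ColumnTransformer(
        [('impute', SimpleImputer(strategy='median'), imputer_cols)],
        remainder='passthrough')
    # FunctionTransformer with no function is the identity transform, so this
    # "scaler" step leaves the imputed values untouched.
    return Pipeline([('columns', columns),
                     ('scale', FunctionTransformer(validate=False))])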
Example #3
def preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_state):
    """
    Pre-processes the training dataset.

    :param pipeline: Scikit-learn pipeline.
    :param train_dataset: Training dataset.
    :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the
    dataset is lower, the SMOTE algorithm is used to upsample that category to the requested number of samples.
    :param random_state: Random seed for the upsampling step.
    :return: Tuple of (transformed features, binary labels).
    """
    X_train, y_train = split_x_y(train_dataset)
    X_train = pipeline.transform(X_train)

    X_train, y_train = upsample_minority_classes(X_train, y_train,
                                                 min_samples=nr_attack_samples,
                                                 random_state=random_state)

    # Collapse the attack categories into a binary target: 0 = benign, 1 = attack.
    return X_train, (y_train != 0).astype('int')
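`upsample_minority_classes` is a project-local helper. Assuming it wraps imbalanced-learn's SMOTE, as the docstring suggests, a minimal sketch could look like this (the name and exact behaviour are assumptions):

from collections import Counter
from imblearn.over_sampling import SMOTE

def upsample_minority_classes_sketch(X, y, min_samples, random_state=None):
    # Grow every class below `min_samples` up to that count; larger classes
    # are left untouched.
    targets = {cls: min_samples
               for cls, count in Counter(y).items() if count < min_samples}
    if not targets:
        return X, y
    smote = SMOTE(sampling_strategy=targets, random_state=random_state)
    return smote.fit_resample(X, y)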
Example #4
def measure_performance(clf, pipeline, dataset):
    """
    Measures performance metrics on the given dataset.

    :param clf: Classifier to test.
    :param pipeline: Preprocessing pipeline.
    :param dataset: Dataset to evaluate on.
    :return: Tuple of (pr_auc, precision, recall, f1).
    """
    X, y = split_x_y(dataset)
    X = pipeline.transform(X)

    pool = Pool(X)  # CatBoost data container wrapping the transformed features
    y_true = y.label_is_attack

    pred_proba = predict_proba_positive(clf, pool)
    pred = clf.predict(pool)

    pr_auc = average_precision_score(y_true, pred_proba)
    precision = precision_score(y_true, pred)
    recall = recall_score(y_true, pred)
    f1 = f1_score(y_true, pred)
    return pr_auc, precision, recall, f1
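`Pool` is CatBoost's data container, which implies `clf` is a CatBoost model. `predict_proba_positive` is a project-local helper; presumably it extracts the positive-class probability, along the lines of this sketch (an assumption, not the original code):

def predict_proba_positive(clf, data):
    # For a binary classifier, column 1 of predict_proba holds the
    # probability of the positive class.
    return clf.predict_proba(data)[:, 1]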
Example #5
def transform_data(dataset,
                   attack_samples,
                   imputer_strategy,
                   scaler,
                   benign_samples=None,
                   random_state=None):
    """
    Splits the dataset into train/validation/test, fits the pre-processing pipeline on the
    training split, transforms all three splits and upsamples minority attack categories
    in the training split.

    :param dataset: Full dataset, including label columns.
    :param attack_samples: Minimum number of attack samples per category (see `preprocess_train_dataset`).
    :param imputer_strategy: Imputation strategy for the columns that contain missing values.
    :param scaler: Scikit-learn scaler class passed to `create_pipeline`.
    :param benign_samples: If set, the training split is downsampled to this number of samples.
    :param random_state: Random seed for splitting, downsampling and upsampling.
    :return: Tuple of (X_train, y_train, X_val, y_val, X_test, y_test, column names).
    """

    cols_to_impute = dataset.columns[dataset.isna().any()].tolist()

    train_data, val_data, test_data = train_val_test_split(dataset,
                                                           val_size=0.1,
                                                           test_size=0.1,
                                                           stratify_col='label_cat',
                                                           random_state=random_state)

    if benign_samples:
        train_data = downsample(train_data, default_nr_samples=benign_samples, random_state=random_state)

    X_train_raw, y_train = split_x_y(train_data)
    X_val_raw, y_val = split_x_y(val_data)
    X_test_raw, y_test = split_x_y(test_data)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train_raw.shape))
    print('Val:      {}'.format(X_val_raw.shape))
    print('Test:     {}'.format(X_test_raw.shape))

    print('\nTraining labels:')
    print('================')
    print(y_train.label.value_counts())
    print('\nValidation labels:')
    print('==================')
    print(y_val.label.value_counts())
    print('\nTest labels:')
    print('============')
    print(y_test.label.value_counts())

    del train_data, val_data, test_data
    gc.collect()

    pipeline, get_col_names = create_pipeline(X_train_raw,
                                              imputer_strategy=imputer_strategy,
                                              imputer_cols=cols_to_impute,
                                              scaler=scaler)

    X_train = pipeline.fit_transform(X_train_raw)
    X_val = pipeline.transform(X_val_raw)
    X_test = pipeline.transform(X_test_raw)

    column_names = get_col_names()

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))
    print('Val:      {}'.format(X_val.shape))
    print('Test:     {}'.format(X_test.shape))

    print('\nMissing values:')
    print('===============')
    print('Training: {}'.format(np.count_nonzero(np.isnan(X_train))))
    print('Val:      {}'.format(np.count_nonzero(np.isnan(X_val))))
    print('Test:     {}'.format(np.count_nonzero(np.isnan(X_test))))

    print('\nScaling:')
    print('========')
    print('Training: min={}, max={}'.format(np.min(X_train), np.max(X_train)))
    print('Val:      min={}, max={}'.format(np.min(X_val), np.max(X_val)))
    print('Test:     min={}, max={}'.format(np.min(X_test), np.max(X_test)))

    X_train, y_train = upsample_minority_classes(X_train,
                                                 y_train,
                                                 min_samples=attack_samples,
                                                 random_state=random_state)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))

    print('\nTraining labels:')
    print('================')
    print(Counter(y_train))

    return X_train, y_train, X_val, y_val, X_test, y_test, column_names
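Finally, a hedged usage sketch of `transform_data` (the DataFrame `df` and all parameter values are hypothetical; `df` must provide the `label`, `label_cat` and `label_is_attack` columns the helpers above rely on):

from sklearn.preprocessing import MinMaxScaler

# Hypothetical call with illustrative parameter values.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test,
 column_names) = transform_data(df,
                                attack_samples=5000,
                                imputer_strategy='median',
                                scaler=MinMaxScaler,
                                benign_samples=100000,
                                random_state=42)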