Example #1
import random

import numpy as np
from sklearn.utils import Bunch  # stand-in container; the source library may define its own Bunch


def inject_outlier_ts(X: np.ndarray,
                      perc_outlier: int,
                      perc_window: int = 10,
                      n_std: float = 2.,
                      min_std: float = 1.) -> Bunch:
    """
    Inject outliers in both univariate and multivariate time series data.

    Parameters
    ----------
    X
        Time series data to perturb (inject outliers).
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multivariate data,
        the percentage is evenly split across the individual time series.
    perc_window
        Percentage of the observations that constitutes the window in which the standard deviation
        for the perturbation is computed.
    n_std
        Number of standard deviations in the window used to perturb the original data.
    min_std
        Minimum number of standard deviations by which an observation is perturbed. Because the
        perturbation is stochastic, this floor prevents negligible perturbations.

    Returns
    -------
    Bunch object with the perturbed time series and the outlier labels.
    """
    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_ts = X.shape
    X_outlier = X.copy()
    is_outlier = np.zeros(n_samples)
    # half-width of the window around each perturbed instance used to compute the local stdev
    window = int(perc_window * n_samples * .5 / 100)
    # distribute outliers evenly over different time series
    n_outlier = int(n_samples * perc_outlier * .01 / n_ts)
    if n_outlier == 0:
        return Bunch(data=X_outlier,
                     target=is_outlier,
                     target_names=['normal', 'outlier'])
    for s in range(n_ts):
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        window_idx = [
            np.maximum(outlier_idx - window, 0),
            np.minimum(outlier_idx + window, n_samples)
        ]
        stdev = np.array([
            X_outlier[window_idx[0][i]:window_idx[1][i], s].std()
            for i in range(len(outlier_idx))
        ])
        rnd = np.random.normal(size=n_outlier)
        X_outlier[outlier_idx, s] += np.sign(rnd) * np.maximum(
            np.abs(rnd * n_std), min_std) * stdev
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier,
                 target=is_outlier,
                 target_names=['normal', 'outlier'])
Example #2
import random
from typing import List

import numpy as np
from sklearn.utils import Bunch  # stand-in; see Example #1


def inject_outlier_tabular(X: np.ndarray,
                           cols: List[int],
                           perc_outlier: int,
                           y: np.ndarray = None,
                           n_std: float = 2.,
                           min_std: float = 1.
                           ) -> Bunch:
    """
    Inject outliers in numerical tabular data.

    Parameters
    ----------
    X
        Tabular data to perturb (inject outliers).
    cols
        Columns of X that are numerical and can be perturbed.
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multiple numerical features,
        the percentage is evenly split across the features.
    y
        Outlier labels.
    n_std
        Number of feature-wise standard deviations used to perturb the original data.
    min_std
        Minimum number of standard deviations by which an observation is perturbed. Because the
        perturbation is stochastic, this floor prevents negligible perturbations.

    Returns
    -------
    Bunch object with the perturbed tabular data and the outlier labels.
    """
    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_features = X.shape
    X_outlier = X.astype(np.float32).copy()
    if y is None:
        is_outlier = np.zeros(n_samples)
    else:
        is_outlier = y
    n_cols = len(cols)

    # distribute outliers evenly over different columns
    n_outlier = int(n_samples * perc_outlier * .01 / n_cols)
    if n_outlier == 0:
        return Bunch(data=X_outlier, target=is_outlier, target_names=['normal', 'outlier'])

    # add perturbations
    stdev = X_outlier.std(axis=0)
    for col in cols:
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        rnd = np.random.normal(size=n_outlier)
        X_outlier[outlier_idx, col] += np.sign(rnd) * np.maximum(np.abs(rnd * n_std), min_std) * stdev[col]
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier, target=is_outlier, target_names=['normal', 'outlier'])
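
A quick sketch perturbing two numerical columns of a random table (data and column choices are illustrative):

X = np.random.randn(500, 4)
bunch = inject_outlier_tabular(X, cols=[0, 2], perc_outlier=10)
X_perturbed, y_outlier = bunch.data, bunch.target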
Example #3
from typing import Tuple, Union

import numpy as np
from sklearn.utils import Bunch  # stand-in; see Example #1

# `load_url_arff` is assumed to be a library helper that loads an ARFF file
# from a URL into a numpy array.


def fetch_ecg(return_X_y: bool = False) \
        -> Union[Bunch, Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]:
    """
    Fetch ECG5000 data. The dataset contains 5000 ECGs, originally obtained from
    Physionet (https://archive.physionet.org/cgi-bin/atm/ATM) under the name
    "BIDMC Congestive Heart Failure Database (chfdb)", record "chf07".

    Parameters
    ----------
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Train and test datasets with labels.
    (train data, train target), (test data, test target)
        Tuple of tuples if 'return_X_y' equals True.
    """
    Xy_train = load_url_arff(
        'https://storage.googleapis.com/seldon-datasets/ecg/ECG5000_TRAIN.arff'
    )
    X_train, y_train = Xy_train[:, :-1], Xy_train[:, -1]
    Xy_test = load_url_arff(
        'https://storage.googleapis.com/seldon-datasets/ecg/ECG5000_TEST.arff')
    X_test, y_test = Xy_test[:, :-1], Xy_test[:, -1]
    if return_X_y:
        return (X_train, y_train), (X_test, y_test)
    else:
        return Bunch(data_train=X_train,
                     data_test=X_test,
                     target_train=y_train,
                     target_test=y_test)
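
Fetching the train and test splits as plain arrays (downloads the ARFF files from the Seldon bucket at call time):

(X_train, y_train), (X_test, y_test) = fetch_ecg(return_X_y=True)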
Example #4
from io import BytesIO
from typing import List, Tuple, Union

import numpy as np
import requests
from sklearn.utils import Bunch  # stand-in; see Example #1

# `corruption_types_cifar10c` is assumed to be a library helper returning the
# list of valid corruption names.


def fetch_cifar10c(corruption: Union[str, List[str]], severity: int, return_X_y: bool = False) \
        -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    Fetch CIFAR-10-C data. Originally obtained from https://zenodo.org/record/2535967#.XkKh2XX7Qts and
    introduced in "Hendrycks, D. and Dietterich, T.G. Benchmarking Neural Network Robustness to Common Corruptions
    and Perturbations. In 7th International Conference on Learning Representations, 2019.".

    Parameters
    ----------
    corruption
        Corruption type. Options can be checked with `corruption_types_cifar10c()`.
        Alternatively, specify 'all' for all corruptions at a severity level.
    severity
        Severity level of corruption (1-5).
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Corrupted dataset with labels.
    (corrupted data, target)
        Tuple if 'return_X_y' equals True.
    """
    url = 'https://storage.googleapis.com/seldon-datasets/cifar10c/'
    n = 10000  # instances per corrupted test set
    istart, iend = (severity - 1) * n, severity * n  # idx for the relevant severity level
    corruption_list = corruption_types_cifar10c()  # get all possible corruption types
    # convert input to list
    if isinstance(corruption, str) and corruption != 'all':
        corruption = [corruption]
    elif corruption == 'all':
        corruption = corruption_list
    for corr in corruption:  # check values in corruptions
        if corr not in corruption_list:
            raise ValueError(f'{corr} is not a valid corruption type.')
    # get corrupted data
    shape = (len(corruption) * n, 32, 32, 3)
    X = np.zeros(shape)
    for i, corr in enumerate(corruption):
        url_corruption = url + corr + '.npy'  # plain concatenation: os.path.join is not URL-safe
        resp = requests.get(url_corruption)
        X_corr = np.load(BytesIO(resp.content))[istart:iend].astype('float32')
        X[i * n:(i + 1) * n] = X_corr

    # get labels
    url_labels = url + 'labels.npy'
    resp = requests.get(url_labels)
    y = np.load(BytesIO(resp.content))[istart:iend].astype('int64')
    if X.shape[0] != y.shape[0]:
        repeat = X.shape[0] // y.shape[0]
        y = np.tile(y, (repeat, ))

    if return_X_y:
        return (X, y)
    else:
        return Bunch(data=X, target=y)
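
Example call; the corruption names below are two of the standard CIFAR-10-C types (check `corruption_types_cifar10c()` for the full list):

X_corr, y = fetch_cifar10c(corruption=['gaussian_noise', 'motion_blur'], severity=5, return_X_y=True)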
Example #5
from typing import Union

import numpy as np
import requests
from sklearn.utils import Bunch  # stand-in; see Example #1

# `load_genome_npz` is assumed to be a library helper returning a tuple of
# (data, outlier flags), plus labels when `return_labels=True`.


def fetch_genome(return_X_y: bool = False,
                 return_labels: bool = False) -> Union[Bunch, tuple]:
    """
    Load genome data, including the genome labels and a flag for whether each instance is an outlier. More details can be
    found in the readme on https://console.cloud.google.com/storage/browser/seldon-datasets/genome/.
    The original data can be found here: https://drive.google.com/drive/folders/1Ht9xmzyYPbDouUTl_KQdLTJQYX2CuclR.

    Parameters
    ----------
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.
    return_labels
        Whether to return the genome labels, which are detailed in the `label_dict` entry
        of the returned Bunch object.

    Returns
    -------
    Bunch
        Training, validation and test data, whether they are outliers, and optionally the
        genome labels, whose mapping is stored as a dictionary under the `label_dict` key.
    (data, outlier) or (data, outlier, target)
        Tuple for the train, validation and test set with either the data and whether they
        are outliers or the data, outlier flag and labels for the genomes if 'return_X_y' equals True.
    """
    data_train = load_genome_npz('train_in', return_labels=return_labels)
    data_val_in = load_genome_npz('val_in', return_labels=return_labels)
    data_val_ood = load_genome_npz('val_ood', return_labels=return_labels)
    data_val = (np.concatenate([data_val_in[0], data_val_ood[0]]),
                np.concatenate([data_val_in[1], data_val_ood[1]]))
    data_test_in = load_genome_npz('test_in', return_labels=return_labels)
    data_test_ood = load_genome_npz('test_ood', return_labels=return_labels)
    data_test = (np.concatenate([data_test_in[0], data_test_ood[0]]),
                 np.concatenate([data_test_in[1], data_test_ood[1]]))
    if return_labels:
        data_val += (np.concatenate([data_val_in[2], data_val_ood[2]]),)  # type: ignore
        data_test += (np.concatenate([data_test_in[2], data_test_ood[2]]),)  # type: ignore
    if return_X_y:
        return data_train, data_val, data_test
    resp = requests.get(
        'https://storage.googleapis.com/seldon-datasets/genome/label_dict.json'
    )
    label_dict = resp.json()
    bunch = Bunch(data_train=data_train[0],
                  data_val=data_val[0],
                  data_test=data_test[0],
                  outlier_train=data_train[1],
                  outlier_val=data_val[1],
                  outlier_test=data_test[1],
                  label_dict=label_dict)
    if not return_labels:
        return bunch
    else:
        bunch['target_train'] = data_train[2]  # type: ignore
        bunch['target_val'] = data_val[2]  # type: ignore
        bunch['target_test'] = data_test[2]  # type: ignore
        return bunch
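
Sketch of fetching the splits as tuples; each split is (data, outlier flag), with labels appended when `return_labels=True`:

data_train, data_val, data_test = fetch_genome(return_X_y=True)
X_train, outlier_train = data_train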
Example #6
import logging
from io import BytesIO
from typing import Tuple, Union

import dill
import numpy as np
import requests
from requests import RequestException
from sklearn.utils import Bunch  # stand-in; see Example #1

logger = logging.getLogger(__name__)


def fetch_attack(dataset: str, model: str, attack: str, return_X_y: bool = False) \
        -> Union[Bunch, Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]:
    """
    Load adversarial instances for a given dataset, model and attack type.

    Parameters
    ----------
    dataset
        Dataset under attack.
    model
        Model under attack.
    attack
        Attack name.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Adversarial instances with original labels.
    (train data, train target), (test data, test target)
        Tuple of tuples if 'return_X_y' equals True.
    """
    # define paths
    url = 'https://storage.googleapis.com/seldon-datasets/'
    path_attack = url + dataset + '/attacks/' + model + '/' + attack  # string concat: os.path.join is not URL-safe
    path_data = path_attack + '.npz'
    path_meta = path_attack + '_meta.pickle'
    # get adversarial instances and labels
    try:
        resp = requests.get(path_data, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise
    data = np.load(BytesIO(resp.content))
    X_train, X_test = data['X_train_adv'], data['X_test_adv']
    y_train, y_test = data['y_train'], data['y_test']

    if return_X_y:
        return (X_train, y_train), (X_test, y_test)

    # get metadata
    try:
        resp = requests.get(path_meta, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise
    meta = dill.load(BytesIO(resp.content))
    return Bunch(data_train=X_train,
                 data_test=X_test,
                 target_train=y_train,
                 target_test=y_test,
                 meta=meta)
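
Example call; the dataset, model and attack names must match files hosted under the bucket layout above (this combination is illustrative):

(X_train, y_train), (X_test, y_test) = fetch_attack('cifar10', 'resnet56', 'cw', return_X_y=True)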
Example #7
import logging
from typing import Tuple, Union

import numpy as np
import pandas as pd
import requests
from requests import RequestException
from sklearn.utils import Bunch  # stand-in; see Example #1

logger = logging.getLogger(__name__)


def fetch_nab(
    ts: str,
    return_X_y: bool = False
) -> Union[Bunch, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Get time series in a DataFrame from the Numenta Anomaly Benchmark: https://github.com/numenta/NAB.

    Parameters
    ----------
    ts
        Name of the time series dataset, relative to the NAB data directory and without
        the '.csv' extension, e.g. 'realKnownCause/nyc_taxi'.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Dataset and outlier labels (0 means 'normal' and 1 means 'outlier') in DataFrames with timestamps.
    (data, target)
        Tuple if 'return_X_y' equals True.
    """
    url_labels = 'https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_labels.json'
    try:
        resp = requests.get(url_labels, timeout=2)
        resp.raise_for_status()
    except RequestException:
        logger.exception("Could not connect, URL may be out of service")
        raise
    labels_json = resp.json()
    outliers = labels_json[ts + '.csv']
    if not outliers:
        logger.warning('The dataset does not contain any outliers.')
    url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/' + ts + '.csv'
    df = pd.read_csv(url, header=0, index_col=0)
    labels = np.zeros(df.shape[0])
    for outlier in outliers:
        outlier_id = np.where(df.index == outlier)[0][0]
        labels[outlier_id] = 1
    df.index = pd.to_datetime(df.index)
    df_labels = pd.DataFrame(data={'is_outlier': labels}, index=df.index)

    if return_X_y:
        return df, df_labels

    return Bunch(data=df, target=df_labels, target_names=['normal', 'outlier'])
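
Example, fetching the NYC taxi series from NAB:

df, df_labels = fetch_nab('realKnownCause/nyc_taxi', return_X_y=True)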
Example #8
from typing import Tuple, Union

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_kddcup99
from sklearn.utils import Bunch  # stand-in; see Example #1


def fetch_kdd(
        target: list = ['dos', 'r2l', 'u2r', 'probe'],
        keep_cols: list = [
            'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
            'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
            'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
            'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate', 'dst_host_srv_serror_rate',
            'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
        ],
        percent10: bool = True,
        return_X_y: bool = False
) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    KDD Cup '99 dataset. Detect computer network intrusions.

    Parameters
    ----------
    target
        List with attack types to detect.
    keep_cols
        List with columns to keep. Defaults to continuous features.
    percent10
        Bool, whether to only return 10% of the data.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Dataset and outlier labels (0 means 'normal' and 1 means 'outlier').
    (data, target)
        Tuple if 'return_X_y' equals True.
    """

    # fetch raw data
    data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10)

    # specify columns
    cols = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate'
    ]

    # create dataframe
    data = pd.DataFrame(data=data_raw['data'], columns=cols)

    # add target to dataframe
    data['attack_type'] = data_raw['target']

    # specify and map attack types
    attack_list = np.unique(data['attack_type'])
    attack_category = [
        'dos', 'u2r', 'r2l', 'r2l', 'r2l', 'probe', 'dos', 'u2r', 'r2l', 'dos',
        'probe', 'normal', 'u2r', 'r2l', 'dos', 'probe', 'u2r', 'probe', 'dos',
        'r2l', 'dos', 'r2l', 'r2l'
    ]

    attack_types = dict(zip(attack_list, attack_category))

    data['attack_category'] = 'normal'
    for k, v in attack_types.items():
        data.loc[data['attack_type'] == k, 'attack_category'] = v  # .loc avoids chained assignment

    # define target
    data['target'] = 0
    for t in target:
        data.loc[data['attack_category'] == t, 'target'] = 1
    is_outlier = data['target'].values
    is_outlier = data['target'].values

    # define columns to be dropped
    drop_cols = [col for col in data.columns.values if col not in keep_cols]
    if drop_cols:
        data.drop(columns=drop_cols, inplace=True)

    if return_X_y:
        return data.values, is_outlier

    return Bunch(data=data.values,
                 target=is_outlier,
                 target_names=['normal', 'outlier'],
                 feature_names=keep_cols)
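
Example call returning the continuous features and 0/1 outlier labels for DoS and probe attacks:

X, y = fetch_kdd(target=['dos', 'probe'], percent10=True, return_X_y=True)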
Example #9
import random
from typing import List

import numpy as np
from sklearn.utils import Bunch  # stand-in; see Example #1

# `ohe2ord`, `abdm`, `multidim_scaling` and `Discretizer` are assumed to be
# helpers from the surrounding library for categorical encoding and the
# pairwise distance computations described in the docstring.


def inject_outlier_categorical(X: np.ndarray,
                               cols: List[int],
                               perc_outlier: int,
                               y: np.ndarray = None,
                               cat_perturb: dict = None,
                               X_fit: np.ndarray = None,
                               disc_perc: list = [25, 50, 75],
                               smooth: float = 1.) -> Bunch:
    """
    Inject outliers in categorical variables of tabular data.

    Parameters
    ----------
    X
        Tabular data with categorical variables to perturb (inject outliers).
    cols
        Columns of X that are categorical and can be perturbed.
    perc_outlier
        Percentage of observations which are perturbed to outliers. For multiple categorical features,
        the percentage is evenly split across the features.
    y
        Outlier labels.
    cat_perturb
        Dictionary mapping each category in the categorical variables to their furthest neighbour.
    X_fit
        Optional data used to infer pairwise distances from.
    disc_perc
        List with percentiles used in binning of numerical features used for the 'abdm' pairwise distance measure.
    smooth
        Smoothing exponent between 0 and 1 for the distances.
        Lower values will smooth the difference in distance metric between different features.

    Returns
    -------
    Bunch object with the perturbed tabular data, outlier labels and
    a dictionary used to map categories to their furthest neighbour.
    """
    if cat_perturb is None:
        # transform the categorical variables into numerical ones via
        # pairwise distances computed with abdm and multidim scaling
        X_fit = X.copy() if X_fit is None else X_fit

        # find number of categories for each categorical variable
        cat_vars = {k: len(np.unique(X_fit[:, k])) for k in cols}

        # TODO: extend method for OHE
        ohe = False
        if ohe:
            X_ord, cat_vars_ord = ohe2ord(X, cat_vars)
        else:
            X_ord, cat_vars_ord = X, cat_vars

        # bin numerical features to compute the pairwise distance matrices
        n_ord = X_ord.shape[1]
        if len(cols) != n_ord:
            fnames = [str(_) for _ in range(n_ord)]
            disc = Discretizer(X_ord, cols, fnames, percentiles=disc_perc)
            X_bin = disc.discretize(X_ord)
            cat_vars_bin = {
                k: len(disc.names[k])
                for k in range(n_ord) if k not in cols
            }
        else:
            X_bin = X_ord
            cat_vars_bin = {}

        # pairwise distances for categorical variables
        d_pair = abdm(X_bin, cat_vars_ord, cat_vars_bin)

        # multidim scaling
        feature_range = (np.ones((1, n_ord)) * -1e10,
                         np.ones((1, n_ord)) * 1e10)
        d_abs = multidim_scaling(d_pair,
                                 n_components=2,
                                 use_metric=True,
                                 standardize_cat_vars=True,
                                 smooth=smooth,
                                 feature_range=feature_range,
                                 update_feature_range=False)[0]

        # find furthest category away for each category in the categorical variables
        cat_perturb = {k: np.zeros(len(v)) for k, v in d_abs.items()}
        for k, v in d_abs.items():
            for i in range(len(v)):
                cat_perturb[k][i] = np.argmax(np.abs(v[i] - v))
    else:
        d_abs = None

    n_dim = len(X.shape)
    if n_dim == 1:
        X = X.reshape(-1, 1)
    n_samples, n_features = X.shape
    X_outlier = X.astype(np.float32).copy()
    if y is None:
        is_outlier = np.zeros(n_samples)
    else:
        is_outlier = y
    n_cols = len(cols)

    # distribute outliers evenly over different columns
    n_outlier = int(n_samples * perc_outlier * .01 / n_cols)
    for col in cols:
        outlier_idx = np.sort(random.sample(range(n_samples), n_outlier))
        col_cat = X_outlier[outlier_idx, col].astype(int)
        col_map = np.tile(cat_perturb[col], (n_outlier, 1))
        X_outlier[outlier_idx, col] = np.diag(col_map.T[col_cat])
        is_outlier[outlier_idx] = 1
    if n_dim == 1:
        X_outlier = X_outlier.reshape(n_samples, )
    return Bunch(data=X_outlier,
                 target=is_outlier,
                 cat_perturb=cat_perturb,
                 d_abs=d_abs,
                 target_names=['normal', 'outlier'])
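
A usage sketch, under the assumption that the library helpers referenced above behave as described; columns 0 and 1 hold ordinal-encoded categories and the rest are numerical (shapes and category counts are illustrative):

X_cat = np.random.randint(0, 4, size=(500, 2)).astype(np.float32)
X_num = np.random.randn(500, 2)
X = np.concatenate([X_cat, X_num], axis=1)
bunch = inject_outlier_categorical(X, cols=[0, 1], perc_outlier=10)
X_perturbed, y_outlier = bunch.data, bunch.target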