Exemplo n.º 1
0
    def __load_classifiers(self):
        """Build the dictionary of pyod outlier detectors to compare.

        Returns:
            dict: human-readable detector name -> unfitted pyod estimator,
            all configured with the same 5% contamination fraction.
        """
        # Expected proportion of outliers; pyod uses it to set the
        # decision threshold of every detector.
        outliers_fraction = 0.05
        # Fixed seed so stochastic detectors are reproducible across runs.
        random_state = np.random.RandomState(0)

        classifiers = {
            'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
            'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
            'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
            'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
            'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
            'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
            'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
            'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        }

        return classifiers
Exemplo n.º 2
0
def train():
    """Train a SUOD ensemble of LOF/PCA/KNN detectors on synthetic data,
    evaluate it with ROC-AUC, and log the metric and model to MLflow.
    """
    dataset = get_data(1000, 10, 100)
    contamination = 0.01
    with mlflow.start_run():
        # Heterogeneous pool: LOF/KNN over several neighbourhood sizes
        # plus a PCA-based detector, all sharing one contamination level.
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True,
                     bps_flag=True,
                     approx_flag_global=False,
                     contamination=contamination)
        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        # assumes get_data(1000, 10, 100) yields 1000 inliers followed by
        # 10 outliers — TODO confirm against get_data's contract.
        true_labels = [0] * 1000 + [1] * 10
        # BUG FIX: roc_auc_score signature is (y_true, y_score); the
        # ground-truth labels were previously passed as the scores.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
Exemplo n.º 3
0
    def setUp(self):
        """Load the cardio benchmark (or synthesize data) and fit a SUOD ensemble."""
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            # NOTE(review): TypeError is handled identically to IOError here,
            # presumably for older scipy/loadmat behavior — confirm still needed.
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            # File loaded: pull matrices and validate them together.
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        # SUOD over a small mixed pool; roc_floor is the minimum AUC the
        # accompanying tests accept.
        self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
        self.clf = SUOD(base_estimators=self.base_estimators)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.7
Exemplo n.º 4
0
def load_classifiers(outliers_fraction):
    """Build ten pyod detectors sharing one contamination fraction.

    Args:
        outliers_fraction: expected proportion of outliers; capped at 0.5
            (pyod's maximum supported contamination).

    Returns:
        dict: display name -> unfitted pyod detector.
    """
    outliers_fraction = min(0.5, outliers_fraction)
    # Fixed seed so the stochastic detectors are reproducible.
    random_state = np.random.RandomState(42)
    # Define nine outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                behaviour="new"),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
 def setUp(self):
     """Generate standardized synthetic data and fit an LSCP of two LOFs."""
     self.n_train = 200
     self.n_test = 100
     self.contamination = 0.1
     # Minimum ROC-AUC the accompanying tests will accept.
     self.roc_floor = 0.8
     self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
         n_train=self.n_train, n_test=self.n_test,
         contamination=self.contamination, random_state=42)
     # Z-score both splits with statistics learned from the training set.
     self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
     self.detector_list = [LOF(), LOF()]
     self.clf = LSCP(self.detector_list, contamination=self.contamination)
     self.clf.fit(self.X_train)
Exemplo n.º 6
0
def lof_pyod_once(X_nor, X_test, y_test, n_neighbors, contamination=0.05):
    """Fit a pyod LOF on normal data and evaluate it on a labelled test set.

    Args:
        X_nor: DataFrame of normal (inlier) training observations.
        X_test: test feature matrix.
        y_test: ground-truth test labels (1 = outlier).
        n_neighbors: LOF neighbourhood size.
        contamination: expected outlier fraction used for the threshold.

    Returns:
        tuple: (true-positive rate, false-positive rate, ROC-AUC,
        NaN-sanitized raw anomaly scores).
    """
    detector = LOF(n_neighbors=n_neighbors, contamination=contamination)

    # Train on the normal observations only; the decision threshold is
    # fixed during fit.
    train_matrix = X_nor.astype(float).values.copy()
    detector.fit(train_matrix)

    predictions = detector.predict(X_test)
    # Raw anomaly scores, with any NaN replaced so AUC can be computed.
    raw_scores = detector.decision_function(X_test)
    raw_scores = np.nan_to_num(raw_scores, copy=True)

    # Confusion-matrix-derived rates.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    true_positive_rate = tp / (tp + fn)
    false_positive_rate = fp / (tn + fp)

    auc = roc_auc_score(y_test, raw_scores)

    return true_positive_rate, false_positive_rate, auc, raw_scores
Exemplo n.º 7
0
    def __call__(self):
        """Fit an LOF model on buggy + original CSV traffic data and predict.

        Raises:
            ModelError: if the fitted model labels every training row an
                outlier (no inliers found).
        """
        clf = LOF(contamination=0.1)
        # Seed the training matrix with the "buggy" ENTER/EXIT pair.
        buggy_enter_csv = self.get_file(
            join(self.data_buggy_dir, '*_ENTER.csv'))
        buggy_exit_csv = self.get_file(join(self.data_buggy_dir, '*_EXIT.csv'))
        data = self.get_data(buggy_enter_csv, buggy_exit_csv)

        # extend data with self.data_orig_dir
        for cur_dir, dirs, files in os.walk(self.data_orig_dir):
            for f_dir in dirs:
                enter_csv = self.get_file(join(cur_dir, f_dir, '*_ENTER.csv'))
                exit_csv = self.get_file(join(cur_dir, f_dir, '*_EXIT.csv'))
                ext_data = self.get_data(enter_csv, exit_csv)
                logger.debug('shape of data: {}'.format(data.shape))
                logger.debug('shape of ext_data: {}'.format(ext_data.shape))
                # Stack the extra observations row-wise onto the matrix.
                data = np.concatenate((data, ext_data), axis=0)
                logger.debug('shape of data: {}'.format(data.shape))

        clf.fit(data)
        train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        unique, counts = np.unique(train_pred, return_counts=True)
        logger.debug('unique (train): {}'.format(unique))
        logger.debug('counts (train): {}'.format(counts))
        if 0 not in unique:
            raise ModelError('Model contains no inlier')

        # np.unique returns sorted labels, so counts[0] is the inlier count.
        inliers_size = counts[0]
        outliers_size = counts[1] if len(counts) > 1 else 0
        logger.debug('num of inliers: {}'.format(inliers_size))
        logger.debug('num of outliers: {}'.format(outliers_size))

        return self.predict(clf)
Exemplo n.º 8
0
def run_LOF_base_detector(data, k, metric='euclidean', p=2):
    """
    Function to fit and predict the LOF base detector on `data`.

    Input:
     - data: pd.DataFrame, to run LOF on
     - k: integer, parameter to indicate the amount of neighbours to include in relative density determination
     - metric: string, distance metric to use, default `euclidean`
     - p: int, default 2 (Euclidean for minkowski-style metrics); set according to distance metric

    Output:
     - clf of class pyod.models.lof.LOF with all its properties
    """

    # Split data in values and targets: some datasets have an ID column, others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)

    # Construct and fit classifier.
    # BUG FIX: the metric parameter was previously ignored — the call
    # hardcoded metric='euclidean' regardless of the argument.
    clf = LOF(n_neighbors=k, metric=metric, p=p)
    clf.fit(X)  # Fit only on features

    # Add ground truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']

    # Return the classifier for further processing
    return clf
Exemplo n.º 9
0
def define_classifiers(random_state, outliers_fraction):
    """Build ten unfitted pyod detectors for side-by-side comparison.

    Args:
        random_state: seed/RandomState forwarded to stochastic detectors.
        outliers_fraction: expected outlier proportion used as threshold.

    Returns:
        dict: display name -> unfitted pyod detector.
    """
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
Exemplo n.º 10
0
Arquivo: ml.py Projeto: pedrovhb/tcc
def train_model(station: Station) -> LSCP:
    """Fit an LSCP ensemble on a station's training observations.

    Loads the station's training rows from the Observation table, builds a
    feature matrix of [rms, peak_to_peak, kurtosis, crest] per observation,
    and fits an LSCP ensemble of 5 KNN + 5 LOF + 5 PCA detectors.
    """
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    # Only rows flagged as training data for this station are used.
    observations_select = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)

    # One feature row per observation; column order must match whatever
    # the prediction path uses.
    obs_data = []
    for observation in observations_select:
        obs_data.append([
            observation.rms, observation.peak_to_peak, observation.kurtosis,
            observation.crest
        ])

    log.info('Fitting LSCP model')
    lscp = LSCP([KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5, contamination=0.03)
    lscp.fit(X=obs_data)
    log.info(f'Trained model in {time.time() - t1}')
    return lscp
Exemplo n.º 11
0
def obj_func_LOF(params):
    """Objective function for Bayesian optimization of LOF hyperparameters.

    Args:
        params: sequence of [outlier_fraction, n_neighbors, algorithm,
            leaf_size] in that fixed order.

    Returns:
        The scalar objective value produced by objVal_f (lower/higher
        depending on that helper's convention).
    """
    ## objective function used in baseian optimization
    outlier_fraction = params[0]
    n_neighbors = params[1]
    algorithm = params[2]
    leaf_size = params[3]

    # load data set to function work space
    # NOTE(review): reads X_train.npy / Y_train.npy from the current
    # working directory on every evaluation — confirm files exist there.
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')

    # create model
    clf = LOF(n_neighbors=n_neighbors,
              algorithm=algorithm,
              leaf_size=leaf_size,
              contamination=outlier_fraction)
    # fit the dataset to the model
    clf.fit(X_train)

    # Negate so that larger values mean "more normal" for Rprecision_f.
    scores_pred = clf.decision_function(
        X_train) * -1  # predict raw anomaly score
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)

    y_pred = clf.predict(
        X_train)  # prediction of a datapoint category outlier or inlier
    objVal = objVal_f(Rprecision, y_pred, Y_train)

    return objVal
def calculate_LOF(given_DT, given_neighbors):
    """Fit LOF on the given data and return the raw training outlier scores.

    Args:
        given_DT: any structure pandas can turn into a DataFrame.
        given_neighbors: LOF neighbourhood size.

    Returns:
        ndarray of per-row outlier scores from the fitted detector.
    """
    feature_matrix = pd.DataFrame(given_DT).values
    detector = LOF(n_neighbors=given_neighbors)
    detector.fit(feature_matrix)
    # decision_scores_ holds the outlier score of every training row.
    return detector.decision_scores_
Exemplo n.º 13
0
def anomaly_detection(data, label):
    """Run three unsupervised detectors on the numeric columns of *data*
    and report each one's ROC-AUC against the *label* column.

    Args:
        data: DataFrame holding features plus the target column.
        label: name of the ground-truth column (1 = anomaly assumed).

    Returns:
        tuple: (y, iforest labels, autoencoder labels, LOF labels).
    """
    # Keep numeric columns only, then separate the target out of X.
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)

    # Z-score the features; all three detectors see the same scaled data.
    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)

    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y,
        ifo.decision_scores_,
        bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    # Symmetric bottleneck autoencoder; preprocessing=False because the
    # features were already standardized above.
    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y,
        ae.decision_scores_,
        bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # Too long to train, under-sample needed
    # n_neighbors scaled from the positive count — presumably to cover the
    # expected fraud neighbourhood; verify the 1.3 factor with the author.
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y,
        lof.decision_scores_,
        bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
Exemplo n.º 14
0
def construct_raw_base_estimators():
    """Assemble a diverse pool of unfitted pyod base detectors.

    Returns:
        list: KNN (largest and mean variants) and LOF over a spread of
        neighbourhood sizes, OCSVM over a range of nu, and isolation
        forests of increasing size — all at 5% contamination.
    """
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.cblof import CBLOF
    from pyod.models.hbos import HBOS
    from pyod.models.iforest import IForest
    from pyod.models.abod import ABOD
    from pyod.models.ocsvm import OCSVM

    estimators = []

    # Neighbourhood-based detectors over a predefined range of k.
    for k in [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
        estimators.extend([
            KNN(n_neighbors=k, method="largest", contamination=0.05),
            KNN(n_neighbors=k, method="mean", contamination=0.05),
            LOF(n_neighbors=k, contamination=0.05),
        ])

    # One-class SVMs across a predefined range of nu.
    for nu in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]:
        estimators.append(OCSVM(nu=nu, contamination=0.05))

    # Isolation forests with a growing number of trees.
    for n in [10, 20, 50, 70, 100, 150, 200, 250]:
        estimators.append(
            IForest(n_estimators=n, random_state=42, contamination=0.05))

    return estimators
Exemplo n.º 15
0
def train_model(X, Y, contamination, name, from_scratch=True):
    """Train (or load) a named pyod model and persist it under ./model.

    Args:
        X: feature matrix.
        Y: labels; only used by the supervised 'xgbod' model.
        contamination: expected outlier fraction for the detector.
        name: one of 'ocsvm', 'iforest', 'lof', 'knn', 'xgbod'.
        from_scratch: when False, load the previously saved model instead.

    Returns:
        The fitted (or loaded) model.

    Raises:
        ValueError: if *name* is not a recognized model key.
    """
    model_dir = './model'
    # exist_ok makes the directory creation race-free and idempotent.
    os.makedirs(model_dir, exist_ok=True)
    file_name = name + '.pkl'

    if not from_scratch:
        return load(model_dir, file_name)

    # Dispatch table instead of an if/elif chain.
    # BUG FIX: an unknown name previously fell through every branch and
    # crashed with NameError at save(); now it raises a clear ValueError.
    constructors = {
        'ocsvm': OCSVM,
        'iforest': IForest,
        'lof': LOF,
        'knn': KNN,
        'xgbod': XGBOD,
    }
    try:
        model = constructors[name](contamination=contamination)
    except KeyError:
        raise ValueError('unknown model name: {!r}'.format(name))

    # XGBOD is (semi-)supervised and needs the labels; the rest are
    # unsupervised and fit on X alone.
    if name == 'xgbod':
        model.fit(X, Y)
    else:
        model.fit(X)

    save(model, model_dir, file_name)
    return model
Exemplo n.º 16
0
def main():
    """Run the model-comparison benchmark over every scaler in parallel.

    Builds a catalogue of ~30 outlier detectors and fans runByScaler out
    across three scaling strategies with joblib.
    """

    # Scaling strategies to evaluate: raw, z-score, and min-max.
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    # Outer parallelism (one job per scaler) vs. inner per-model jobs.
    CPUS = 3
    CPUS_Models = 4
    # Subset of model keys handled through the sklearn-style code path
    # inside runByScaler — presumably; confirm against runByScaler.
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    # Catalogue of candidate detectors (pyod, sklearn, and custom ones).
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    # Tag used when persisting results.
    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Exemplo n.º 17
0
def outlier_detection(x_raw, y_raw):
    """
    Filter all ourlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidate list as follows
    # NOTE(review): this dict is built but unused below — only IForest is
    # applied; kept as documentation of the candidates that were tried.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
        XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    # BUG FIX: the index range was hard-coded to 1212 rows; use the actual
    # prediction length so any dataset size works.
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
Exemplo n.º 18
0
    def setUp(self):
        """Generate synthetic train/test data and fit a plain LOF detector."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        # Minimum ROC-AUC the accompanying tests will accept.
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)
Exemplo n.º 19
0
    def setUp(self):
        """Generate a small synthetic dataset (no fixed seed) and fit LOF."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        # Lower AUC floor than the seeded variant: the data is random here.
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)
Exemplo n.º 20
0
def create_ensemble_LOF(ensemble_combinations, pca):
    """Enumerate aggregator configurations over triples of LOF detectors.

    For every combination method and every strictly increasing triple
    (a, b, c) of neighbourhood sizes in [3, pca], emit one unsupervised
    SimpleDetectorAggregator spec.

    Returns:
        list of configuration dicts; the count is printed as a side effect.
    """
    configurations = []
    upper = pca + 1
    for method in ensemble_combinations:
        for a in range(3, upper):
            for b in range(a + 1, upper):
                for c in range(b + 1, upper):
                    configurations.append({
                        "model": SimpleDetectorAggregator,
                        "supervised": False,
                        "parameters": {
                            "method": method,
                            "base_estimators": [
                                LOF(n_neighbors=a),
                                LOF(n_neighbors=b),
                                LOF(n_neighbors=c),
                            ],
                        }
                    })
    print(len(configurations))
    return configurations
Exemplo n.º 21
0
def ranger(parameter, classifier):
    """Instantiate the named detector with *parameter* as its swept value.

    Args:
        parameter: value plugged into the classifier-specific knob
            (n_clusters for CBLOF, n_bins for HBOS, n_neighbors otherwise).
        classifier: one of 'CBLOF', 'HBOS', 'KNN', 'LOF'.

    Returns:
        The selected unfitted pyod detector.

    NOTE(review): relies on module-level globals `outliers_fraction` and
    `random_state`, and builds all four detectors just to return one —
    confirm the extra constructions are acceptable.
    """
    __ = parameter
    classi__ = {
        'CBLOF': (CBLOF(contamination=outliers_fraction,
                        check_estimator=False,
                        random_state=random_state,
                        n_clusters=__)),
        'HBOS': (HBOS(contamination=outliers_fraction, n_bins=__)),
        'KNN': (KNN(contamination=outliers_fraction, n_neighbors=__)),
        'LOF': (LOF(n_neighbors=__, contamination=outliers_fraction))
    }
    return classi__[classifier]
def getOutlierLOF(dataset):
    '''
    @brief Function that executes LOF algorithm on the dataset and obtains the
    labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    detector = LOF()
    detector.fit(dataset)
    # labels_ is the binary decision for every training row (0/1),
    # thresholded by pyod's default contamination.
    return detector.labels_
Exemplo n.º 23
0
    def __init__(self, *,
                 hyperparams: Hyperparams, #
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        """Construct the D3M primitive and its underlying pyod LOF model.

        All LOF settings come straight from the hyperparams dict; the
        detector is created here but not fitted.
        """
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = LOF(contamination=hyperparams['contamination'],
                        n_neighbors=hyperparams['n_neighbors'],
                        algorithm=hyperparams['algorithm'],
                        leaf_size=hyperparams['leaf_size'],
                        metric=hyperparams['metric'],
                        p=hyperparams['p'],
                        metric_params=hyperparams['metric_params'],
                        )

        return
Exemplo n.º 24
0
    def setUp(self):
        """Generate synthetic data and fit a max-combination aggregator
        over KNN, LOF and OCSVM base detectors."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        # Minimum ROC-AUC the accompanying tests will accept.
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        detectors = [KNN(), LOF(), OCSVM()]

        # 'maximization' takes the max score across the base detectors.
        self.clf = SimpleDetectorAggregator(base_estimators=detectors,
                                            method='maximization',
                                            contamination=self.contamination)
        self.clf.fit(self.X_train)
Exemplo n.º 25
0
def get_model_lof(percentage_of_outliers=0.002, num_neighbors=2):
    """Create a LOF model.

    Args:
        percentage_of_outliers: percentage of fraud on data
        num_neighbors: number of neighbors for kneighbors queries

    Returns:
        model: LOF model (unfitted)
    """
    # Audit-log which module/function created the model.
    utils.save_log('{0} :: {1}'.format(
        get_model_lof.__module__,
        get_model_lof.__name__))

    # n_jobs comes from the project-wide config module.
    model = LOF(contamination=percentage_of_outliers,
                n_neighbors=num_neighbors,
                n_jobs=config.num_jobs)

    return model
def out_lier_score(df, target, num_var):
    """Score every row with seven pyod detectors and collect their votes.

    Args:
        df: input DataFrame.
        target: name of the label column (passed to the scaler, see note).
        num_var: list of numeric feature columns to scale and score.

    Returns:
        DataFrame with one 0/1 column per detector (1 = outlier).
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    # NOTE(review): MinMaxScaler.fit_transform ignores its y argument, so
    # df[target] has no effect here — confirm and drop upstream.
    df = scaler.fit_transform(df.loc[:, num_var], df[target])  #.to_numpy()
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05

    X = df
    df_out_score = []
    # Define seven outlier  tools detectionto be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       check_estimator=False,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction)
    }
    # CLEANUP: dropped the unused enumerate index and the decision_function
    # call whose result (scores_pred) was computed and never used.
    for clf_name, clf in classifiers.items():
        clf.fit(X)
        # Binary prediction for each datapoint: 1 = outlier, 0 = inlier.
        y_pred = clf.predict(X)
        df_out_score.append(y_pred.tolist())

    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
Exemplo n.º 27
0
    def setUp(self):
        """Generate synthetic data and build a SUOD model over a mixed pool
        of base detectors, with precomputed cost-forecast files."""
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        # Minimum ROC-AUC the accompanying tests will accept.
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)

        # LOF across several neighbourhood sizes, plus HBOS, PCA, and a
        # nested LSCP ensemble.
        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)
            ],
                 random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))

        # Pre-trained cost forecast models shipped next to this test file.
        self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                                   'bps_train.joblib')

        self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                    'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators,
                          n_jobs=2,
                          rp_flag_global=True,
                          bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                          verbose=True)
Exemplo n.º 28
0
def outlier_score(x_train, y_train, algorithm=None, method="unify"):
    """Compute per-instance outlier probabilities and labels, class by class.

    The detector is fitted separately on the members of each class in
    y_train, so every instance is scored relative to its own class.

    Args:
        x_train: sequence of feature vectors.
        y_train: integer class labels 0..n_classes-1.
        algorithm: unfitted pyod-style detector exposing fit /
            predict_proba / labels_. Defaults to a fresh LOF() per call.
            BUG FIX: the default used to be `algorithm=LOF()`, a single
            instance created at definition time and shared (and refitted)
            across every call — the mutable-default-argument pitfall.
        method: probability conversion method passed to predict_proba.

    Returns:
        tuple: (float array of outlier probabilities, int array of 0/1
        outlier labels), both aligned with x_train.
    """
    if algorithm is None:
        algorithm = LOF()
    number_of_instances = len(x_train)
    outlier_values = np.zeros(number_of_instances)
    outlier_labels = np.zeros(number_of_instances)
    number_of_classes = len(set(y_train))
    for i in range(number_of_classes):
        # Indices of the rows belonging to class i.
        indices_of_class_members = [
            j for j in range(number_of_instances) if y_train[j] == i
        ]
        class_i_x_values = np.array(
            [x_train[j] for j in indices_of_class_members])
        algorithm.fit(X=class_i_x_values)
        # Column 1 of predict_proba is the outlier probability.
        partial_values = algorithm.predict_proba(X=class_i_x_values,
                                                 method=method)[:, 1]
        partial_labels = algorithm.labels_
        # Scatter the per-class results back to their global positions.
        for t, index in enumerate(indices_of_class_members):
            outlier_values[index] = partial_values[t]
            outlier_labels[index] = partial_labels[t]
    return outlier_values, outlier_labels.astype('int')
Exemplo n.º 29
0
def create_tunable_ensemble(knn_neighbors, lof_neighbors, abod_neighbors):
    """Enumerate average-combination aggregator specs over the Cartesian
    product of KNN, LOF and ABOD neighbourhood sizes (plus a fixed OCSVM).

    Returns:
        list of configuration dicts for SimpleDetectorAggregator.
    """
    configurations = []
    for k_knn in knn_neighbors:
        for k_lof in lof_neighbors:
            for k_abod in abod_neighbors:
                spec = {
                    "model": SimpleDetectorAggregator,
                    "supervised": False,
                    "parameters": {
                        "method": "average",
                        "base_estimators": [
                            KNN(n_neighbors=k_knn),
                            LOF(n_neighbors=k_lof),
                            ABOD(n_neighbors=k_abod),
                            OCSVM()
                        ],
                    }
                }
                configurations.append(spec)

    return configurations
Exemplo n.º 30
0
def choose_model(model, nnet):
    """Return a fresh, unfitted PyOD detector selected by name.

    Args:
        model: key naming the detector ('AE', 'VAE', 'KNN', 'LOF', ...).
        nnet: layer-size list consumed by the neural detectors
            ('AE' uses it whole; 'VAE' splits it into encoder/decoder).

    Returns:
        The requested detector instance.

    Raises:
        KeyError: if *model* is not a known detector name (same as before).
    """
    # PERF FIX: the factories are callables so only the requested detector
    # is constructed; previously every detector — including the heavyweight
    # Keras-backed AutoEncoder and VAE — was instantiated on each call just
    # to return one of them.
    factories = {
        'AE': lambda: AutoEncoder(hidden_neurons=nnet,
                                  contamination=0.1,
                                  epochs=15),
        'VAE': lambda: VAE(encoder_neurons=nnet[:5],
                           decoder_neurons=nnet[4:],
                           contamination=0.1,
                           epochs=13),
        'ABOD': ABOD,
        'FeatureBagging': FeatureBagging,
        'HBOS': HBOS,
        'IForest': IForest,
        'KNN': KNN,
        'LOF': LOF,
        'OCSVM': OCSVM,
        'PCA': PCA,
        'SOS': SOS,
        'COF': COF,
        'CBLOF': CBLOF,
        'SOD': SOD,
        'LOCI': LOCI,
        'MCD': MCD
    }
    return factories[model]()