Пример #1
0
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.6
Пример #2
0
Файл: ml.py Проект: pedrovhb/tcc
def train_model(station: Station) -> LSCP:
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    observations_select = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)

    obs_data = []
    for observation in observations_select:
        obs_data.append([
            observation.rms, observation.peak_to_peak, observation.kurtosis,
            observation.crest
        ])

    log.info('Fitting LSCP model')
    lscp = LSCP([KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5, contamination=0.03)
    lscp.fit(X=obs_data)
    log.info(f'Trained model in {time.time() - t1}')
    return lscp
 def setUp(self):
     self.n_train = 200
     self.n_test = 100
     self.contamination = 0.1
     self.roc_floor = 0.8
     self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
         n_train=self.n_train, n_test=self.n_test,
         contamination=self.contamination, random_state=42)
     self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
     self.detector_list = [LOF(), LOF()]
     self.clf = LSCP(self.detector_list, contamination=self.contamination)
     self.clf.fit(self.X_train)
Пример #4
0
    def compute_ensemble_components(self, data_array):
        detector_list = []
        feature_index = np.array([i for i in range(data_array.shape[1])])
        for i in range(self.ensemble_size):
            # Randomly sample feature size
            feature_size = np.random.randint(self.dim_start, self.dim_end)
            # Randomly select features
            selected_features = np.random.choice(feature_index, feature_size)
            detector_list.append(kNN_LSCP(neighbor_size=self.neighbor, selected_features=selected_features))

        clf = LSCP(detector_list)
        clf.fit(data_array)
        score = clf.decision_scores_
        return [score, ]
Пример #5
0
def create_model(percentage_of_outliers=0.002):
    """Create a LSCP model.

    Args:
        percentage_of_outliers: percentage of fraud on data

    Returns:
        model: LSCP model
    """
    utils.save_log('{0} :: {1}'.format(
        create_model.__module__,
        create_model.__name__))

    bagging_model = \
        get_model_bagging(percentage_of_outliers=percentage_of_outliers)

    lof_model = \
        get_model_lof(percentage_of_outliers=percentage_of_outliers)

    cblof_model = \
        get_model_cblof(percentage_of_outliers=percentage_of_outliers)

    list_of_detectors = [bagging_model, lof_model, cblof_model]
    model = LSCP(detector_list=list_of_detectors,
                 contamination=percentage_of_outliers)

    return model
Пример #6
0
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """

        # Validate inputs X and y
        X = check_array(X)
        self._set_n_classes(y)
        self.detector_ = PyOD_LSCP(detector_list=self.base_estimators,
                                   local_region_size=self.local_region_size,
                                   local_max_features=self.local_max_features,
                                   n_bins=self.n_bins,
                                   random_state=self.random_state,
                                   contamination=self.contamination)
        self.detector_.fit(X)
        self.decision_scores_ = self.detector_.decision_scores_
        self._process_decision_scores()

        return self
Пример #7
0
def construct_lscp():
    from pyod.models.lscp import LSCP
    base_estimators = construct_raw_base_estimators()
    model = LSCP(base_estimators,
                 local_region_size=150,
                 contamination=0.05,
                 n_bins=10,
                 random_state=42)
    return model
Пример #8
0
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Пример #9
0
    def setUp(self):
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)

        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)
            ],
                 random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))

        self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                                   'bps_train.joblib')

        self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                    'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators,
                          n_jobs=2,
                          rp_flag_global=True,
                          bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                          verbose=True)
Пример #10
0
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(),
    'Principal Component Analysis (PCA)':
    PCA(random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
    LSCP(detector_list, random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

X_scale = pd.read_csv('after_reduce&scale.csv',
                      parse_dates=['Date'],
                      index_col='Date')

# Fit the models with the generated data and
Пример #11
0
    LOF(n_neighbors=15, contamination=contamination),
    LOF(n_neighbors=25, contamination=contamination),
    LOF(n_neighbors=35, contamination=contamination),
    LOF(n_neighbors=45, contamination=contamination),
    HBOS(contamination=contamination),
    PCA(contamination=contamination),
    OCSVM(contamination=contamination),
    KNN(n_neighbors=5, contamination=contamination),
    KNN(n_neighbors=15, contamination=contamination),
    KNN(n_neighbors=25, contamination=contamination),
    KNN(n_neighbors=35, contamination=contamination),
    KNN(n_neighbors=45, contamination=contamination),
    IForest(n_estimators=50, contamination=contamination),
    IForest(n_estimators=100, contamination=contamination),
    LSCP(detector_list=[
        LOF(contamination=contamination),
        LOF(contamination=contamination)
    ])
]

# number of the parallel jobs
n_jobs = 6
n_estimators = len(base_estimators)

# the algorithms that should be be using random projection
rp_clf_list = ['LOF', 'KNN', 'ABOD']
# the algorithms that should NOT use random projection
rp_ng_clf_list = ['IForest', 'PCA', 'HBOS']
# global flag for random projection
rp_flag_global = True
objective_dim = 6
rp_method = 'discrete'
Пример #12
0
    'MedKNN': KNN(method='median'),
    'PCA': PCA(),
    'COF': COF(),
    'LODA': LODA(),
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    #'MoGaal':MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
                 contamination=0.1), 'MO_GAAL'),
        # SO_GAAL pyod
        (SO_GAAL(stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
                 momentum=0.9,
                 contamination=0.1), 'SO_GAAL'),
        # OCKRA github
        (m_ockra.m_OCKRA(), 'OCKRA'),
        # VAR LMDD pyOD
        (LMDD(dis_measure='var', random_state=rs), 'VAR_LMDD'),
        # LOCI pyod
        (LSCP(detector_list,
              local_region_size=30,
              local_max_features=1.0,
              n_bins=10,
              random_state=None,
              contamination=0.1), 'LSCP')
    ]

    # Select the model location with i to run
    i = 8
    had_error = []
    # Initialize the class anomaly
    #for i in range(1,8):
    #    try:
    #        AnomalyTester(models[i][0],models[i][1], rootDir)
    #        AnomalyTester(models[i][0],models[i][1]+'_std', rootDir, StandardScaler())
    #        AnomalyTester(models[i][0],models[i][1]+'_mm', rootDir, MinMaxScaler())
    #    except:
    #        had_error.append(i)
Пример #14
0
def get_estimators_small(contamination=0.1):
    """Internal method to create a list of 600 base outlier detectors.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    Returns
    -------
    base_detectors : list
        A list of initialized random base outlier detectors.

    """
    base_estimators = [
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        OCSVM(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),

        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        OCSVM(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),

        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        OCSVM(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),

        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)]),
        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)]),
        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)]),
        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)]),
        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)])
    ]
    return base_estimators
Пример #15
0
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train lscp
    clf_name = 'LSCP'
    detector_list = [
        LOF(n_neighbors=15),
        LOF(n_neighbors=20),
        LOF(n_neighbors=25),
        LOF(n_neighbors=35)
    ]
    clf = LSCP(detector_list, random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Пример #16
0
class LSCP(BaseAggregator):
    """ Locally Selection Combination in Parallel Outlier Ensembles

    LSCP is an unsupervised parallel outlier detection ensemble which selects
    competent detectors in the local region of a test instance. This
    implementation uses an Average of Maximum strategy. First, a heterogeneous
    list of base detectors is fit to the training data and then generates a
    pseudo ground truth for each train instance is generated by
    taking the maximum outlier score.

    For each test instance:
    1) The local region is defined to be the set of nearest training points in
    randomly sampled feature subspaces which occur more frequently than
    a defined threshold over multiple iterations.

    2) Using the local region, a local pseudo ground truth is defined and the
    pearson correlation is calculated between each base detector's training
    outlier scores and the pseudo ground truth.

    3) A histogram is built out of pearson correlation scores; detectors in
    the largest bin are selected as competent base detectors for the given
    test instance.

    4) The average outlier score of the selected competent detectors is taken
    to be the final score.

    See :cite:`zhao2019lscp` for details.

    Parameters
    ----------
    base_estimators : list, length must be greater than 1
        Base unsupervised outlier detectors from PyOD. (Note: requires fit and
        decision_function methods)

    local_region_size : int, optional (default=30)
        Number of training points to consider in each iteration of the local
        region generation process (30 by default).

    local_max_features : float in (0.5, 1.), optional (default=1.0)
        Maximum proportion of number of features to consider when defining the
        local region (1.0 by default).

    n_bins : int, optional (default=10)
        Number of bins to use when selecting the local region

    random_state : RandomState, optional (default=None)
        A random number generator instance to define the state of the random
        permutations generator.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function (0.1 by default).

    pre_fitted: bool, optional (default=False)
        Whether the base estimators are trained. If True, `fit`
        process may be skipped.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, base_estimators, local_region_size=30,
                 local_max_features=1.0, n_bins=10,
                 random_state=None, contamination=0.1, pre_fitted=False):
        super(LSCP, self).__init__(base_estimators=base_estimators,
                                   pre_fitted=pre_fitted)

        if not (0. < contamination <= 0.5):
            raise ValueError("contamination must be in (0, 0.5], "
                             "got: %f" % contamination)
        self.contamination = contamination
        self.base_estimators = base_estimators
        self.local_region_size = local_region_size
        self.local_max_features = local_max_features
        self.n_bins = n_bins
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """

        # Validate inputs X and y
        X = check_array(X)
        self._set_n_classes(y)
        self.detector_ = PyOD_LSCP(detector_list=self.base_estimators,
                                   local_region_size=self.local_region_size,
                                   local_max_features=self.local_max_features,
                                   n_bins=self.n_bins,
                                   random_state=self.random_state,
                                   contamination=self.contamination)
        self.detector_.fit(X)
        self.decision_scores_ = self.detector_.decision_scores_
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly scores of X using the fitted detector.

        The anomaly score of an input sample is computed based on the fitted
        detector. For consistency, outliers are assigned with
        higher anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        return self.detector_.decision_function(X)

    def predict(self, X):
        """Predict if a particular sample is an outlier or not.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        return self._detector_predict(X)

    def predict_proba(self, X, proba_method='linear'):
        """Predict the probability of a sample being outlier. Two approaches
        are possible:

        1. simply use Min-max conversion to linearly transform the outlier
           scores into the range of [0,1]. The model must be
           fitted first.
        2. use unifying scores, see :cite:`kriegel2011interpreting`.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        proba_method : str, optional (default='linear')
            Probability conversion method. It must be one of
            'linear' or 'unify'.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. Return the outlier probability, ranging
            in [0,1].
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        return self._detector_predict_proba(X, proba_method)
Пример #17
0
                       contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    '(LOF) Local Outlier Factor ':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    '(MCD) Minimum Covariance Determinant ': MCD(
        contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    '(PCA) Principal Component Analysis ': PCA(
        contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    '(LSCP) Locally Selective Combination ': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}
st.subheader('SELECT AN ALGORITHM:')

classifier_name = st.selectbox('THE ALGORITHM',[*classifiers])

# Show all detectors
st.subheader(f'Model is: {classifier_name}')
st.write(f'Parameters are: {classifiers[classifier_name]}')

# Fit the models with the generated data and
Пример #18
0
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
Пример #19
0
class TestLSCP(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)
        self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert_true(
            hasattr(self.clf, 'decision_scores_')
            and self.clf.decision_scores_ is not None)
        assert_true(
            hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert_true(
            hasattr(self.clf, 'threshold_')
            and self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert_true(
            hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert_true(
            hasattr(self.clf, 'detector_list')
            and self.clf.detector_list is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Пример #20
0
class TestLSCP(unittest.TestCase):
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.6

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'detector_list')
                and self.clf.detector_list is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
# In[6]:

data195061 = df[(df['CarId'] == '195061')]
x = data195061['Time']
y = data195061['Speed diff']

plt.figure(figsize=(10, 4))
plt.plot(x, y, label='Car 195061')
plt.xlabel('Time')
plt.ylabel('Speed diff')
plt.show()

# In[7]:

lscp = LSCP(detector_list=[MCD(), MCD()])
lscp.fit(df['Speed diff'].values.reshape(-1, 1))
xx = np.linspace(df['Speed diff'].min(), df['Speed diff'].max(),
                 len(df)).reshape(-1, 1)
anomaly_score = lscp.decision_function(xx)
outlier = lscp.predict(xx)
plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.ylabel('anomaly score')
plt.xlabel('Speed diff')
plt.show()

# In[8]:

df.loc[df['Speed diff'] > 10]
Пример #22
0
        for line in lines:
            lineData = line.strip().split(' ')
            lineData = list(map(lambda x: float(x), lineData))
            dataMat.append(lineData)
    return (np.array(dataMat))


data = data_loadDataSet()

X_train, y_train, X_test, y_test = generate_data(n_train=50,
                                                 n_test=50,
                                                 contamination=0.1,
                                                 random_state=42)
X_train, X_test = standardizer(X_train, X_test)
detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15)]
clf = LSCP(detector_list)
clf.fit(X_train)
clf.fit(data)
y_train_scores = clf.decision_scores_

sort_factor = argsort(y_train_scores, kind='quicksort')
print(sort_factor)
sort_factors = sort_factor[::-1]
print(sort_factors)
np.savetxt(r'C:\Users\zz\Desktop\res\lscp\D1_2.txt',
           sort_factors,
           fmt='%f',
           delimiter=' ')

# count = 0
# num = 97
Пример #23
0
def make_mlo(hub, data, train):
    '''
    Create the Machine Learning Object used for this sequence
    '''
    return LSCP(contamination=0.001)
Пример #24
0
def identify_outliers(df,
                      features,
                      contamination=0.1,
                      algorithms=['Isolation Forest']):
    """Cleans the outliers.

    Outlier detection using LSCP: Locally selective combination in parallel outlier ensembles.
    https://arxiv.org/abs/1812.01528


    Parameters
    ----------
    features : list
        List of feature names.

    df : DataFrame
        The data to be examined.

    contamination : float in (0., 0.5)
        the proportion of outliers in the data set.

    algorithms: list
        list with at the names of least 2 algorithms to be used during LSCP. A list of supported algorithms:

        ['Isolation Forest', 'Cluster-based Local Outlier Factor', 'Minimum Covariance Determinant (MCD)',
                  'Principal Component Analysis (PCA)', 'Angle-based Outlier Detector (ABOD)',
                  'Histogram-base Outlier Detection (HBOS)', 'K Nearest Neighbors (KNN)', 'Local Outlier Factor (LOF)',
                  'Feature Bagging', 'One-class SVM (OCSVM)']



    Returns
    -------

    df_sorted : DataFrame
        Original data with 3 new columns: anomaly_score, probability and prediction. Sorted on descending anomaly score.

    df_styled: DataFrame
        Styled version of df_sorted for use in Jupyter Notebook (i.e. display(df_styled)).
    """

    df_numeric = df.select_dtypes(
        include=[np.number])  # keep only numeric type features
    X = np.asarray(df_numeric)

    classifiers = {
        'Isolation Forest': IForest,
        'Cluster-based Local Outlier Factor': CBLOF,
        'Minimum Covariance Determinant (MCD)': MCD,
        'Principal Component Analysis (PCA)': PCA,
        'Angle-based Outlier Detector (ABOD)': ABOD,
        'Histogram-base Outlier Detection (HBOS)': HBOS,
        'K Nearest Neighbors (KNN)': knn,
        'Local Outlier Factor (LOF)': LOF,
        'Feature Bagging': FeatureBagging,
        'One-class SVM (OCSVM)': OCSVM,
    }

    if len(algorithms) > 1:
        selected_classifiers = [classifiers[x]() for x in algorithms]
        clf = LSCP(selected_classifiers, contamination=contamination)
    else:
        clf = classifiers[algorithms[0]](contamination=contamination)

    clf.fit(X)
    y_pred = clf.predict(X)

    y_predict_proba = clf.predict_proba(X, method='unify')
    y_predict_proba = [item[1] for item in y_predict_proba]

    outlier_index, = np.where(y_pred == 1)

    anomaly_score = clf.decision_function(X)
    anomaly_score = pd.DataFrame(anomaly_score, columns=['anomaly_score'])

    y_predict_proba = pd.DataFrame(y_predict_proba, columns=['probability'])
    prediction = pd.DataFrame(y_pred, columns=['prediction'])

    df.columns = features
    df_with_anomaly_score = pd.concat(
        [df, anomaly_score, y_predict_proba, prediction], axis=1)

    df_sorted = df_with_anomaly_score.sort_values(by='anomaly_score',
                                                  ascending=False)
    cm = sns.diverging_palette(220, 10, sep=80, n=7, as_cmap=True)
    df_styled = df_sorted.style.background_gradient(cmap=cm, subset=['anomaly_score']) \
        .apply(lambda x: ['background: MistyRose' if x.name in outlier_index.tolist() else '' for i in x], axis=1,
               subset=df_sorted.columns[:-3])

    return df_sorted, df_styled
Пример #25
0
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
    PCA(contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
    LSCP(detector_list,
         contamination=outliers_fraction,
         random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

# Fit the models with the generated data and
# compare model performances
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
Пример #26
0
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      contamination=contamination,
                      random_state=42)
    X_train, X_test = standardizer(X_train, X_test)

    # train lscp
    clf_name = 'LSCP'
    detector_list = [LOF(), LOF()]
    clf = LSCP(detector_list, random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)