示例#1
0
 def outliers_detect(self, columns, outliers_fraction=0.05):
     """Label every row of ``self.data`` as inlier (0) or outlier (1) with CBLOF.

     The selected columns are passed through ``pd.get_dummies``, a CBLOF
     detector is fitted on the result, and the predicted labels are stored
     in a new ``'outlier'`` column of ``self.data``.

     Parameters
     ----------
     columns: column label(s) of ``self.data`` to use as features
     outliers_fraction: contamination value handed to CBLOF (default 0.05)

     Returns
     -------
     None; the result is written to ``self.data['outlier']``.
     """
     X = pd.get_dummies(self.data[columns])
     clf = CBLOF(contamination=outliers_fraction,
                 check_estimator=False,
                 random_state=0)
     clf.fit(X)
     # Only the labels are kept; the raw scores and the inlier/outlier counts
     # were computed but never used, so those passes are dropped.
     self.data['outlier'] = clf.predict(X).tolist()
def getOutlierCBLOF(dataset):
    '''
    @brief Fit a default-configured CBLOF detector on the dataset and report
    which instances it labels as inliers (0) and which as outliers (1)
    @param dataset Dataset the detector is fitted on
    @return The fitted detector's labels_: 0 means inlier, 1 means outlier
    '''
    # fit() hands back the fitted detector, so the labels can be read
    # directly off the chained call.
    return CBLOF().fit(dataset).labels_
示例#3
0
    def cblof(self, X_train, contamination=None, random_state=None):
        """
        Train CBLOF model from PYOD and score the training data

        Parameters
        __________
        X_train: scaled training data
        contamination: expected proportion of anomalies in the data; when
            None, CBLOF's own default is used
        random_state: random number seed

        Returns
        ________
        Tuple (scores, labels): the anomaly scores of X_train after being
        passed through self.min_max_scaler, and the outlier labels (0 or 1)
        predicted for X_train
        """
        # CBLOF cannot derive a threshold from contamination=None, so the
        # argument is only forwarded when the caller actually supplied one.
        detector_kwargs = {'random_state': random_state}
        if contamination is not None:
            detector_kwargs['contamination'] = contamination
        model = CBLOF(**detector_kwargs)
        model.fit(X_train)

        # Outlier labels (0 or 1) for the training data
        labels = model.predict(X_train)

        # Raw outlier scores, rescaled by the class's own scaler
        cblof_anomaly_scores = model.decision_function(X_train)
        cblof_anomaly_scores = self.min_max_scaler(cblof_anomaly_scores)
        return cblof_anomaly_scores, labels
示例#4
0
    def get_CBOLF_scores(dataframe, cols, outliers_fraction=0.01):
        '''Flag outliers in the selected columns with CBLOF.

        The selected columns are min-max scaled to [0, 1] in place on
        ``dataframe``, a CBLOF detector is fitted on them, and the predicted
        0/1 labels are written to a new ``'outlier'`` column. The labelled
        frame (the same object as ``dataframe``) is stored on
        ``CheckOutliers.df1`` and a one-line summary is printed.

        Params:
            dataframe: frame holding the data; modified in place
            cols: list of selected column names used as features
            outliers_fraction: contamination passed to CBLOF (default 0.01)

        Returns:
            None; read the result from ``CheckOutliers.df1``
        '''
        # Scale the selected variables to [0, 1]
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

        # Stack the selected columns side by side into one 2-D feature array
        X = np.concatenate(
            [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

        clf = CBLOF(contamination=outliers_fraction,
                    check_estimator=False,
                    random_state=0)
        clf.fit(X)

        # Only the labels are needed; the raw-score pass was never used and
        # has been dropped.
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        # df1 aliases the caller's frame, so the new column shows up there too
        CheckOutliers.df1 = dataframe
        CheckOutliers.df1['outlier'] = y_pred.tolist()

        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
              'found with CBLOF')
示例#5
0
    # Demo: fit CBLOF on generated data, then score and evaluate both the
    # training and the test split.
    contamination = 0.1  # fraction of outliers (0.1 = 10%)
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    # NOTE(review): the unpacking order must match the order in which the
    # installed generate_data returns its four arrays -- confirm.
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train CBLOF detector
    clf_name = 'CBLOF'
    clf = CBLOF(random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results (scores are compared with the true labels)
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
示例#6
0
# Pull the validation labels out of X_valid1, then drop that column from the
# features. (The labels are the same for every feature set.)
Y_valid1 = X_valid1['Label_<lambda>']
X_valid1.drop(['Label_<lambda>'], inplace=True, axis=1)

# Read the original test data so the malicious flow records can be extracted after prediction
orig_test_data = pd.read_csv("test_data.csv", header=None)
orig_test_data.columns = ['Date_Flow_Start', 'Duration','Protocol','Src_IP','Src_Port','Direction','Dst_IP','Dst_Port','State','Source_Service','Dest_Service','Total_Packets','BiDirection_Bytes','SrcToDst_Bytes']

""" Training on Feature Set 1

CBLOF on Default Parameters
"""

clf1 = CBLOF(random_state=42) # Default contamination 0.1
clf1.fit(X_train1)

# Derive a score threshold from the contamination parameter: sort the training
# scores from highest to lowest, keep the top `a` of them (a = contamination
# share of the training set) and use the smallest kept score as the cut-off.
# NOTE(review): if `a` rounds to 0 the slice is empty and anomalies[-1] raises
# IndexError.
dec_scores = clf1.decision_scores_
dec_scores_sorted=sorted(dec_scores, reverse=True)
a = round(len(X_train1) * clf1.contamination)
print(a)

anomalies=dec_scores_sorted[:a]
threshold = anomalies[-1]
print(threshold)

# Score the validation data with the fitted detector and wrap the scores in a Series
y_valid_scores = clf1.decision_function(X_valid1)
y_valid_scores = pd.Series(y_valid_scores)
示例#7
0
class TestCBLOF(unittest.TestCase):
    """Exercise the public surface of a CBLOF detector fitted on generated data."""

    # Attributes test_parameters expects to find, non-None, on the fitted
    # detector.
    _FITTED_ATTRS = (
        'decision_scores_',
        'labels_',
        'threshold_',
        '_mu',
        '_sigma',
        'clustering_estimator_',
        'cluster_labels_',
        'cluster_sizes_',
        'cluster_centers_',
        '_clustering_threshold',
        'small_cluster_labels_',
        'large_cluster_labels_',
        '_large_cluster_centers',
    )

    def setUp(self):
        """Generate the data and fit a fresh detector before every test."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        # NOTE(review): the unpacking order must match the order in which the
        # installed generate_data returns its four arrays -- confirm.
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        for attr in self._FITTED_ATTRS:
            assert getattr(self.clf, attr, None) is not None, attr

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # ranking quality must reach the configured floor
        auc = roc_auc_score(self.y_test, scores)
        assert (auc >= self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # an unknown method name must be rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        labels, confidence = self.clf.predict(self.X_test,
                                              return_confidence=True)
        assert_equal(labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        proba, confidence = self.clf.predict_proba(self.X_test,
                                                   method='linear',
                                                   return_confidence=True)
        self._assert_within_unit_interval(proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # default scoring first, then each explicitly named metric
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ranks must keep the ordering of the scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ranks must keep the ordering of the scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        # passes as long as cloning does not raise
        clone(self.clf)

    def tearDown(self):
        pass
示例#8
0
class TestLOF(unittest.TestCase):
    """Tests for a fitted detector; despite the class name, setUp builds a CBLOF."""

    # Attributes test_parameters expects to find, non-None, on the fitted
    # detector.
    _FITTED_ATTRS = (
        'decision_scores_',
        'labels_',
        'threshold_',
        '_mu',
        '_sigma',
        'clustering_estimator_',
        'cluster_labels_',
        'cluster_sizes_',
        'cluster_centers_',
        '_clustering_threshold',
        'small_cluster_labels_',
        'large_cluster_labels_',
        '_large_cluster_centers',
    )

    def setUp(self):
        """Generate the data and fit a fresh detector before every test."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        # NOTE(review): the unpacking order must match the order in which the
        # installed generate_data returns its four arrays -- confirm.
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, values):
        """Assert that every entry of ``values`` lies in [0, 1]."""
        assert_greater_equal(values.min(), 0)
        assert_less_equal(values.max(), 1)

    def test_sklearn_estimator(self):
        # TODO: sklearn examples are too small to form valid
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        for attr in self._FITTED_ATTRS:
            assert_true(getattr(self.clf, attr, None) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)
        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])
        # ranking quality must exceed the configured floor
        auc = roc_auc_score(self.y_test, scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # an unknown method name must be rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # default scoring first, then each explicitly named metric
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ranks must keep the ordering of the scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ranks must keep the ordering of the scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
# Scale Sales and Profit to [0, 1] in place on df, then flag outliers with CBLOF.
minmax = MinMaxScaler(feature_range=(0, 1))
df[['Sales', 'Profit']] = minmax.fit_transform(df[['Sales', 'Profit']])
print(df[['Sales', 'Profit']].head())

# One column vector per feature ...
X1 = df['Sales'].values.reshape(-1, 1)
X2 = df['Profit'].values.reshape(-1, 1)

# ... joined side by side into a two-column feature array
X = np.concatenate((X1, X2), axis=1)

outliers_fraction = 0.01
# 100 x 100 grid over the unit square (presumably for plotting -- not used in this fragment)
xx, yy = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
clf = CBLOF(contamination=outliers_fraction,
            check_estimator=False,
            random_state=0)
clf.fit(X)
# predict raw anomaly score (sign flipped)
scores_pred = clf.decision_function(X) * -1

# predict each data point's category: outlier or inlier
y_pred = clf.predict(X)
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)

plt.figure(figsize=(8, 8))

# df1 is another name for df (not a copy), so the 'outlier' column is added to df as well
df1 = df
df1['outlier'] = y_pred.tolist()

# Sales values of the rows labelled as inliers, as a column vector
inliers_sales = np.array(df1['Sales'][df1['outlier'] == 0]).reshape(-1, 1)
示例#10
0
    def get_outliers(dataframe,
                     cols,
                     outliers_fraction,
                     row_id,
                     n,
                     cbolf=True,
                     hbos=True,
                     iforest=True,
                     knn=True):
        '''Return index labels of rows flagged as outliers by at least n detectors.

        The selected columns are min-max scaled to [0, 1] in place on
        ``dataframe``. Each enabled detector is fitted on them and its 0/1
        labels are written to the ``'outlier'`` column of ``dataframe``
        (overwritten by every detector, so the column ends up holding the
        labels of the last one that ran). The rows each detector flags are
        kept on ``Outliers.df1`` .. ``Outliers.df4`` (CBLOF, HBOS, IForest,
        KNN); the attribute of a disabled detector is left untouched. Only
        the detectors that actually ran in this call take part in the vote.

        Params:
            dataframe: frame holding the data; modified in place
            cols: list of numerical column names used as features
            outliers_fraction: contamination passed to every detector
            row_id ('str'): unique row identifier on the dataframe
            n(int): Minimum number of times an observation should be flagged
                    as an outlier to be considered one
            cbolf, hbos, iforest, knn (bool): switch the CBLOF, HBOS,
                    Isolation Forest and KNN detectors on or off

        Returns:
            List of index labels for rows in the dataframe that are flagged
            as outliers (also stored on ``Outliers.outlier_indices``); empty
            when no detector is enabled
        '''
        # Scale the selected numerical variables to [0, 1]
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

        # Stack the selected columns side by side into one 2-D feature array
        X = np.concatenate(
            [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

        def flag_with(clf, label):
            '''Fit clf on X, print its tally and return the rows it flags.'''
            clf.fit(X)

            # Category of every data point: 1 for outlier, 0 for inlier
            y_pred = clf.predict(X)
            n_inliers = len(y_pred) - np.count_nonzero(y_pred)
            n_outliers = np.count_nonzero(y_pred == 1)

            # Record the labels on the frame and keep only the flagged rows
            dataframe['outlier'] = y_pred.tolist()
            flagged = dataframe.loc[dataframe['outlier'] == 1]
            print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
                  f'found with {label}')
            return flagged

        # Frames produced in THIS call; merging only these avoids reading a
        # missing or stale Outliers.dfN when a detector is switched off.
        flagged_frames = []

        if cbolf:
            # Cluster-Based Local Outlier Factor
            Outliers.df1 = flag_with(
                CBLOF(contamination=outliers_fraction,
                      check_estimator=False,
                      random_state=0), 'CBLOF')
            flagged_frames.append(Outliers.df1)

        if hbos:
            # Histogram Based Outlier Score
            Outliers.df2 = flag_with(
                HBOS(contamination=outliers_fraction), 'HBOS')
            flagged_frames.append(Outliers.df2)

        if iforest:
            # Isolation Forest
            Outliers.df3 = flag_with(
                IForest(contamination=outliers_fraction, random_state=0),
                'IForest')
            flagged_frames.append(Outliers.df3)

        if knn:
            # K-Nearest Neighbors
            Outliers.df4 = flag_with(
                KNN(contamination=outliers_fraction), 'KNN')
            flagged_frames.append(Outliers.df4)

        if flagged_frames:
            merged_df = pd.concat(flagged_frames)

            # Number of detectors that flagged each observation
            merged_df['count'] = merged_df.groupby(row_id)[row_id].transform(
                'count')

            # Keep observations flagged at least n times, one row each
            common = merged_df.loc[merged_df['count'] >= n]
            common = common.drop_duplicates(keep='last')

            # Index labels to be removed from the main dataframe
            Outliers.outlier_indices = list(common.index)
        else:
            # pd.concat rejects an empty list; no detector means no outliers
            Outliers.outlier_indices = []

        print(
            f' \n{len(Outliers.outlier_indices)} outliers commonly found by all algorithms\n'
        )
        print(f'The row index labels are:\n {Outliers.outlier_indices}')
        return Outliers.outlier_indices
# In[14]:

# Plot 'Speed diff' against 'Time' for the rows whose CarId is '118457'
data118457 = df[(df['CarId'] == '118457')]
x = data118457['Time']
y = data118457['Speed diff']

plt.figure(figsize=(10, 4))
plt.plot(x, y, label='Car 118457')
plt.xlabel('Time')
plt.ylabel('Speed diff')
plt.show()

# In[15]:

# Fit CBLOF on the 'Speed diff' column of the whole df (not only car 118457)
cblof = CBLOF()
cblof.fit(df['Speed diff'].values.reshape(-1, 1))
# Evenly spaced values from the smallest to the largest 'Speed diff',
# len(df) of them, as a column vector
xx = np.linspace(df['Speed diff'].min(), df['Speed diff'].max(),
                 len(df)).reshape(-1, 1)
# Score and label every grid value; only the scores are plotted below
anomaly_score = cblof.decision_function(xx)
outlier = cblof.predict(xx)
plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.ylabel('anomaly score')
plt.xlabel('Speed diff')
plt.show()

# In[16]:

minmax = MinMaxScaler(feature_range=(0, 1))
df[['CarId', 'Speed diff', 'Heading diff',
    'Position diff']] = minmax.fit_transform(
示例#12
0
    # Demo: fit a default CBLOF on generated data, then score and evaluate
    # both the training and the test split.
    contamination = 0.1  # fraction of outliers (0.1 = 10%)
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    # NOTE(review): the unpacking order must match the order in which the
    # installed generate_data returns its four arrays -- confirm.
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train CBLOF detector
    clf_name = 'CBLOF'
    clf = CBLOF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results (scores are compared with the true labels)
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
from pyod.models.lscp import LSCP
# from pyod.models.auto_encoder import AutoEncoder

# One detector of each kind, all with their default settings
clf_knn = KNN()
clf_pca = PCA()
clf_mcd = MCD()
clf_lof = LOF()
clf_cblof = CBLOF()
# clf_lscp = LSCP([clf_knn, clf_pca, clf_mcd ])
# clf_ae = AutoEncoder(epochs=50)

# Fit every detector on the same training encodings
clf_mcd.fit(encodings_train)
clf_pca.fit(encodings_train)
clf_knn.fit(encodings_train)
clf_lof.fit(encodings_train)
clf_cblof.fit(encodings_train)
# clf_lscp.fit(encodings_train)
# clf_ae.fit(encodings_train)

# Score the training encodings themselves with each fitted detector
anomaly_scores_mcd = clf_mcd.decision_function(encodings_train)
anomaly_scores_pca = clf_pca.decision_function(encodings_train)
anomaly_scores_knn = clf_knn.decision_function(encodings_train)
anomaly_scores_lof = clf_lof.decision_function(encodings_train)
anomaly_scores_cblof = clf_cblof.decision_function(encodings_train)
# anomaly_scores_lscp = clf_lscp.decision_function(encodings_train)
# anomaly_scores_ae = clf_ae.predict_proba(encodings_train)

# y_test_scores = []
# for x,_ in test_loader:
#     encodings_test = encoder(torch.Tensor(x).to(device))
#     probs = clf.predict_proba(encodings_test.detach().cpu().numpy())
示例#14
0
    return mapped

# Data preprocessing


insurance["smoker"] = map_smoking(insurance["smoker"])  # map to discrete numeric values
insurance["sex"] = map_smoking(insurance["sex"])        # map to discrete numeric values
# Drop the region column. The axis is passed by keyword because pandas 2.0
# no longer accepts it positionally.
insurance = insurance.drop('region', axis=1)
insurance["charges"] = standard(insurance["charges"])   # normalise
insurance["age"] = standard(insurance["age"])
insurance["bmi"] = standard(insurance["bmi"])
# NOTE(review): "age" is passed through standard() a second time here while
# "sex" never is -- confirm whether this line was meant for another column.
insurance["age"] = standard(insurance["age"])
insurance["children"] = standard(insurance["children"])
insurance["smoker"] = standard(insurance["smoker"])

print(insurance)
# train CBLOF detector
clf_name = 'CBLOF'
clf = CBLOF()
clf.fit(insurance)
y_train_scores = clf.decision_scores_  # raw outlier scores
print(y_train_scores)
# argsort is ascending, so the first entries are the rows with the LOWEST scores.
# NOTE(review): if the intent is to show the strongest outliers, the order
# needs reversing -- confirm.
index = y_train_scores.argsort()
print(index)
# Look up (at most) the first five of those rows in the untouched original data
show = [insuranceori.iloc[row] for row in index[:5]]

print(show)