print(outliers_rows)
# we can display the tuples of row/column coordinates in the array:
print(list(zip(outliers_rows, outliers_columns)))
# The univariate approach can reveal quite a lot of potential outliers. However, it won't disclose an outlier that does not have an extreme value on any single variable but is instead characterized by an unusual combination of values in two or more variables. In such cases the values of the involved variables may not even be extreme, so the outlier can slip away unnoticed.
# In order to discover such cases, you can use a dimensionality reduction algorithm such as PCA and then check the absolute values of the components that are beyond three standard deviations.
# Scikit-learn offers a couple of classes that work straight out of the box and signal all suspect cases. The covariance.EllipticEnvelope class fits a robust distribution estimate of your data, pointing out outliers that might be contaminating your dataset because they are extreme points in the general distribution of the data. The svm.OneClassSVM class is a support vector machine algorithm that approximates the shape of your data and flags whether a new instance should be considered a novelty (it acts as a novelty detector by default, assuming there are no outliers in the training data); by modifying its parameters it can also work on datasets where outliers are present, providing an even more robust and reliable outlier detection system than EllipticEnvelope.

## EllipticEnvelope
# EllipticEnvelope tries to figure out the key parameters of your data's general distribution by assuming that your entire dataset is an expression of an underlying multivariate Gaussian distribution. It checks the distance of each observation with respect to a grand mean that takes all the variables in your dataset into account, and is therefore able to spot both univariate and multivariate outliers.
# The only parameter you have to take into account when using this class from the covariance module is contamination, which can take a value up to 0.5. Situations vary from dataset to dataset; as a starting figure, we suggest a value of 0.01-0.02, the share of observations that should fall beyond an absolute Z-score of 3 from the mean in a standardized normal distribution. For this reason, we deem the default value of 0.1 too high.
# create an artificial distribution made of blobs
from sklearn.datasets import make_blobs
blobs = 1 # The number of distributions (parameter centers) is related to the user-defined variable blobs, which is initially set to 1
blob = make_blobs(n_samples=100, n_features=2, centers=blobs, cluster_std=1.5, shuffle=True, random_state=5) # creates a certain number of distributions in a bidimensional space, for a total of 100 examples (the n_samples parameter)
# Robust Covariance Estimate
import numpy as np
from sklearn.covariance import EllipticEnvelope
robust_covariance_est = EllipticEnvelope(contamination=.1).fit(blob[0]) # running EllipticEnvelope with a contamination rate of 10 percent helps you find the most extreme values in the distribution. The model is first fit using the .fit() method of the EllipticEnvelope class.
detection = robust_covariance_est.predict(blob[0]) # the prediction is then obtained with the predict() method on the same data used for the fit
outliers = np.where(detection == -1) # results correspond to a vector of 1 and -1 values, with -1 marking the anomalous examples
inliers = np.where(detection == 1)
# the distinction between inliers and outliers is recorded in the variables outliers and inliers, which contain the indexes of the examples
# Draw the distribution and detected outliers
import matplotlib.pyplot as plt # just the distribution
plt.plot(blob[0][:,0], blob[0][:,1], 'x', markersize=10, color='black', alpha=0.8)
plt.show()
# The distribution and the outliers
a = plt.plot(blob[0][inliers,0],blob[0][inliers,1],'x',markersize=10,color='black',alpha=0.8,label='inliers')
b = plt.plot(blob[0][outliers,0],blob[0][outliers,1],'o',markersize=6,color='black',alpha=0.8,label='outliers')
plt.legend((a[0],b[0]),('inliers', 'outliers'), numpoints=1, loc='lower right')
plt.show() # in the case of a unique underlying multivariate distribution (when the variable blobs=1), the EllipticEnvelope algorithm has successfully located 10 percent of the observations on the fringe of the distribution and has consequently signaled all suspect outliers.
# A limitation of EllipticEnvelope appears when multiple distributions are present in the data, as if there were two or more natural clusters. Trying to fit a unique general distribution, the algorithm tends to locate the potential outliers on just the most remote cluster, ignoring other areas of the data that might be affected by outlying cases, a situation that can easily occur with real data.
# The Boston housing data provides a real-data example.
## Example #2
import numpy as np
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM":
    svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                    kernel="rbf",
                    gamma=0.1),
    "Robust covariance":
    EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest":
    IsolationForest(max_samples=n_samples,
                    contamination=outliers_fraction,
                    random_state=rng)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
## Example #3
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor


def OutLierDetection(df, feature1, feature2, outliers_fraction=.1):
    
    new_df = df.copy()
    rng = np.random.RandomState(42)

    # Example settings
    n_samples = new_df.shape[0]
#     outliers_fraction = 0.2 # ************************************** imp
    clusters_separation = [0]#, 1, 2]

    # define two outlier detection tools to be compared
    classifiers = {
        "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                         kernel="rbf", gamma=0.1),
        "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
        "Isolation Forest": IsolationForest(max_samples=n_samples,
                                            contamination=outliers_fraction,
                                            random_state=rng),
        "Local Outlier Factor": LocalOutlierFactor(
            n_neighbors=35,
            contamination=outliers_fraction)}

    
    xx, yy = np.meshgrid(np.linspace(new_df[feature1].min()-new_df[feature1].min()*10/100, 
                                     new_df[feature1].max()+new_df[feature1].max()*10/100, 50),
                         np.linspace(new_df[feature2].min()-new_df[feature2].min()*10/100,
                                     new_df[feature2].max()+new_df[feature2].max()*10/100, 50))


    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)
    ground_truth = np.ones(n_samples, dtype=int)
    ground_truth[-n_outliers:] = -1

    # Fit the problem with varying cluster separation
    for i, offset in enumerate(clusters_separation):
        np.random.seed(42)
        # Data generation

        X = new_df[[feature1,feature2]].values.tolist()

        # Fit the model
        plt.figure(figsize=(9, 7))
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            # fit the data and tag outliers
            if clf_name == "Local Outlier Factor":
                y_pred = clf.fit_predict(X)
                scores_pred = clf.negative_outlier_factor_
            else:
                clf.fit(X)
                scores_pred = clf.decision_function(X)
                y_pred = clf.predict(X)
            threshold = stats.scoreatpercentile(scores_pred,
                                                100 * outliers_fraction)
            n_errors = (y_pred != ground_truth).sum()
            
            unique, counts = np.unique(y_pred,return_counts=True)
            print(clf_name,dict(zip(unique, counts)))
            
            new_df[feature1+'_'+feature2+clf_name] = y_pred
#             print(clf_name,y_pred) 
            # plot the levels lines and the points
            if clf_name == "Local Outlier Factor":
                # decision_function is private for LOF
                Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            subplot = plt.subplot(2, 2, i + 1)
            subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                             cmap=plt.cm.Blues_r)
            a = subplot.contour(xx, yy, Z, levels=[threshold],
                                linewidths=2, colors='red')
            subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                             colors='orange')
            b = plt.scatter(new_df[feature1], new_df[feature2], c='white',
                     s=20, edgecolor='k')

            subplot.axis('tight')

            subplot.set_xlabel("%s" % (feature1))
 
            plt.ylabel(feature2)#, fontsize=18)
            plt.title("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))

        plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
#         plt.suptitle("Outlier detection")

    plt.show()
    return new_df
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance":
    EllipticEnvelope(support_fraction=1., contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
    EllipticEnvelope(contamination=0.261),
    "OCSVM":
    OneClassSVM(nu=0.261, gamma=0.05)
}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    plt.figure(1)
    clf.fit(X1)
## Example #5
    print "Output labels:",' '.join(outlabels)
    if len(outlabels) != len(targets):
        print "Warning, dataset is missing some targets"

    # perform PCA to help detect outliers
    pca = PCA(n_components=4, whiten=False)
    alldata = np.array(list(zip(*[v[1:] for v in cols])))
    alldata_pcs = pca.fit(alldata).transform(alldata)
    # Components
    print(pca.components_)
    # Percentage of variance explained by each component
    print('explained variance ratio (first %d components):' % len(pca.components_),
          pca.explained_variance_ratio_)
    print('total explained variance:', sum(pca.explained_variance_ratio_))

    outlier_classifier = EllipticEnvelope(contamination=.05)
    outlier_classifier.fit(alldata_pcs)
    inlier_classification = outlier_classifier.predict(alldata_pcs)
    #print "Outliers:",[i for (i,c) in enumerate(inlier_classification) if c<0]

    inputs = list(zip(*[v[1:] for v in cols if v[0] not in targets]))
    outputs = list(zip(*[v[1:] for v in cols if v[0] in targets]))
    # reject outliers
    inputs = [inputs[i] for (i, c) in enumerate(inlier_classification) if c > 0]
    outputs = [outputs[i] for (i, c) in enumerate(inlier_classification) if c > 0]
    print(len(inputs), "inliers")
    #select output 0 (adapt_cost)
    #output = np.array([v[0] for v in outputs])
    #select output 2 (subopt_score)

    for ind in range(len(outputs[0])):
## Example #6
import math

# mu = train_latent_features[zeros_idx].mean()
# variance = train_latent_features[zeros_idx].var()
# sigma = math.sqrt(variance)
# x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
# plt.plot(x, stats.norm.pdf(x, mu, sigma))
print(train_latent_features[ones_idx].mean(axis=1).shape)
plt.hist(train_latent_features[ones_idx][0])
plt.show()

exit()
# exit()
# model = svm.OneClassSVM(kernel="poly")
# oneclass_svm = IsolationForest(random_state=0)
model = EllipticEnvelope()
model.fit(train_latent_features)
oneclass_predictions = model.predict(train_latent_features)
masked_predictions = mask_preds_for_one_class(oneclass_predictions)
train_metrics = accuracy_fn(to_tensor(masked_predictions),
                            to_tensor(train_labels),
                            threshold=threshold)
train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
print('***** Train Metrics *****')
print(
    f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
    f"| UAR: {'%.5f' % train_metrics['train_uar']}| F1:{'%.5f' % train_metrics['train_f1']} "
    f"| Precision:{'%.5f' % train_metrics['train_precision']} "
    f"| Recall:{'%.5f' % train_metrics['train_recall']} | AUC:{'%.5f' % train_metrics['train_auc']}"
)
print('Train Confusion matrix - \n' +
## Example #7
#define x and y
x = data.drop('y', axis=1)
y = data['y']

# fig1 = plt.figure(figsize=(5,5))
# bad = plt.scatter(x['x1'][y==0],x['x2'][y==0],label = 'bad')
# good = plt.scatter(x['x1'][y==1],x['x2'][y==1],label='good')
# plt.title('raw data')
# plt.xlabel('x1')
# plt.ylabel('x2')
# plt.legend()
# plt.show()

# anomaly detection

ad_model = EllipticEnvelope(contamination=0.02)
ad_model.fit(x[y == 0])
y_predict_bad = ad_model.predict(x[y == 0])
# ad_model.fit(x[y==1])
# y_predict_good = ad_model.predict(x[y==1])
fig2 = plt.figure(figsize=(5, 5))
bad = plt.scatter(x['x1'][y == 0], x['x2'][y == 0], label='bad')
good = plt.scatter(x['x1'][y == 1], x['x2'][y == 1], label='good')
plt.scatter(x['x1'][y == 0][y_predict_bad == -1],
            x['x2'][y == 0][y_predict_bad == -1],
            marker='x',
            s=150)
# plt.scatter(x['x1'][y==1][y_predict_good==-1],x['x2'][y==1][y_predict_good==-1],marker='x',s=150)
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
## Example #8
ONECLASS = {
    "IsolationForest":
    IsolationForest(n_estimators=100,
                    max_samples='auto',
                    contamination=0.1,
                    max_features=1.0,
                    bootstrap=False,
                    n_jobs=1,
                    random_state=random_state,
                    verbose=0),
    "OneClassSVM":
    OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1),
    "EllipticEnvelope":
    EllipticEnvelope(store_precision=True,
                     assume_centered=False,
                     support_fraction=None,
                     contamination=0.1,
                     random_state=random_state)
}

BINARY = {
    "SVM":
    SVC(C=1.0,
        kernel='rbf',
        degree=3,
        gamma='auto',
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
## Example #9
def update_filtered_div_caller(radius, contents, selected_date1,
                               selected_date2, selected_duration,
                               selected_time, selected_option, selected_caller,
                               selected_receiver, ml_value, contamination):
    # Date,Time,Duration Filter
    global suspicious_users
    if contents is not None:
        content_type, content_string = contents.split(',')
        decoded = base64.b64decode(content_string)
        global df
        df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        preprocess_data(df)

    filtered_df = df[
        (df['Date'] >= pd.to_datetime(selected_date1))
        & (df['Date'] <= pd.to_datetime(selected_date2))
        & ((df['Duration'] >= selected_duration[0]) &
           (df['Duration'] <= selected_duration[1]))
        & ((df['Time'] < times[selected_time[1]]['label']) &
           (df['Time'] >= times[selected_time[0]]['label']))].reset_index(
               drop=True)

    def chk(lat1, lon1, radius):
        R = 6373.0
        global sel_lat
        global sel_lon
        lat2 = sel_lat
        lon2 = sel_lon

        dlon = lon2 - lon1
        dlat = lat2 - lat1

        a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))

        distance = R * c
        print(distance)
        if distance > radius:
            return False
        return True

    if radius != 0:
        towers_c = towers.copy()
        ver = towers_c.apply(lambda x: chk(x['lat'], x['lon'], radius), axis=1)
        towers_req = towers_c[ver]['TowerID'].unique()
        filtered_df = filtered_df[filtered_df['TowerID'].isin(towers_req)]

    if ml_value in [1, 2, 3, 4, 5, 6]:
        filtered_df = pd.merge(filtered_df,
                               towers[['lat', 'lon', 'TowerID', 'Suspicious']],
                               on='TowerID')
        filtered_df['Time_new'] = pd.to_datetime(filtered_df['Time'],
                                                 format='%H:%M:%S')
        filtered_df["Time_new"] = filtered_df["Time_new"].apply(
            lambda x: (x - x.replace(hour=0, minute=0, second=0, microsecond=0)
                       ).total_seconds())
        filtered_df["Suspicious users"] = filtered_df[[
            "Caller", "Receiver"
        ]].apply(lambda x: 1 if (x.Caller in suspicious_users or x.Receiver in
                                 suspicious_users) else 0,
                 axis=1)
        contamination /= 100
        if (ml_value == 3):
            iso = IsolationForest(contamination=contamination)
            mask = iso.fit_predict(filtered_df[[
                "Time_new", "Duration", "lat", "lon", 'Suspicious',
                'Suspicious users'
            ]]) == -1
            filtered_df = filtered_df[mask].drop(['lat', 'lon', 'Time_new'],
                                                 axis=1)
        elif (ml_value == 4):
            iso = EllipticEnvelope(contamination=contamination)
            mask = iso.fit_predict(filtered_df[[
                "Time_new", "Duration", "lat", "lon", 'Suspicious',
                'Suspicious users'
            ]]) == -1
            filtered_df = filtered_df[mask].drop(['lat', 'lon', 'Time_new'],
                                                 axis=1)
        elif (ml_value == 5):
            iso = LocalOutlierFactor(contamination=contamination)
            mask = iso.fit_predict(filtered_df[[
                "Time_new", "Duration", "lat", "lon", 'Suspicious',
                'Suspicious users'
            ]]) == -1
            filtered_df = filtered_df[mask].drop(['lat', 'lon', 'Time_new'],
                                                 axis=1)
        elif (ml_value == 1):
            filtered_df = filtered_df[filtered_df["Suspicious"] == 1]
            filtered_df = filtered_df.drop(['lat', 'lon', 'Time_new'], axis=1)
        elif (ml_value == 2):
            filtered_df = filtered_df[filtered_df["Suspicious users"] == 1]
            filtered_df = filtered_df.drop(['lat', 'lon', 'Time_new'], axis=1)
    # Number Filter
    # If Caller is Selected
    if (selected_option == 1):
        if selected_caller != 'None':
            filtered_df = filtered_df[(filtered_df['Caller'].isin(
                list(selected_caller)))].reset_index(drop=True)

    # If Receiver is selected
    if (selected_option == 2):
        if selected_receiver != 'None':
            filtered_df = filtered_df[(filtered_df['Receiver'].isin(
                (selected_receiver)))].reset_index(drop=True)

    # If the option either is selected
    if (selected_option == 3):
        if selected_caller != 'None' or selected_receiver != 'None':
            filtered_df = filtered_df[((filtered_df['Caller'].isin(
                list(selected_caller))) | (filtered_df['Receiver'].isin(
                    list(selected_receiver))))].reset_index(drop=True)
    # If option both is selected
    if (selected_option == 4):
        if selected_caller != 'None' and selected_receiver != 'None':
            filtered_df = filtered_df[((filtered_df['Caller'].isin(
                list(selected_caller))) & (filtered_df['Receiver'].isin(
                    list(selected_receiver))))].reset_index(drop=True)

    if filtered_df.shape[0] == 0:
        # No update since nothing matches
        return dash.no_update, 'Nothing Matches that Query'
    else:
        # Update Filtered Dataframe
        return filtered_df.to_json(date_format='iso',
                                   orient='split'), 'Updated'
## Example #10
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.covariance import EllipticEnvelope

latitude = float(sys.argv[1])
longitude = float(sys.argv[2])

df = pd.read_csv(
    "C:/Users/Sangameswaran/WebstormProjects/Voldemort/pythonScripts/crime.csv"
)
df = df.drop(['crimetime'], axis=1)
X = np.array(df.drop(['type'], axis=1))
y = np.array(df['type'])

elliptic = EllipticEnvelope(contamination=0.15)
elliptic.fit(X)
prediction = elliptic.predict([[latitude, longitude]])

if prediction == -1:
    possibility = "Safe zone"
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
    val = np.array([[latitude, longitude]])
    p = clf.predict(val)
    if p == 0:
        possibility = "Sexual abuse"
    elif p == 1:
# License: BSD

import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
                                            contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
        EllipticEnvelope(contamination=0.261),
    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    plt.figure(1)
    clf.fit(X1)
    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
    Z1 = Z1.reshape(xx1.shape)


seed = 1200
annotation_path = "../Data/data/preprocessed_annotation_global.csv"
y = pd.read_csv(annotation_path)["label"]
names = y.astype('category').cat.categories
y = y.astype('category').cat.codes
meth_path = "../Data/data/preprocessed_Matrix_meth.csv"
mRNA_path = "../Data/data/preprocessed_Matrix_miRNA_deseq_correct.csv"
mRNA_normalized_path = "../Data/data/preprocessed_Matrix_mRNA_deseq_normalized_prot_coding_correct.csv"
files = [meth_path, mRNA_path, mRNA_normalized_path]
outliers = [
    LocalOutlierFactor(novelty=True),
    IsolationForest(),
    EllipticEnvelope(random_state=0),
    svm.OneClassSVM()
]
filenames = ["meth", "mrna", "micro mrna"]
modelnames = [
    "mlp-local-outlier", "mlp-isolation-forest", "mlp-elliptic",
    "mlp-one-class"
]
for modelname, outlier in zip(modelnames, outliers):
    for file, filename in zip(files, filenames):
        with open('../Data/outputs/' + filename + '-bnn-output.txt', 'w') as f:
            X = pd.read_csv(file, index_col=False, header=None)
            if (filename == "mrna"):
                X = pd.DataFrame(X[X.std().sort_values(
                    ascending=False).head(1200).index].values.tolist())
            X_train, X_test, y_train, y_test = train_test_split(
## Example #13
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
sys.path.append('../deployment/prediction/')
from src.utils import upload_model_to_s3
from src.config import MODEL_FILENAME, MODEL_EXTENSION, BUCKET_NAME, AWS_PROFILE, SCALER_FILENAME


df = pd.read_csv('../../data/raw/data_2020_05.csv', parse_dates=['ts'])
user_ids = df.user_id.unique()

min_hr = 1  # Values below this ignored
min_rr = 3  # Values below this ignored
for user in user_ids:
    X = df[(df.in_room == True) &
           (df.user_id == user) &
           (df.hr > min_hr) &
           (df.rr > min_rr)][['hr', 'rr']]

    X_tr, X_va = train_test_split(X, test_size=0.2, shuffle=False)

    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)

    # Fit Gaussian to data to detect outliers
    el = EllipticEnvelope(contamination=0.12)
    el.fit(X_tr_scaled)

    savepath = '../../data/models/'
    upload_model_to_s3(el, user, MODEL_FILENAME, savepath, profile=AWS_PROFILE)
    upload_model_to_s3(scaler, user, SCALER_FILENAME, savepath, profile=AWS_PROFILE)
## Example #14
def model_monitor(country="total", dev=DEV, training=True):
    """
    performance monitoring
    """
    print("Monitor Model")

    ## import data
    datasets = engineer_features(training=training, dev=dev)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    ## monitor RMSE
    samples = [10, 20, 30, 50, 60]

    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country)
                   for d in dates_new]
        y_pred = [
            model_predict(year=query[0],
                          month=query[1],
                          day=query[2],
                          country=query[3],
                          verbose=False,
                          dev=dev)["y_pred"][0].round(2) for query in queries
        ]
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))

    ## monitor performance
    ## scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    samples = [25, 50, 75, 90]

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(
            np.round(wasserstein_distance(X.flatten(), X_new.flatten()), 2))
        results['wasserstein_y'].append(
            np.round(wasserstein_distance(y, y_new), 2))
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size, 1))
        results["outlier_percent_X"].append(
            np.round(1.0 - (test1[test1 == 1].size / test1.size), 2))
        results["outlier_percent_y"].append(
            np.round(1.0 - (test2[test2 == 1].size / test2.size), 2))

    return pd.DataFrame(results)
## Example #15
    def fuse_to_get_results(self, weights, num_comp):
        if weights[0] != 0:
            self.apply_pca(num_comp)
            # Make sure you apply PCA before using EllipticEnvelope -- it is very sensitive to the feature dimensions
            clf_een = EllipticEnvelope(store_precision=True,
                                       assume_centered=False,
                                       support_fraction=0.25,
                                       contamination=0.1,
                                       random_state=True)
            # Fitting the model on reduced dimensionality
            clf_een.fit(self.gen_tr_data)
            # The anomaly score of the input samples. The lower, the more abnormal.
            pred_gen_scores_ee = clf_een.decision_function(self.gen_ts_data)
            pred_imp_scores_ee = clf_een.decision_function(self.imp_ts_data)
            pred_scores_ts_ee = np.concatenate(
                (pred_gen_scores_ee, pred_imp_scores_ee))
            norm_scores_ee = self.mymm_scaler(pred_scores_ts_ee)
        else:
            norm_scores_ee = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))
        if weights[1] != 0:
            # Make sure you apply PCA before using the envelope -- it is very sensitive to the feature dimensions
            clf_if = IsolationForest(max_samples="auto",
                                     contamination=0.2,
                                     random_state=True)
            # Fitting the model on reduced dimensionality
            clf_if.fit(self.gen_tr_data)
            # The anomaly score of the input samples. The lower, the more abnormal.
            pred_gen_scores_if = clf_if.decision_function(self.gen_ts_data)
            pred_imp_scores_if = clf_if.decision_function(self.imp_ts_data)
            # print('pred_gen_scores_if',self.mymm_scaler(pred_gen_scores_if))
            # print(clf_if.predict(self.gen_ts_data))
            # print('pred_imp_scores_if', self.mymm_scaler(pred_imp_scores_if))
            # print(clf_if.predict(self.imp_ts_data))

            pred_scores_ts_if = np.concatenate(
                (pred_gen_scores_if, pred_imp_scores_if))
            norm_scores_if = self.mymm_scaler(pred_scores_ts_if)
            # print('norm_scores_if',norm_scores_if)
            # print('plabel',np.concatenate((clf_if.predict(self.gen_ts_data),clf_if.predict(self.imp_ts_data))))
        else:
            norm_scores_if = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))
        if weights[2] != 0:
            num_neighbors = 35
            clf_lof = LocalOutlierFactor(n_neighbors=num_neighbors,
                                         metric='l2',
                                         contamination=0.25)
            X = np.concatenate((self.gen_tr_data, self.gen_ts_data))
            X_all = np.concatenate((X, self.imp_ts_data))
            pred_all_score = clf_lof.fit_predict(X_all)
            #print('pred_all_score')
            #print(pred_all_score)
            pred_scores_ts_lof = pred_all_score[
                range(len(self.gen_tr_data), len(pred_all_score)), ]
            norm_scores_lof = self.mymm_scaler(pred_scores_ts_lof)
        else:
            norm_scores_lof = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))

        if weights[3] != 0:
            # Make sure you apply PCA before using the envelope -- it is very sensitive to the feature dimensions
            clf_svm1c = svm.OneClassSVM(kernel='rbf',
                                        degree=3,
                                        gamma=0.001,
                                        coef0=0.0,
                                        tol=0.00001,
                                        nu=0.001,
                                        shrinking=True,
                                        cache_size=200,
                                        verbose=False,
                                        max_iter=-1,
                                        random_state=True)
            # Fitting the model on reduced dimensionality
            clf_svm1c.fit(self.gen_tr_data)
            # The anomaly score of the input samples. The lower the more abnormal.
            pred_gen_scores_svm = clf_svm1c.decision_function(self.gen_ts_data)
            pred_imp_scores_svm = clf_svm1c.decision_function(self.imp_ts_data)
            pred_scores_ts_svm = np.concatenate(
                (pred_gen_scores_svm, pred_imp_scores_svm))
            norm_scores_svm = self.mymm_scaler(pred_scores_ts_svm)
        else:
            norm_scores_svm = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))

        # Score level fusion
        pred_ts_labels = []
        fused_scores = []
        for ees, ifs, lofs, svms in zip(norm_scores_ee, norm_scores_if,
                                        norm_scores_lof, norm_scores_svm):
            cfscore = (weights[0] * ees + weights[1] * ifs +
                       weights[2] * lofs + weights[3] * svms) / sum(weights)
            fused_scores.append(cfscore)
            if cfscore < self.threshold:
                pred_ts_labels.append(-1)
            else:
                pred_ts_labels.append(1)

        act_ts_labels = np.concatenate(
            (self.get_gen_ts_labels(), self.get_imp_ts_labels()))
        tn, fp, fn, tp = confusion_matrix(act_ts_labels,
                                          pred_ts_labels).ravel()
        far = fp / (fp + tn)
        frr = fn / (fn + tp)
        pr = tp / (tp + fp)
        final_score_table = [
            norm_scores_ee, norm_scores_if, norm_scores_lof, norm_scores_svm,
            fused_scores, act_ts_labels
        ]
        # EE scores
        print(norm_scores_ee)
        # IF scores
        print(norm_scores_if)
        # LOF yields 0/1 labels
        print(norm_scores_lof)
        # SVM scores
        print(norm_scores_svm)
        # the fused values are also scores
        print(fused_scores)
        # labels
        print(act_ts_labels)
        return far, frr, pr, final_score_table
def anomaly_detection_ex8_ng():
    """Run anomaly detection.
        Example from Andrew Ng's coursera course
    """

    # =====================
    # load data

    dataset = loadmat('data/ex8data1.mat')
    # dataset = loadmat('data/ex8data2.mat')
    print(dataset.keys())

    X = dataset['X']
    print('X:', X.shape, X[0, :])  # 307x2

    Xval = dataset['Xval']
    print('X_val:', Xval.shape, Xval[0, :])  # 307x2
    yval = dataset['yval']
    print('y_val:', yval.shape, yval[0, :])  # 307x1

    # =====================
    # display
    fig = plt.figure(facecolor='white')
    fig1 = fig.add_subplot(2, 2, 1)
    plt.scatter(X[:, 0], X[:, 1], c='k')
    plt.title("Outlier detection")
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')

    # =====================
    # detecting outliers in a Gaussian distributed dataset.
    clf = EllipticEnvelope()
    clf.fit(X)

    # Calculate the decision function and use threshold to determine outliers
    y_pred = clf.decision_function(X).ravel()
    # print('y pred', y_pred)

    # =====================
    # find best threshold for outlier detection
    if False:
        samples = np.linspace(0.1, 10.0, num=100)
        best_f1 = 0.0
        best_perc = 0.0
        for sample in samples:
            Xval_pred = clf.decision_function(Xval)
            perc = sample
            th = np.percentile(Xval_pred, perc)
            outl = Xval_pred < th
            f1score = f1_score(yval, outl)
            print('f1 score (', sample, '):', f1score)

            if best_f1 < f1score:
                best_f1 = f1score
                best_perc = perc
        print('best f1:', best_f1, ', best perc:', best_perc)

    # set threshold for outlier detection
    percentile = 1.9  # 5.1 # 1.9 #best_perc # 1.9607843
    threshold = np.percentile(y_pred, percentile)
    outliers = y_pred < threshold
    # print('outliers:', X[outliers])

    # =====================
    # plot contours

    fig.add_subplot(2, 2, 2)

    # create the grid for plotting
    if False:
        xx, yy = np.meshgrid(np.linspace(0, 25, 200), np.linspace(0, 30, 200))
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.contour(xx,
                    yy,
                    Z,
                    levels=[threshold],
                    linewidths=2,
                    colors='blue',
                    linestyles='dotted')

        threshold = np.percentile(y_pred, 1.0)
        plt.contour(xx,
                    yy,
                    Z,
                    levels=[threshold],
                    linewidths=2,
                    colors='blue',
                    linestyles='dotted')
        threshold = np.percentile(y_pred, 0.5)
        plt.contour(xx,
                    yy,
                    Z,
                    levels=[threshold],
                    linewidths=2,
                    colors='blue',
                    linestyles='dotted')

    # plot outliers
    plt.scatter(X[:, 0], X[:, 1], c='k')
    plt.scatter(X[outliers, 0], X[outliers, 1], c='r')
    print('num outliers:', sum(outliers))

    # samples_idx = yval == 1
    # print(yval[samples_idx])
    # print('X_val:', Xval.shape, Xval[0, :])  # 307x2
    # print(Xval[samples_idx])

    plt.show()
def main():


    # Variables setting
    columnArrangement = ['id', 'selectedFault', 'faultType', 'faultIntensity', 'externalControllerOutput',
                         'pressureValveInlet', 'pressureValveOutlet', 'disturbedMediumFlow', 'mediumTemperature', 'rodDisplacement']
    plottingVariables = ['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'disturbedMediumFlow',
                         'mediumTemperature', 'rodDisplacement']
    
    ellipticEnvelopeContamination = 0.04
    classifiers = {'DummyClf':DummyClassifier(), 'EllipticEnvelope':EllipticEnvelope(contamination=ellipticEnvelopeContamination)}

    pd.options.mode.chained_assignment = None

    nsamples = 1000
   
    random_seed = 0 #Change this to make it really random, 0 for testing purposes

    cv_folds = 4

    desiredComponents = ['Valve']

    scoringMetrics = ['precision_macro', 'recall_macro', 'f1_macro']

    startDateTime = datetime(2017, 11, 6, hour=0, minute=0, second=0, microsecond=0)
    endDateTime = datetime(2017, 11, 16, hour=0, minute=0, second=0, microsecond=0)

    dataManager = DataManager(user="******", password="******", engineType="mysql+mysqldb://", dbName="damadics", host="localhost", port="3306")

    y_trains = {'DummyClf':None, 'EllipticEnvelope':list()}
    y_tests = {'DummyClf':None, 'EllipticEnvelope':list()}

    #Data acquisition and formatting
    dataFrames = dataManager.readData(startDateTime, endDateTime, desiredComponents)
    
    df = dataFrames['ValveReadings']
    df = dataManager.reshapeAndCleanDataFrame(df)
    df = df[columnArrangement] #Rearrange columns
    
    #display(df.head())
    
    X_raw = df[['externalControllerOutput', 'disturbedMediumFlow', 'pressureValveInlet', 'pressureValveOutlet', 
        	'mediumTemperature', 'rodDisplacement']]
    df.loc[df['selectedFault'] != 20, 'selectedFault'] = 1
    df.loc[df['selectedFault'] == 20, 'selectedFault'] = 0

    totalCount = df.shape[0]
    faultydf = df.loc[df['selectedFault'] == 1, 'selectedFault']
    faultCount = faultydf.shape[0]
    nonFaultCount = df.shape[0] - faultCount
    faultNonFaultRatio = faultCount/nonFaultCount

    y_raw = df['selectedFault']
    #get a jointplot of the 7 variables
    """pt.jp_plotData(df, 'The 7 variables in the data', saveToFile='snspp_damadics.png', nsamples = 1000, 
    	vars=plottingVariables, hue='selectedFault')"""

    #Anomaly detection 

    #First standardize the data
    X_transformed = StandardScaler().fit_transform(X_raw)

    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_raw, random_state=random_seed)

    y_trains['EllipticEnvelope'] = [-1 if y == 1 else 1 for y in y_train]
    y_tests['EllipticEnvelope'] = [-1 if y == 1 else 1 for y in y_test]

    print('Performing cross validations')

    for classifierKey in classifiers:

        print('\nResults for {} classifier'.format(classifierKey))

        clf = classifiers[classifierKey]

        if y_trains[classifierKey] is not None:
            y_train = y_trains[classifierKey]

        cv_scores = cross_validate(clf, X_train, y_train, scoring=scoringMetrics, cv=cv_folds)
        """clf.fit(X_train, y_train)
        y_pred = clf.predict(X_train)
        score_acc = accuracy_score(y_train, y_pred)
        print('accuracy {}'.format(score_acc))
        print('Type: {}, first 5 elements {}, element type {}'.format(type(y_pred), y_pred[:5], type(y_pred[0])))"""

        print('{}-fold cross validation'.format(cv_folds))

        for key in cv_scores:
            print("For metric %s Accuracy: %0.5f (+/- %0.5f)" % (key, cv_scores[key].mean(), cv_scores[key].std() * 2))
    
    print('\nTotal sample size {}, Train Size {}, Test Size {}'.format(X_raw.shape[0], X_train.shape[0], X_test.shape[0]))
    print('Total sample size {}, Faulty samples {}, Normal samples {}, Fault/Non Fault Ratio {:.4f}'.
    	format(totalCount, faultCount, nonFaultCount, faultNonFaultRatio))
    #print('Total sample size {}, Train Size {}, Test Size {}'.format(X_raw.shape[0], X_train.shape[0], X_test.shape[0]))
    


    #display((df))
    
    
    
    dataManager.endDataManager()
## Example #18
# Generate labels, 1 for inliers and -1 for outliers
labels = np.ones(num_samples, dtype=int)
labels[-num_outliers:] = -1

# plt.figure()
inlier_plot = plt.plot(x[:num_inliers,0], x[:num_inliers,1], 'go', label='inliers')
outlier_plot = plt.plot(x[-num_outliers:,0], x[-num_outliers:,1], 'ko', label='outliers')
plt.xlim(-11,11)
plt.ylim(-11,11)
plt.legend(numpoints=1)
# plt.show()
plt.savefig(pdf, format='pdf')

## Applying sklearn.covariance.EllipticEnvelope

classifier = EllipticEnvelope(contamination=outlier_ratio)
classifier.fit(x)
y_pred = classifier.predict(x)
num_errors = sum(y_pred != labels)
print('Number of errors fitting Elliptic Envelope to Gaussian distribution: {}'.format(num_errors))

xx, yy, Z, threshold = output(x, outlier_ratio)

# plt.figure()
inlier_plot = plt.plot(x[:num_inliers,0], x[:num_inliers,1], 'go', label='inliers')
outlier_plot = plt.plot(x[-num_outliers:,0], x[-num_outliers:,1], 'ko', label='outliers')

plt.contour(xx, yy, Z, levels=[threshold],linewidths=5, colors='gray')
plt.contour(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 10), cmap=plt.cm.Greys_r)
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='gray')
plt.xlim(-11,11)
## Example #19
        def assign_to_nearest(samples, centroids, label2id):
            """
            Args:
                samples:text-cnn後の出力
                centroids:クラスターそれぞれの重心 shape=(cluster_num,output_dim)
            Returns:
                nearest: 入力データと最も近いクラスターID shape=(data_num, )
            """
            # 1-1. Link each sample to its nearest cluster with k-nearest neighbors
            neigh = KNeighborsClassifier(n_neighbors=1)
            neigh.fit(centroids, np.arange(len(centroids)))
            nearest = neigh.predict(samples)

            ###1-1.5 NEW!! ####
            from sklearn.covariance import EllipticEnvelope
            outliers_fraction = 0.05
            sup_ids = np.array([label2id[_] for _ in train_y[supervised]])
            unsup_ids = nearest[unsupervised]

            #print(label2id)
            for i in range(n_cluster):
                if i != label2id[drop_cat]:
                    #print(i)
                    clf = EllipticEnvelope(contamination=outliers_fraction)

                    # fit on the labeled data
                    ind = np.where(sup_ids == i)[0]
                    clf.fit(samples[supervised][ind])

                    # even for nearest-cluster assignments, check that unlabeled points are not too far away
                    unind = np.where(unsup_ids == i)[0]
                    r = clf.predict(samples[unsupervised][unind])

                    nearest[unind[r < 0]] = label2id[drop_cat]

            #from scipy.stats import itemfreq
            #print(itemfreq(nearest))

            #################
            # 1-2. Match labeled data to centroids with the Hungarian algorithm

            sup_labels = train_y[supervised]
            hglabel = np.unique(sup_labels)

            hgx = []  # mean point (centroid) of the supervised data for each label
            for i in hglabel:
                ind = np.where(sup_labels == i)[0]
                hgx.append(np.mean(samples[supervised][ind], axis=0))
            hgx = np.array(hgx)

            # distance matrix between the per-label centroids and the current centroids
            DistanceMatrix = np.linalg.norm(hgx[:, np.newaxis, :] -
                                            centroids[np.newaxis, :, :],
                                            axis=2)

            # Hungarian algorithm: pick the assignment that minimizes the total distance
            from scipy.optimize import linear_sum_assignment
            row_ind, col_ind = linear_sum_assignment(DistanceMatrix)

            # map each label to a cluster ID
            label2id = {hglabel[i]: col for i, col in enumerate(col_ind)}
            label2id[drop_cat] = list(
                set(list(range(n_cluster))) - set(label2id.values()))[0]

            return nearest, label2id
## Example #20
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'
# Settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# Define the outlier/anomaly detection methods to compare
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define the datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
#using pandas

df = pd.DataFrame(features, columns = ['feature_1','feature-2'])
#print(df.apply(add_ten))

#Detecting Outliers

features, _ = make_blobs(n_samples = 10,
                         n_features = 2,
                         centers = 1,
                         random_state = 1)

features[0,0] = 10000
features[0,1] = 10000

outlier_detector = EllipticEnvelope(contamination = .1)

outlier_detector.fit(features)

outlier_detector.predict(features)

feature = features[:,0]

def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

#print(indicies_of_outliers(feature))
## Example #22
    model_LocalOutlierFactor = LocalOutlierFactor(n_neighbors=20,
                                                  contamination=0.2,
                                                  novelty=True,
                                                  leaf_size=10)

    # define model
    model_svm = svm.OneClassSVM(nu=0.2, gamma='scale')

    model_isolation = IsolationForest(
        contamination=0.2,
        random_state=42,
        behaviour='new',
        n_estimators=100,
    )
    # define model
    model_EllipticEnvelope = EllipticEnvelope(contamination=0.2,
                                              support_fraction=1)

    #f.write('LOF'+'\n')
    #calculate_accuracies(model_LocalOutlierFactor, test_data, fitter, number, 100, f, LOF)

    #f.write('Isolation forest'+'\n')
    #calculate_accuracies(model_isolation, test_data, fitter, number, 100, f, iso)

    #f.write('Envelop'+'\n')
    #calculate_accuracies(model_EllipticEnvelope, test_data, fitter, number, 100, f, envelop)

    ### raw data
    test_data = np.concatenate((no_mod[number:], mod[number:]))

    model_LocalOutlierFactor = LocalOutlierFactor(n_neighbors=20,
                                                  contamination=0.2,
## Example #23
def ElliEnvelope(data):
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(data)
    outlier_pre = clf.predict(data)
    outlier = data[outlier_pre == -1]
    return outlier
## Example #24
mse = mean_squared_error(y_test, yhat)
rmse = math.sqrt(mse)
r2score = r2_score(y_test, yhat)
print('MAE: %.3f' % mae)
print('MSE: %.3f' % mse)
print('RMSE: %.3f' % rmse)
print('r2 Score: %.3f' % r2score)
print("IsolationForest Complete Duration: --- %s seconds ---" %
      (time.time() - sttime))

#===========================================
# evaluate model performance with outliers removed using elliptical envelope
# identify outliers in the training dataset
sttime = time.time()
print("EllipticEnvelope")
ee = EllipticEnvelope(contamination=0.01, support_fraction=1.0)  # support_fraction must lie in (0, 1]
yhat = ee.fit_predict(X_train)
print(yhat)

# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
## Example #25
import pandas as pd
import numpy as np
#from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

algorithms = [  #("Local Outlier Factor", LocalOutlierFactor()),
    ("One Class SVM", OneClassSVM()),
    ("Isolation Forest", IsolationForest()),
    ("Elliptical Envelope", EllipticEnvelope()),
]
white = pd.read_csv('C:/Users/gajja/Desktop/winequality-white.csv', sep=';')
X = white

for name, algo in algorithms:
    if name == "Local Outlier Factor":
        pred = algo.fit_predict(X)
    else:
        pred = algo.fit(X).predict(X)
    outliers = [x for x in pred if x == -1]
    print(*outliers, sep='\n')
    print(name, ':', len(outliers), 'potential outliers detected.')
## Example #26
 "mirror": [pre.couples_raw.Mirror(), None],
 "cluster": [
     pre.couples_xy.Cluster(SpectralClustering()),
     pre.couples_xy.Cluster(MiniBatchKMeans()),
     pre.couples_xy.Cluster(AgglomerativeClustering()),
     pre.couples_xy.Cluster(Birch()),
     pre.transformers.Pass(),
 ],
 "cluster__clusterer__n_clusters":
 [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26],
 "cluster__replace": [True],
 "sanitize": [
     pre.couples_xy.SanitizeStartEnd(),
     pre.transformers.Pass(),
 ],
 "sanitize__alg": [IsolationForest(), EllipticEnvelope()],
 "sanitize__contamination": [.01, .03, .05, .07, .1, .15, .2, .3, .4, .5],
 "form_data__alg": [
     None,  # I have to explicitly specify n_components for each because of this
     TSNE(n_components=3),
     LocallyLinearEmbedding(n_components=3),
     LocallyLinearEmbedding(n_components=4),
     Isomap(n_components=3),
     Isomap(n_components=4),
     MDS(n_components=3),
     MDS(n_components=4),
     SpectralEmbedding(n_components=3),
     SpectralEmbedding(n_components=4),
     PCA(n_components=3),
     PCA(n_components=4),
 ],
## Example #27
        feature_sum = np.sum(temp, axis=0)
        sum_sort = np.sort(feature_sum)

        pca1 = feature_names[np.where(feature_sum == sum_sort[0])[0][0]]
        pca2 = feature_names[np.where(feature_sum == sum_sort[1])[0][0]]
        pca3 = feature_names[np.where(feature_sum == sum_sort[2])[0][0]]

        print(f"selected features :{pca1}, {pca2}, {pca3}")
        """
            Result -> Performance
        """
        """
            One class Svm
        """
        anomaly_algorithms = [
            ("Robust covariance", EllipticEnvelope()),
            ("One-Class SVM", svm.OneClassSVM(kernel="rbf", gamma=0.001)),
            ("Isolation Forest", IsolationForest(random_state=42)),
            ("Local Outlier Factor", LocalOutlierFactor())
        ]

        X = np.append(arr=X, values=features_pca, axis=0)
        X_num = X.shape[0]
        base_estimators = [LOF(), IForest(), OCSVM(kernel="rbf", gamma=0.001)]

        model = SUOD(
            base_estimators=base_estimators,
            n_jobs=2,  # number of workers (-1 uses all cores)
            rp_flag_global=True,  # global flag for random projection
            bps_flag=True,  # global flag for balanced parallel scheduling
            approx_flag_global=False,  # global flag for model approximation
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM":
    svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                    kernel="rbf",
                    gamma=0.1),
    "robust covariance estimator":
    EllipticEnvelope(contamination=.1)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
## Example #29
    df["usage proportion"] = df["usage_cycles"] / (df["usage_cycles"] +
                                                   df["non_usage_cycles"])

    X = df["usage proportion"].values
    Y = df["usage_percentage"].values

    XY = df[["usage proportion", "usage_percentage"]].values

    # Example settings
    n_samples = len(df)
    outliers_fraction = 0.2
    clusters_separation = [0]

    # define two outlier detection tools to be compared
    classifiers = {
        "robust covariance estimator": EllipticEnvelope(contamination=.1)
    }

    # Compare given classifiers under given settings
    #xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    # Fit the problem with varying cluster separation
    np.random.seed(42)
    # Data generation

    # Fit the model with the One-Class SVM
    #plt.figure(figsize=(10, 5))

    clf = EllipticEnvelope(contamination=.1)
## Example #30
def get_elliptic_envelope(X1):

    # Define "classifiers" to be used
    classifiers = {
        "Elliptic Envelope":
        EllipticEnvelope(),
        "Empirical Covariance":
        EllipticEnvelope(support_fraction=1., contamination=0.261),
        "Robust Covariance (Minimum Covariance Determinant)":
        EllipticEnvelope(contamination=0.261),
    }

    # list color codes for plotting
    colors = ['firebrick', 'gold', 'mediumorchid']
    legend1 = {}

    # Learn a frontier for outlier detection with several classifiers
    a = [0.995, 1.001]
    b = [0.9999, 1.0001]
    lim_min = np.min(X1, axis=0)
    lim_max = np.max(X1, axis=0)

    # create meshgrids for plotting ellipses (contours)
    xx1, yy1 = np.meshgrid(
        np.linspace(lim_min[0] * a[0], lim_max[0] * a[1], 500),
        np.linspace(lim_min[1] * b[0], lim_max[1] * b[1], 500))

    # loop over classifiers and fit then plot
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        # compute and store plot for X1
        plt.figure(1)
        clf.fit(X1)  # fit current classifier
        # get decision function with outlier thresh = 0 (default)
        Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        # reshape for plotting
        Z1 = Z1.reshape(xx1.shape)
        # plot

        #legend1[clf_name] = plt.contour(
        plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i])

    #legend1_values_list = list(legend1.values())
    #legend1_keys_list = list(legend1.keys())

    # Plot the results for X1

    plt.figure(1)  # two clusters
    plt.title("Outlier detection on NBA Players")
    plt.scatter(X1[:, 0], X1[:, 1], color='royalblue')  # just data points
    # set figure limits from meshgrids
    plt.xlim((xx1.min(), xx1.max()))
    plt.ylim((yy1.min(), yy1.max()))

    # set labels
    plt.ylabel("Latitude")
    plt.xlabel("Longitude")

    # create legend
    #plt.legend((legend1_values_list[0].collections[0],
    #            legend1_values_list[1].collections[0],
    #            legend1_values_list[2].collections[0]),
    #           (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
    #           loc="lower center",
    #           prop=matplotlib.font_manager.FontProperties(size=12))

    plt.show()