print(outliers_rows)
# We can display the (row, column) coordinates of each outlier in the array:
print(list(zip(outliers_rows, outliers_columns)))

# The univariate approach can reveal quite a lot of potential outliers, but it
# won't disclose an outlier that does not have an extreme value -- for example
# one that is only an unusual *combination* of values in two or more
# variables. In such cases the values of the involved variables may not even
# be extreme, and therefore the outlier may slip away unnoticed.
#
# To discover cases where this happens, you can use a dimensionality
# reduction algorithm such as PCA and then check the absolute values of the
# components that are beyond three standard deviations.
#
# Scikit-learn offers a couple of classes that work straight out of the box
# and signal all suspect cases:
#   * covariance.EllipticEnvelope fits a robust distribution estimate of your
#     data, pointing out outliers that might be contaminating your dataset
#     because they are the extreme points in the general distribution.
#   * svm.OneClassSVM is a support vector machine algorithm that approximates
#     the shape of your data and decides whether new instances should be
#     considered a novelty (by default it acts as a novelty detector and
#     assumes no outliers in the data). By modifying its parameters it can
#     also work on datasets where outliers are present, providing an even
#     more robust and reliable detection system than EllipticEnvelope.
#
# EllipticEnvelope tries to figure out the key parameters of your data's
# general distribution by assuming that the entire dataset is an expression
# of an underlying multivariate Gaussian distribution. It checks the distance
# of each observation with respect to a grand mean that takes into account
# all the variables in the dataset, so it can spot both univariate and
# multivariate outliers.
#
# The only parameter you have to take into account is `contamination`, which
# can take a value of up to 0.5. Situations vary from dataset to dataset;
# as a starting figure we suggest a value of 0.01-0.02, on the order of the
# small tail fraction lying beyond an absolute Z score of 3 in a standardized
# normal distribution. For this reason, we deem the default value of 0.1 too
# high.

# Create an artificial distribution made of blobs.
from sklearn.datasets import make_blobs

# The number of distributions (the `centers` parameter) is driven by the
# user-defined variable `blobs`, which is initially set to 1.
blobs = 1
# Creates `blobs` distributions in a two-dimensional space for a total of
# 100 examples (the `n_samples` parameter).
blob = make_blobs(n_samples=100, n_features=2, centers=blobs,
                  cluster_std=1.5, shuffle=True, random_state=5)

# Robust covariance estimate.
from sklearn.covariance import EllipticEnvelope

# A contamination rate of 10 percent makes the model flag the most extreme
# values in the distribution. The model is first fitted with .fit() ...
robust_covariance_est = EllipticEnvelope(contamination=.1).fit(blob[0])
# ... and then predict() is run on the very data that was used for the fit.
detection = robust_covariance_est.predict(blob[0])

# `detection` is a vector of 1 and -1 values, -1 marking anomalous examples.
# `outliers` and `inliers` hold the indexes of the corresponding examples.
outliers = np.where(detection == -1)
inliers = np.where(detection == 1)

# Draw the distribution and the detected outliers.
import matplotlib.pyplot as plt

# Just the distribution.
plt.plot(blob[0][:, 0], blob[0][:, 1], 'x', markersize=10,
         color='black', alpha=0.8)
plt.show()

# The distribution and the outliers.
a = plt.plot(blob[0][inliers, 0], blob[0][inliers, 1], 'x', markersize=10,
             color='black', alpha=0.8, label='inliers')
b = plt.plot(blob[0][outliers, 0], blob[0][outliers, 1], 'o', markersize=6,
             color='black', alpha=0.8, label='outliers')
plt.legend((a[0], b[0]), ('inliers', 'outliers'), numpoints=1,
           loc='lower right')
plt.show()

# With a single underlying multivariate distribution (blobs = 1),
# EllipticEnvelope locates 10 percent of the observations on the fringe of
# the distribution and signals them as suspect outliers.
#
# A limitation of EllipticEnvelope shows up when multiple distributions are
# present in the data, as if there were two or more natural clusters: trying
# to fit a unique general distribution, the algorithm tends to locate the
# potential outliers on just the most remote cluster, ignoring other areas
# of the data that might also contain outlying cases -- a situation that can
# occur with real data.

# Next: the Boston data, to try this on real data.
rng = np.random.RandomState(42) # Example settings n_samples = 200 outliers_fraction = 0.25 clusters_separation = [0, 1, 2] # define two outlier detection tools to be compared classifiers = { "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1), "Robust covariance": EllipticEnvelope(contamination=outliers_fraction), "Isolation Forest": IsolationForest(max_samples=n_samples, contamination=outliers_fraction, random_state=rng) } # Compare given classifiers under given settings xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500)) n_inliers = int((1. - outliers_fraction) * n_samples) n_outliers = int(outliers_fraction * n_samples) ground_truth = np.ones(n_samples, dtype=int) ground_truth[-n_outliers:] = -1 # Fit the problem with varying cluster separation for i, offset in enumerate(clusters_separation):
def OutLierDetection(df, feature1, feature2, outliers_fraction=.1):
    """Tag outliers on two features with four detectors and plot the frontiers.

    A copy of ``df`` is made; One-Class SVM, robust covariance
    (``EllipticEnvelope``), Isolation Forest and Local Outlier Factor are each
    fitted on the ``feature1``/``feature2`` columns. For every detector the
    +1/-1 prediction is stored in a new column named
    ``feature1 + '_' + feature2 + <detector name>``, the label counts are
    printed, and the decision frontier is drawn in a 2x2 figure.

    Args:
        df: DataFrame containing the ``feature1`` and ``feature2`` columns.
        feature1: Column plotted on the x axis.
        feature2: Column plotted on the y axis.
        outliers_fraction: Expected share of outliers, passed to every
            detector as its contamination (``nu`` for the One-Class SVM).

    Returns:
        The copy of ``df`` with one prediction column per detector.
    """
    new_df = df.copy()
    rng = np.random.RandomState(42)
    n_samples = new_df.shape[0]
    clusters_separation = [0]  # , 1, 2]

    classifiers = {
        "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                         kernel="rbf", gamma=0.1),
        "Robust covariance": EllipticEnvelope(
            contamination=outliers_fraction),
        "Isolation Forest": IsolationForest(max_samples=n_samples,
                                            contamination=outliers_fraction,
                                            random_state=rng),
        "Local Outlier Factor": LocalOutlierFactor(
            n_neighbors=35, contamination=outliers_fraction),
    }

    def _padded_axis(column, num=50):
        # Widen the range by 10% of each bound's magnitude. Using abs() keeps
        # the padding pointing outwards for negative bounds; for non-negative
        # data this is identical to the previous `min - min*10/100` formula.
        lo, hi = column.min(), column.max()
        return np.linspace(lo - abs(lo) * 10 / 100,
                           hi + abs(hi) * 10 / 100, num)

    xx, yy = np.meshgrid(_padded_axis(new_df[feature1]),
                         _padded_axis(new_df[feature2]))

    # Placeholder ground truth: the last `n_outliers` rows are marked as
    # outliers. The "errors" figure in each subplot title is measured against
    # this placeholder.
    n_outliers = int(outliers_fraction * n_samples)
    ground_truth = np.ones(n_samples, dtype=int)
    if n_outliers > 0:
        # Guard needed because `ground_truth[-0:]` selects the whole array.
        ground_truth[-n_outliers:] = -1

    for _ in clusters_separation:
        np.random.seed(42)
        X = new_df[[feature1, feature2]].values.tolist()

        plt.figure(figsize=(9, 7))
        for plot_idx, (clf_name, clf) in enumerate(classifiers.items()):
            is_lof = clf_name == "Local Outlier Factor"

            # Fit the data and tag outliers.
            if is_lof:
                y_pred = clf.fit_predict(X)
                scores_pred = clf.negative_outlier_factor_
            else:
                clf.fit(X)
                scores_pred = clf.decision_function(X)
                y_pred = clf.predict(X)
            threshold = stats.scoreatpercentile(scores_pred,
                                                100 * outliers_fraction)
            n_errors = (y_pred != ground_truth).sum()

            unique, counts = np.unique(y_pred, return_counts=True)
            print(clf_name, dict(zip(unique, counts)))
            new_df[feature1 + '_' + feature2 + clf_name] = y_pred

            # Plot the level lines and the points.
            grid_points = np.c_[xx.ravel(), yy.ravel()]
            if is_lof:
                # decision_function is private for LOF.
                # NOTE(review): `_decision_function` is a private attribute;
                # confirm it exists in the installed scikit-learn version.
                Z = clf._decision_function(grid_points)
            else:
                Z = clf.decision_function(grid_points)
            Z = Z.reshape(xx.shape)

            subplot = plt.subplot(2, 2, plot_idx + 1)
            subplot.contourf(xx, yy, Z,
                             levels=np.linspace(Z.min(), threshold, 7),
                             cmap=plt.cm.Blues_r)
            subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2,
                            colors='red')
            subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                             colors='orange')
            plt.scatter(new_df[feature1], new_df[feature2], c='white', s=20,
                        edgecolor='k')
            subplot.axis('tight')
            subplot.set_xlabel("%s" % (feature1))
            plt.ylabel(feature2)
            plt.title("%d. %s (errors: %d)" % (plot_idx + 1, clf_name,
                                                n_errors))

        plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
        plt.show()

    return new_df
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston

# Get data. The dataset is loaded once and sliced twice instead of calling
# load_boston() for each feature pair.
boston_data = load_boston()['data']
X1 = boston_data[:, [8, 10]]  # two clusters
X2 = boston_data[:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
                                             contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
        EllipticEnvelope(contamination=0.261),
    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05),
}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers.
# xx1/yy1 is the evaluation grid for X1, xx2/yy2 the one for X2.
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    plt.figure(1)
    clf.fit(X1)
print "Output labels:",' '.join(outlabels) if len(outlabels) != len(targets): print "Warning, dataset is missing some targets" #perform PCA to detect help outliers pca = PCA(n_components=4,whiten=False) alldata = np.array(zip(*[v[1:] for v in cols])) alldata_pcs = pca.fit(alldata).transform(alldata) # Components print pca.components_ # Percentage of variance explained for each components print 'explained variance ratio (first %d components):'%(len(pca.components_),), \ pca.explained_variance_ratio_ print 'total explained variance:',sum(pca.explained_variance_ratio_) outlier_classifier = EllipticEnvelope(contamination=.05) outlier_classifier.fit(alldata_pcs) inlier_classification = outlier_classifier.predict(alldata_pcs) #print "Outliers:",[i for (i,c) in enumerate(inlier_classification) if c<0] inputs = zip(*[v[1:] for v in cols if v[0] not in targets]) outputs = zip(*[v[1:] for v in cols if v[0] in targets]) #reject outliers inputs = [inputs[i] for (i,c) in enumerate(inlier_classification) if c>0] outputs = [outputs[i] for (i,c) in enumerate(inlier_classification) if c>0] print len(inputs),"inliers" #select output 0 (adapt_cost) #output = np.array([v[0] for v in outputs]) #select output 2 (subopt_score) for ind in xrange(len(outputs[0])):
import math # mu = train_latent_features[zeros_idx].mean() # variance = train_latent_features[zeros_idx].var() # sigma = math.sqrt(variance) # x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100) # plt.plot(x, stats.norm.pdf(x, mu, sigma)) print(train_latent_features[ones_idx].mean(axis=1).shape) plt.hist(train_latent_features[ones_idx][0]) plt.show() exit() # exit() # model = svm.OneClassSVM(kernel="poly") # oneclass_svm = IsolationForest(random_state=0) model = EllipticEnvelope() model.fit(train_latent_features) oneclass_predictions = model.predict(train_latent_features) masked_predictions = mask_preds_for_one_class(oneclass_predictions) train_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(train_labels), threshold=threshold) train_metrics = {'train_' + k: v for k, v in train_metrics.items()} print(f'***** Train Metrics ***** ') print( f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} " f"| UAR: {'%.5f' % train_metrics['train_uar']}| F1:{'%.5f' % train_metrics['train_f1']} " f"| Precision:{'%.5f' % train_metrics['train_precision']} " f"| Recall:{'%.5f' % train_metrics['train_recall']} | AUC:{'%.5f' % train_metrics['train_auc']}" ) print('Train Confusion matrix - \n' +
# Split the frame into features (x) and the class label (y).
x = data.drop('y', axis=1)
y = data['y']

# (A plain scatter of the raw 'bad'/'good' classes used to be drawn here as
# fig1; it is disabled.)

# Anomaly detection: fit the envelope on the rows labelled 0 ("bad") only and
# score those same rows.
is_bad = y == 0
is_good = y == 1
x_bad = x[is_bad]

ad_model = EllipticEnvelope(contamination=0.02)
ad_model.fit(x_bad)
y_predict_bad = ad_model.predict(x_bad)
# (The equivalent fit/predict on the rows labelled 1 ("good") is disabled.)

# -1 marks the rows the envelope considers anomalous.
flagged_bad = y_predict_bad == -1

fig2 = plt.figure(figsize=(5, 5))
bad = plt.scatter(x_bad['x1'], x_bad['x2'], label='bad')
good = plt.scatter(x['x1'][is_good], x['x2'][is_good], label='good')
# Overlay a large cross on every flagged "bad" row.
plt.scatter(x_bad['x1'][flagged_bad], x_bad['x2'][flagged_bad],
            marker='x', s=150)
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
ONECLASS = { "IsolationForest": IsolationForest(n_estimators=100, max_samples='auto', contamination=0.1, max_features=1.0, bootstrap=False, n_jobs=1, random_state=random_state, verbose=0), "OneClassSVM": OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1), "EllipticEnvelope": EllipticEnvelope(store_precision=True, assume_centered=False, support_fraction=None, contamination=0.1, random_state=random_state) } BINARY = { "SVM": SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200,
def update_filtered_div_caller(radius, contents, selected_date1,
                               selected_date2, selected_duration,
                               selected_time, selected_option,
                               selected_caller, selected_receiver, ml_value,
                               contamination):
    """Filter the call records and return them as JSON for the dashboard.

    Filters are applied in this order: date / duration / time-of-day window,
    tower radius, the optional "suspicious" / outlier-detector selection
    (``ml_value``), and finally the caller / receiver number filter.

    Args:
        radius: Maximum distance from (``sel_lat``, ``sel_lon``) a tower may
            have; ``0`` disables the radius filter.
        contents: Uploaded file as a ``"<type>,<base64 csv>"`` string, or
            ``None`` to keep using the current global ``df``.
        selected_date1: Inclusive lower bound for ``Date``.
        selected_date2: Inclusive upper bound for ``Date``.
        selected_duration: ``(min, max)`` inclusive bounds for ``Duration``.
        selected_time: ``(start, end)`` keys into the global ``times`` table;
            the start is inclusive, the end exclusive.
        selected_option: 1 = caller, 2 = receiver, 3 = either, 4 = both.
        selected_caller: Selected caller numbers, or the string ``'None'``.
        selected_receiver: Selected receiver numbers, or the string ``'None'``.
        ml_value: 1 = suspicious towers, 2 = suspicious users,
            3 = IsolationForest, 4 = EllipticEnvelope,
            5 = LocalOutlierFactor. 6 only adds the helper columns.
        contamination: Expected outlier share in percent (divided by 100
            before being handed to the detector).

    Returns:
        ``(json, 'Updated')`` when rows remain, otherwise
        ``(dash.no_update, 'Nothing Matches that Query')``.
    """
    global df, suspicious_users

    if contents is not None:
        content_type, content_string = contents.split(',')
        decoded = base64.b64decode(content_string)
        df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        preprocess_data(df)

    # Date, time, duration filter.
    filtered_df = df[
        (df['Date'] >= pd.to_datetime(selected_date1))
        & (df['Date'] <= pd.to_datetime(selected_date2))
        & ((df['Duration'] >= selected_duration[0])
           & (df['Duration'] <= selected_duration[1]))
        & ((df['Time'] < times[selected_time[1]]['label'])
           & (df['Time'] >= times[selected_time[0]]['label']))
    ].reset_index(drop=True)

    def chk(lat1, lon1, radius):
        # Haversine distance between a tower and the selected point.
        # NOTE(review): sin/cos are applied to the raw values, so lat/lon are
        # presumably already in radians -- confirm against the towers data.
        R = 6373.0  # presumably the Earth radius in km -- TODO confirm
        global sel_lat
        global sel_lon
        lat2 = sel_lat
        lon2 = sel_lon
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = R * c
        print(distance)
        return not distance > radius

    if radius != 0:
        towers_c = towers.copy()
        ver = towers_c.apply(lambda x: chk(x['lat'], x['lon'], radius),
                             axis=1)
        towers_req = towers_c[ver]['TowerID'].unique()
        filtered_df = filtered_df[filtered_df['TowerID'].isin(towers_req)]

    if ml_value in [1, 2, 3, 4, 5, 6]:
        filtered_df = pd.merge(
            filtered_df, towers[['lat', 'lon', 'TowerID', 'Suspicious']],
            on='TowerID')
        # Time of day expressed as seconds since midnight.
        filtered_df['Time_new'] = pd.to_datetime(filtered_df['Time'],
                                                 format='%H:%M:%S')
        filtered_df["Time_new"] = filtered_df["Time_new"].apply(
            lambda x: (x - x.replace(hour=0, minute=0, second=0,
                                     microsecond=0)).total_seconds())
        filtered_df["Suspicious users"] = filtered_df[[
            "Caller", "Receiver"
        ]].apply(lambda x: 1 if (x.Caller in suspicious_users
                                 or x.Receiver in suspicious_users) else 0,
                 axis=1)
        contamination /= 100

        feature_cols = ["Time_new", "Duration", "lat", "lon", 'Suspicious',
                        'Suspicious users']
        helper_cols = ['lat', 'lon', 'Time_new']
        # The three detectors share one code path; only the class differs.
        detectors = {
            3: IsolationForest,
            4: EllipticEnvelope,
            5: LocalOutlierFactor,
        }
        if ml_value in detectors:
            model = detectors[ml_value](contamination=contamination)
            # Keep only the rows the detector labels as outliers (-1).
            mask = model.fit_predict(filtered_df[feature_cols]) == -1
            filtered_df = filtered_df[mask].drop(helper_cols, axis=1)
        elif ml_value == 1:
            filtered_df = filtered_df[filtered_df["Suspicious"] == 1]
            filtered_df = filtered_df.drop(helper_cols, axis=1)
        elif ml_value == 2:
            filtered_df = filtered_df[filtered_df["Suspicious users"] == 1]
            filtered_df = filtered_df.drop(helper_cols, axis=1)

    # Number filter.
    if selected_option == 1:
        # Caller is selected.
        if selected_caller != 'None':
            filtered_df = filtered_df[
                filtered_df['Caller'].isin(list(selected_caller))
            ].reset_index(drop=True)
    if selected_option == 2:
        # Receiver is selected.
        if selected_receiver != 'None':
            filtered_df = filtered_df[
                filtered_df['Receiver'].isin(selected_receiver)
            ].reset_index(drop=True)
    if selected_option == 3:
        # Either caller or receiver matches.
        if selected_caller != 'None' or selected_receiver != 'None':
            filtered_df = filtered_df[
                filtered_df['Caller'].isin(list(selected_caller))
                | filtered_df['Receiver'].isin(list(selected_receiver))
            ].reset_index(drop=True)
    if selected_option == 4:
        # Both caller and receiver match.
        if selected_caller != 'None' and selected_receiver != 'None':
            # The mask is built from filtered_df, so it must index
            # filtered_df. Indexing the global df here applied it to the
            # wrong frame and discarded every earlier filter.
            filtered_df = filtered_df[
                filtered_df['Caller'].isin(list(selected_caller))
                & filtered_df['Receiver'].isin(list(selected_receiver))
            ].reset_index(drop=True)

    if filtered_df.shape[0] == 0:
        # No update since nothing matches.
        return dash.no_update, 'Nothing Matches that Query'
    # Update the filtered dataframe.
    return filtered_df.to_json(date_format='iso', orient='split'), 'Updated'
import numpy as np from sklearn.model_selection import train_test_split from sklearn import neighbors from sklearn.covariance import EllipticEnvelope latitude = float(sys.argv[1]) longitude = float(sys.argv[2]) df = pd.read_csv( "C:/Users/Sangameswaran/WebstormProjects/Voldemort/pythonScripts/crime.csv" ) df = df.drop(['crimetime'], axis=1) X = np.array(df.drop(['type'], 1)) y = np.array(df['type']) elliptic = EllipticEnvelope(contamination=0.15) elliptic.fit(X) prediction = elliptic.predict([[latitude, longitude]]) if prediction == -1: possibility = "Safe zone" else: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf = neighbors.KNeighborsClassifier(n_neighbors=5) clf.fit(X_train, y_train) clf.score(X_test, y_test) val = np.array([[latitude, longitude]]) p = clf.predict(val) if p == 0: possibility = "Sexual abuse" elif p == 1:
# License: BSD
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston

# Get data. The dataset is loaded once and sliced twice instead of calling
# load_boston() for each feature pair.
boston_data = load_boston()['data']
X1 = boston_data[:, [8, 10]]  # two clusters
X2 = boston_data[:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
                                             contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
        EllipticEnvelope(contamination=0.261),
    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05),
}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers.
# xx1/yy1 is the evaluation grid for X1, xx2/yy2 the one for X2.
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
# dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
for i, (clf_name, clf) in enumerate(classifiers.items()):
    plt.figure(1)
    clf.fit(X1)
    # Evaluate the fitted frontier on every grid point, then restore the
    # grid shape.
    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
    Z1 = Z1.reshape(xx1.shape)
return criterion(y_pred, target.long()) seed = 1200 annotation_path = "../Data/data/preprocessed_annotation_global.csv" y = pd.read_csv(annotation_path)["label"] names = y.astype('category').cat.categories y = y.astype('category').cat.codes meth_path = "../Data/data/preprocessed_Matrix_meth.csv" mRNA_path = "../Data/data/preprocessed_Matrix_miRNA_deseq_correct.csv" mRNA_normalized_path = "../Data/data/preprocessed_Matrix_mRNA_deseq_normalized_prot_coding_correct.csv" files = [meth_path, mRNA_path, mRNA_normalized_path] outliers = [ LocalOutlierFactor(novelty=True), IsolationForest(), EllipticEnvelope(random_state=0), svm.OneClassSVM() ] filenames = ["meth", "mrna", "micro mrna"] modelnames = [ "mlp-local-outlier", "mlp-isolation-forest", "mlp-elliptic", "mlp-one-class" ] for modelname, outlier in zip(modelnames, outliers): for file, filename in zip(files, filenames): with open('../Data/outputs/' + filename + '-bnn-output.txt', 'w') as f: X = pd.read_csv(file, index_col=False, header=None) if (filename == "mrna"): X = pd.DataFrame(X[X.std().sort_values( ascending=False).head(1200).index].values.tolist()) X_train, X_test, y_train, y_test = train_test_split(
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
sys.path.append('../deployment/prediction/')
from src.utils import upload_model_to_s3
from src.config import MODEL_FILENAME, MODEL_EXTENSION, BUCKET_NAME, AWS_PROFILE, SCALER_FILENAME

df = pd.read_csv('../../data/raw/data_2020_05.csv', parse_dates=['ts'])
user_ids = df.user_id.unique()

min_hr = 1  # Values below this ignored
min_rr = 3  # Values below this ignored
savepath = '../../data/models/'

# Row conditions that do not depend on the user are evaluated once.
usable_rows = (df.in_room == True) & (df.hr > min_hr) & (df.rr > min_rr)

# Train and upload one scaler + envelope pair per user.
for user in user_ids:
    X = df.loc[usable_rows & (df.user_id == user), ['hr', 'rr']]
    # Unshuffled split: the first 80% of the rows are used for training.
    X_tr, X_va = train_test_split(X, test_size=0.2, shuffle=False)

    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)

    # Fit Gaussian to data to detect outliers
    el = EllipticEnvelope(contamination=0.12)
    el.fit(X_tr_scaled)

    for artifact, filename in ((el, MODEL_FILENAME),
                               (scaler, SCALER_FILENAME)):
        upload_model_to_s3(artifact, user, filename, savepath,
                           profile=AWS_PROFILE)
def model_monitor(country="total", dev=DEV, training=True):
    """Monitor model performance and data drift for one country.

    Optionally retrains the model, prints the RMSE of ``model_predict`` on
    simulated samples of several sizes, then compares further simulated
    samples against the scaled reference data using the Wasserstein distance
    and the share of points two ``EllipticEnvelope`` models flag as outliers.

    Args:
        country: Key into the datasets returned by ``engineer_features``.
        dev: Forwarded to ``engineer_features``, ``_model_train`` and
            ``model_predict``.
        training: Forwarded to ``engineer_features``; when true the model is
            retrained first.

    Returns:
        DataFrame with one row per sample size and the columns
        ``sample_size``, ``wasserstein_X``, ``wasserstein_y``,
        ``outlier_percent_X`` and ``outlier_percent_y``.
    """
    print("Monitor Model")

    # Import data.
    X, y, dates, labels = engineer_features(training=training,
                                            dev=dev)[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    # Train the model.
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    # Monitor RMSE on simulated samples of growing size.
    for n in (10, 20, 30, 50, 60):
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        y_pred = []
        for d in dates_new:
            prediction = model_predict(year=str(d.year),
                                       month=str(d.month),
                                       day=str(d.day),
                                       country=country,
                                       verbose=False,
                                       dev=dev)
            y_pred.append(prediction["y_pred"][0].round(2))
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))

    # Monitor performance on standardized features.
    X = StandardScaler().fit_transform(X)

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = defaultdict(list)
    for n in (25, 50, 75, 90):
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)

        results["sample_size"].append(n)
        dist_X = wasserstein_distance(X.flatten(), X_new.flatten())
        results['wasserstein_X'].append(np.round(dist_X, 2))
        dist_y = wasserstein_distance(y, y_new)
        results['wasserstein_y'].append(np.round(dist_y, 2))

        # Share of the simulated points that the envelopes do not label as
        # inliers (+1).
        flags_X = clf_X.predict(X_new)
        flags_y = clf_y.predict(y_new.reshape(y_new.size, 1))
        inlier_share_X = flags_X[flags_X == 1].size / flags_X.size
        inlier_share_y = flags_y[flags_y == 1].size / flags_y.size
        results["outlier_percent_X"].append(np.round(1.0 - inlier_share_X, 2))
        results["outlier_percent_y"].append(np.round(1.0 - inlier_share_y, 2))

    return pd.DataFrame(results)
def fuse_to_get_results(self, weights, num_comp):
    """Fuse the scores of four one-class detectors and evaluate the result.

    ``weights`` holds, in order, the weights of EllipticEnvelope,
    IsolationForest, LocalOutlierFactor and One-Class SVM. A detector with
    weight 0 is not trained; its scores are replaced by
    ``self.fill_sc_with_zero(...)``. Every fused score below
    ``self.threshold`` is labelled -1, all others +1.

    Args:
        weights: Four numeric weights, one per detector.
        num_comp: Number of components passed to ``self.apply_pca``; PCA is
            only applied when the EllipticEnvelope weight is non-zero.

    Returns:
        ``(far, frr, pr, final_score_table)`` where the table lists the four
        normalized score vectors, the fused scores and the actual labels.
    """

    def _test_labels():
        # Actual labels of the genuine test set followed by the impostor set.
        return np.concatenate(
            (self.get_gen_ts_labels(), self.get_imp_ts_labels()))

    def _placeholder_scores():
        return self.fill_sc_with_zero(_test_labels())

    def _scaled_test_scores(clf):
        # Fit on the genuine training data, score both test sets (the lower
        # the score, the more abnormal) and normalize the concatenation.
        clf.fit(self.gen_tr_data)
        gen_scores = clf.decision_function(self.gen_ts_data)
        imp_scores = clf.decision_function(self.imp_ts_data)
        return self.mymm_scaler(np.concatenate((gen_scores, imp_scores)))

    w_ee, w_if, w_lof, w_svm = (weights[0], weights[1], weights[2],
                                weights[3])

    # --- Elliptic envelope ---------------------------------------------
    if w_ee != 0:
        # Apply PCA before using the envelope -- it is very sensitive to the
        # feature dimensions.
        self.apply_pca(num_comp)
        envelope = EllipticEnvelope(store_precision=True,
                                    assume_centered=False,
                                    support_fraction=0.25,
                                    contamination=0.1,
                                    random_state=True)
        norm_scores_ee = _scaled_test_scores(envelope)
    else:
        norm_scores_ee = _placeholder_scores()

    # --- Isolation forest ----------------------------------------------
    if w_if != 0:
        forest = IsolationForest(max_samples="auto", contamination=0.2,
                                 random_state=True)
        norm_scores_if = _scaled_test_scores(forest)
    else:
        norm_scores_if = _placeholder_scores()

    # --- Local outlier factor ------------------------------------------
    if w_lof != 0:
        lof = LocalOutlierFactor(n_neighbors=35, metric='l2',
                                 contamination=0.25)
        # LOF is fitted on training and test data together; only the
        # predictions for the test part are kept.
        stacked = np.concatenate(
            (np.concatenate((self.gen_tr_data, self.gen_ts_data)),
             self.imp_ts_data))
        all_predictions = lof.fit_predict(stacked)
        test_predictions = all_predictions[len(self.gen_tr_data):]
        norm_scores_lof = self.mymm_scaler(test_predictions)
    else:
        norm_scores_lof = _placeholder_scores()

    # --- One-class SVM -------------------------------------------------
    if w_svm != 0:
        one_class_svm = svm.OneClassSVM(kernel='rbf', degree=3, gamma=0.001,
                                        coef0=0.0, tol=0.00001, nu=0.001,
                                        shrinking=True, cache_size=200,
                                        verbose=False, max_iter=-1,
                                        random_state=True)
        norm_scores_svm = _scaled_test_scores(one_class_svm)
    else:
        norm_scores_svm = _placeholder_scores()

    # --- Score-level fusion --------------------------------------------
    pred_ts_labels = []
    fused_scores = []
    per_sample = zip(norm_scores_ee, norm_scores_if, norm_scores_lof,
                     norm_scores_svm)
    for ees, ifs, lofs, svms in per_sample:
        weighted_sum = w_ee * ees + w_if * ifs + w_lof * lofs + w_svm * svms
        cfscore = weighted_sum / sum(weights)
        fused_scores.append(cfscore)
        pred_ts_labels.append(-1 if cfscore < self.threshold else 1)

    act_ts_labels = _test_labels()
    tn, fp, fn, tp = confusion_matrix(act_ts_labels, pred_ts_labels).ravel()
    far = fp / (fp + tn)
    frr = fn / (fn + tp)
    pr = tp / (tp + fp)
    final_score_table = [
        norm_scores_ee, norm_scores_if, norm_scores_lof, norm_scores_svm,
        fused_scores, act_ts_labels
    ]

    # Elliptic envelope scores
    print(norm_scores_ee)
    # Isolation forest scores
    print(norm_scores_if)
    # LOF values are labels, not scores
    print(norm_scores_lof)
    # One-class SVM scores
    print(norm_scores_svm)
    # The fused values are scores as well
    print(fused_scores)
    # Actual labels
    print(act_ts_labels)
    return far, frr, pr, final_score_table
def anomaly_detection_ex8_ng(data_path='data/ex8data1.mat', percentile=1.9,
                             search_threshold=False, plot_contours=False):
    """Run anomaly detection on the example from Andrew Ng's Coursera course.

    Fits an ``EllipticEnvelope`` on ``X``, flags as outliers the points whose
    decision-function value lies below the given percentile, and plots them.
    With the default arguments the function behaves exactly as before; the
    two previously dead ``if False:`` sections are now opt-in.

    Args:
        data_path: ``.mat`` file providing the ``X``, ``Xval`` and ``yval``
            arrays.
        percentile: Percentile of the training scores used as the outlier
            threshold.
        search_threshold: When true, scan percentiles from 0.1 to 10 on the
            validation set and print the F1 score of each. The best value is
            only reported, not applied.
        plot_contours: When true, draw the decision-function contours at
            ``percentile``, 1.0 and 0.5 in the second subplot.
    """
    # =====================
    # load data
    dataset = loadmat(data_path)
    print(dataset.keys())
    X = dataset['X']
    print('X:', X.shape, X[0, :])
    Xval = dataset['Xval']
    print('X_val:', Xval.shape, Xval[0, :])
    yval = dataset['yval']
    print('y_val:', yval.shape, yval[0, :])

    # =====================
    # display
    fig = plt.figure(facecolor='white')
    fig.add_subplot(2, 2, 1)
    plt.scatter(X[:, 0], X[:, 1], c='k')
    plt.title("Outlier detection")
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')

    # =====================
    # detecting outliers in a Gaussian distributed dataset.
    clf = EllipticEnvelope()
    clf.fit(X)
    # Calculate the decision function and use a threshold to determine
    # outliers.
    y_pred = clf.decision_function(X).ravel()

    # =====================
    # find best threshold for outlier detection (opt-in)
    if search_threshold:
        # The validation scores do not depend on the candidate percentile,
        # so they are computed once outside the loop.
        Xval_pred = clf.decision_function(Xval)
        best_f1 = 0.0
        best_perc = 0.0
        for perc in np.linspace(0.1, 10.0, num=100):
            th = np.percentile(Xval_pred, perc)
            outl = Xval_pred < th
            f1score = f1_score(yval, outl)
            print('f1 score (', perc, '):', f1score)
            if best_f1 < f1score:
                best_f1 = f1score
                best_perc = perc
        print('best f1:', best_f1, ', best perc:', best_perc)

    # set threshold for outlier detection
    threshold = np.percentile(y_pred, percentile)
    outliers = y_pred < threshold

    # =====================
    # plot contours (opt-in)
    fig.add_subplot(2, 2, 2)
    if plot_contours:
        # create the grid for plotting
        xx, yy = np.meshgrid(np.linspace(0, 25, 200),
                             np.linspace(0, 30, 200))
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        for contour_percentile in (percentile, 1.0, 0.5):
            level = np.percentile(y_pred, contour_percentile)
            plt.contour(xx, yy, Z, levels=[level], linewidths=2,
                        colors='blue', linestyles='dotted')

    # plot outliers
    plt.scatter(X[:, 0], X[:, 1], c='k')
    plt.scatter(X[outliers, 0], X[outliers, 1], c='r')
    print('num outliers:', int(outliers.sum()))

    plt.show()
def main():
    """Cross-validate anomaly detectors on the valve readings.

    Reads the 'ValveReadings' frame through ``DataManager``, binarises
    ``selectedFault`` (20 becomes 0 = normal, everything else 1 = faulty),
    standardises the six sensor columns and runs a ``cv_folds``-fold cross
    validation for each classifier, printing the mean and spread of every
    scoring metric followed by a summary of the sample counts.

    Returns:
        None. Results are printed; the data manager is always closed.
    """
    # Variables setting
    columnArrangement = ['id', 'selectedFault', 'faultType', 'faultIntensity',
                         'externalControllerOutput', 'pressureValveInlet',
                         'pressureValveOutlet', 'disturbedMediumFlow',
                         'mediumTemperature', 'rodDisplacement']
    ellipticEnvelopeContamination = 0.04
    classifiers = {'DummyClf': DummyClassifier(),
                   'EllipticEnvelope': EllipticEnvelope(contamination=ellipticEnvelopeContamination)}
    pd.options.mode.chained_assignment = None
    random_seed = 0  # Change this to make it really random, 0 for testing purposes
    cv_folds = 4
    desiredComponents = ['Valve']
    scoringMetrics = ['precision_macro', 'recall_macro', 'f1_macro']
    startDateTime = datetime(2017, 11, 6, hour=0, minute=0, second=0, microsecond=0)
    endDateTime = datetime(2017, 11, 16, hour=0, minute=0, second=0, microsecond=0)

    dataManager = DataManager(user="******", password="******", engineType="mysql+mysqldb://",
                              dbName="damadics", host="localhost", port="3306")
    try:
        # Data acquisition and formatting
        dataFrames = dataManager.readData(startDateTime, endDateTime, desiredComponents)
        df = dataFrames['ValveReadings']
        df = dataManager.reshapeAndCleanDataFrame(df)
        df = df[columnArrangement]  # Rearrange columns

        X_raw = df[['externalControllerOutput', 'disturbedMediumFlow', 'pressureValveInlet',
                    'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement']]

        # Binarise the target. The order matters: every value other than 20
        # is first mapped to 1, then the remaining 20s are mapped to 0.
        df.loc[df['selectedFault'] != 20, 'selectedFault'] = 1
        df.loc[df['selectedFault'] == 20, 'selectedFault'] = 0

        totalCount = df.shape[0]
        faultCount = df.loc[df['selectedFault'] == 1, 'selectedFault'].shape[0]
        nonFaultCount = totalCount - faultCount
        # Avoid ZeroDivisionError when the window contains no normal samples.
        faultNonFaultRatio = faultCount / nonFaultCount if nonFaultCount else float('nan')

        y_raw = df['selectedFault']

        # Anomaly detection
        # First standardize the data
        X_transformed = StandardScaler().fit_transform(X_raw)
        X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_raw,
                                                            random_state=random_seed)

        # EllipticEnvelope predicts -1 for outliers and 1 for inliers, so it
        # is scored against labels in that encoding. Every other classifier
        # keeps the 0/1 labels, whatever the iteration order of the dict.
        y_trains = {'EllipticEnvelope': [-1 if y == 1 else 1 for y in y_train]}

        print('Performing cross validations')
        for classifierKey, clf in classifiers.items():
            print('\nResults for {} classifier'.format(classifierKey))
            fit_labels = y_trains.get(classifierKey)
            if fit_labels is None:
                fit_labels = y_train

            cv_scores = cross_validate(clf, X_train, fit_labels,
                                       scoring=scoringMetrics, cv=cv_folds)

            print('{}-fold cross validation'.format(cv_folds))
            for key in cv_scores:
                print("For metric %s Accuracy: %0.5f (+/- %0.5f)"
                      % (key, cv_scores[key].mean(), cv_scores[key].std() * 2))

        print('\nTotal sample size {}, Train Size {}, Test Size {}'.format(
            X_raw.shape[0], X_train.shape[0], X_test.shape[0]))
        print('Total sample size {}, Faulty samples {}, Normal samples {}, Fault/Non Fault Ratio {:.4f}'.
              format(totalCount, faultCount, nonFaultCount, faultNonFaultRatio))
    finally:
        # Release the data manager even if reading or validation fails.
        dataManager.endDataManager()
# Generate labels, 1 for inliers and -1 for outliers
labels = np.ones(num_samples, dtype=int)
# Slice from an explicit start index: with num_outliers == 0 a start of "-0"
# would select every element instead of none. x is compared element-wise with
# labels further down, so it is assumed to hold num_samples rows as well.
first_outlier = num_samples - num_outliers
labels[first_outlier:] = -1

# plt.figure()
inlier_plot = plt.plot(x[:num_inliers, 0], x[:num_inliers, 1], 'go', label='inliers')
outlier_plot = plt.plot(x[first_outlier:, 0], x[first_outlier:, 1], 'ko', label='outliers')
plt.xlim(-11, 11)
plt.ylim(-11, 11)
plt.legend(numpoints=1)
# plt.show()
plt.savefig(pdf, format='pdf')

## Applying sklearn.covariance.EllipticEnvelope
classifier = EllipticEnvelope(contamination=outlier_ratio)
classifier.fit(x)
y_pred = classifier.predict(x)
# np.sum counts the mismatches in one vectorised pass.
num_errors = int(np.sum(y_pred != labels))
print('Number of errors fitting Elliptic Envelope to Gaussian distribution: {}'.format(num_errors))

xx, yy, Z, threshold = output(x, outlier_ratio)

# plt.figure()
inlier_plot = plt.plot(x[:num_inliers, 0], x[:num_inliers, 1], 'go', label='inliers')
outlier_plot = plt.plot(x[first_outlier:, 0], x[first_outlier:, 1], 'ko', label='outliers')
# Decision frontier, graded levels below it, and a filled region above it.
plt.contour(xx, yy, Z, levels=[threshold], linewidths=5, colors='gray')
plt.contour(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 10), cmap=plt.cm.Greys_r)
plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='gray')
plt.xlim(-11, 11)
def assign_to_nearest(samples, centroids, label2id):
    """Assign every sample to a cluster and rebuild the label-to-cluster map.

    Besides its arguments the function reads the module-level names
    ``train_y``, ``supervised``, ``unsupervised``, ``n_cluster`` and
    ``drop_cat``.

    Args:
        samples: Output of the text-CNN, one row per input sample.
        centroids: Centroid of each cluster, shape=(cluster_num, output_dim).
        label2id: Mapping from label to cluster id; used for the outlier
            check and replaced by a freshly computed mapping on return.

    Returns:
        nearest: ID of the cluster closest to each input sample,
            shape=(data_num, ). Unlabelled samples rejected by the outlier
            check carry the cluster id of ``drop_cat``.
        label2id: Rebuilt mapping from label to cluster id.
    """
    from scipy.optimize import linear_sum_assignment
    from sklearn.covariance import EllipticEnvelope

    # 1-1. Tie each sample to its closest cluster with a 1-nearest-neighbour
    # classifier fitted on the centroids.
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(centroids, np.arange(len(centroids)))
    nearest = neigh.predict(samples)

    # 1-1.5 Reject unlabelled samples that are too far from their cluster.
    outliers_fraction = 0.05
    sup_samples = samples[supervised]
    unsup_samples = samples[unsupervised]
    sup_labels = train_y[supervised]
    sup_ids = np.array([label2id[label] for label in sup_labels])
    unsup_ids = nearest[unsupervised]
    # Positions of the unlabelled samples inside ``nearest``. The indices from
    # np.where below are relative to the unlabelled subset and must be mapped
    # back through this array before writing into ``nearest``.
    unsup_positions = np.arange(len(nearest))[unsupervised]
    drop_id = label2id[drop_cat]
    for i in range(n_cluster):
        if i == drop_id:
            continue
        ind = np.where(sup_ids == i)[0]
        unind = np.where(unsup_ids == i)[0]
        # Nothing to learn from, or nothing to check: leave the cluster as is.
        if len(ind) == 0 or len(unind) == 0:
            continue
        clf = EllipticEnvelope(contamination=outliers_fraction)
        # Train on the labelled data of this cluster.
        clf.fit(sup_samples[ind])
        # Check that unlabelled samples are not too far away even though this
        # cluster is their nearest neighbour.
        r = clf.predict(unsup_samples[unind])
        nearest[unsup_positions[unind[r < 0]]] = drop_id

    # 1-2. Tie labelled data to centroids with the Hungarian algorithm.
    hglabel = np.unique(sup_labels)
    # Mean point (centroid) of the labelled data for every label.
    hgx = np.array([np.mean(sup_samples[np.where(sup_labels == label)[0]], axis=0)
                    for label in hglabel])

    # Distance matrix between per-label centroids and the current centroids.
    DistanceMatrix = np.linalg.norm(hgx[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2)
    # Hungarian algorithm: pick the pairing with the smallest total distance.
    row_ind, col_ind = linear_sum_assignment(DistanceMatrix)
    # Tie labels to cluster ids. Pairing through row_ind stays correct when
    # the cost matrix has more rows than columns.
    label2id = {hglabel[row]: col for row, col in zip(row_ind, col_ind)}
    # The dropped category takes a cluster id no label was assigned to.
    label2id[drop_cat] = list(set(range(n_cluster)) - set(label2id.values()))[0]
    return nearest, label2id
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Draw negative contour levels with solid lines
matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# Define the outlier/anomaly detection methods to compare; each one is
# configured with the same expected outlier share (outliers_fraction)
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define the data sets; only the inlier points are generated here
# (n_samples=n_inliers), each call keeps element [0] of make_blobs' result
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
# Using pandas: wrap the feature matrix in a two-column DataFrame
df = pd.DataFrame(features, columns=['feature_1', 'feature-2'])
# print(df.apply(add_ten))

# Detecting outliers
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)
# Overwrite both coordinates of the first observation with an extreme value
features[0, :] = 10000
outlier_detector = EllipticEnvelope(contamination=.1).fit(features)
outlier_detector.predict(features)

feature = features[:, 0]


def indicies_of_outliers(x):
    """Return ``np.where`` indices of values outside the 1.5 * IQR fences.

    The fences are placed 1.5 interquartile ranges below the 25th and above
    the 75th percentile of ``x``.
    """
    first_quartile, third_quartile = np.percentile(x, [25, 75])
    spread = third_quartile - first_quartile
    fence_low = first_quartile - (spread * 1.5)
    fence_high = third_quartile + (spread * 1.5)
    outside = (x > fence_high) | (x < fence_low)
    return np.where(outside)

# print(indicies_of_outliers(feature))
model_LocalOutlierFactor = LocalOutlierFactor(n_neighbors=20, contamination=0.2, novelty=True, leaf_size=10) # define model model_svm = svm.OneClassSVM(nu=0.2, gamma='scale') model_isolation = IsolationForest( contamination=0.2, random_state=42, behaviour='new', n_estimators=100, ) # define model model_EllipticEnvelope = EllipticEnvelope(contamination=0.2, support_fraction=1) #f.write('LOF'+'\n') #calculate_accuracies(model_LocalOutlierFactor, test_data, fitter, number, 100, f, LOF) #f.write('Isolation forest'+'\n') #calculate_accuracies(model_isolation, test_data, fitter, number, 100, f, iso) #f.write('Envelop'+'\n') #calculate_accuracies(model_EllipticEnvelope, test_data, fitter, number, 100, f, envelop) ### raw data test_data = np.concatenate((no_mod[number:], mod[number:])) model_LocalOutlierFactor = LocalOutlierFactor(n_neighbors=20, contamination=0.2,
def ElliEnvelope(data, contamination=0.1):
    """Return the rows of ``data`` that an Elliptic Envelope flags as outliers.

    Args:
        data: Sample matrix, one observation per row. It must support
            boolean-mask row selection (``data[mask]``).
        contamination: Expected proportion of outliers, passed straight to
            ``EllipticEnvelope``. Defaults to the previously hard-coded 0.1.

    Returns:
        The subset of ``data`` whose prediction is -1 (outlier).
    """
    clf = EllipticEnvelope(contamination=contamination)
    # fit() returns the estimator itself, so fitting and labelling chain.
    outlier_pre = clf.fit(data).predict(data)
    return data[outlier_pre == -1]
mse = mean_squared_error(y_test, yhat)
rmse = math.sqrt(mse)
r2score = r2_score(y_test, yhat)
print('MAE: %.3f' % mae)
print('MSE: %.3f' % mse)
print('RMSE: %.3f' % rmse)
print('r2 Score: %.3f' % r2score)
print("IsolationForest Complete Duration: --- %s seconds ---" % (time.time() - sttime))

#===========================================
# evaluate model performance with outliers removed using elliptical envelope
# identify outliers in the training dataset
sttime = time.time()
print("EllipticEnvelope")
# support_fraction is a proportion of the samples and must not exceed 1; the
# former value of 1.7 is outside that range. 1.0 (use every sample) is the
# closest valid setting. Omit the argument to let the estimator pick its
# robust default instead.
ee = EllipticEnvelope(contamination=0.01, support_fraction=1.0)
yhat = ee.fit_predict(X_train)
print(yhat)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
import pandas as pd
import numpy as np
#from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
import sys
import warnings

# Silence warnings unless the interpreter was started with explicit -W options
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# (display name, estimator) pairs, evaluated in this order
algorithms = [
    #("Local Outlier Factor", LocalOutlierFactor()),
    ("One Class SVM", OneClassSVM()),
    ("Isolation Forest", IsolationForest()),
    ("Elliptical Envelope", EllipticEnvelope()),
]

white = pd.read_csv('C:/Users/gajja/Desktop/winequality-white.csv', sep=';')
X = white

for name, detector in algorithms:
    # The Local Outlier Factor entry is labelled through fit_predict;
    # every other detector is fitted first and then asked to predict.
    if name != "Local Outlier Factor":
        labels = detector.fit(X).predict(X)
    else:
        labels = detector.fit_predict(X)
    # Keep only the -1 labels, one list entry per flagged row
    outliers = [label for label in labels if label == -1]
    print(*outliers, sep='\n')
    print(name, ':', len(outliers), 'potential outliers detected.')
# Candidate values per search key. Keys containing "__" look like nested
# parameter paths (step__parameter) -- presumably consumed by a grid/random
# search over a pipeline; verify against the enclosing definition.
# "mirror" step: either the Mirror transformer or None
"mirror": [pre.couples_raw.Mirror(), None],
# "cluster" step: four clustering back-ends wrapped in Cluster, or Pass()
"cluster": [
    pre.couples_xy.Cluster(SpectralClustering()),
    pre.couples_xy.Cluster(MiniBatchKMeans()),
    pre.couples_xy.Cluster(AgglomerativeClustering()),
    pre.couples_xy.Cluster(Birch()),
    pre.transformers.Pass(),
],
"cluster__clusterer__n_clusters": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26],
"cluster__replace": [True],
# "sanitize" step: SanitizeStartEnd or Pass(), with two candidate outlier
# detectors and a range of contamination values
"sanitize": [
    pre.couples_xy.SanitizeStartEnd(),
    pre.transformers.Pass(),
],
"sanitize__alg": [IsolationForest(), EllipticEnvelope()],
"sanitize__contamination": [.01, .03, .05, .07, .1, .15, .2, .3, .4, .5],
# Dimensionality-reduction candidates for "form_data"; each estimator is
# listed once per n_components value instead of using a separate key
"form_data__alg": [
    None,  # I have to explicitly specify n_components for each because of this
    TSNE(n_components=3),
    LocallyLinearEmbedding(n_components=3),
    LocallyLinearEmbedding(n_components=4),
    Isomap(n_components=3),
    Isomap(n_components=4),
    MDS(n_components=3),
    MDS(n_components=4),
    SpectralEmbedding(n_components=3),
    SpectralEmbedding(n_components=4),
    PCA(n_components=3),
    PCA(n_components=4),
],
# Column-wise sums of ``temp`` and the same values in ascending order
feature_sum = np.sum(temp, axis=0)
sum_sort = np.sort(feature_sum)
# sum_sort[0], [1] and [2] are the three smallest sums; each lookup takes the
# first feature whose sum equals that value, so tied sums would resolve to the
# same feature name
pca1 = feature_names[np.where(feature_sum == sum_sort[0])[0][0]]
pca2 = feature_names[np.where(feature_sum == sum_sort[1])[0][0]]
pca3 = feature_names[np.where(feature_sum == sum_sort[2])[0][0]]
print(f"selected features :{pca1}, {pca2}, {pca3}")
""" Result -> Performance """
""" One class Svm """
# (display name, estimator) pairs of the scikit-learn detectors
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope()),
    ("One-Class SVM", svm.OneClassSVM(kernel="rbf", gamma=0.001)),
    ("Isolation Forest", IsolationForest(random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor())
]
# Append the rows of features_pca below the existing rows of X
X = np.append(arr=X, values=features_pca, axis=0)
X_num = X.shape[0]
# Base detectors handed to the SUOD ensemble
base_estimators = [LOF(), IForest(), OCSVM(kernel="rbf", gamma=0.001)]
model = SUOD(
    base_estimators=base_estimators,
    n_jobs=2,  # number of workers(if -1 it use full core)
    rp_flag_global=True,  # global flag for random projection
    bps_flag=True,  # global flag for balanced parallel scheduling
    approx_flag_global=False,  # global flag for model approximation
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
# Explicit start index: a "-0" start would zero the whole array if
# n_outliers were ever 0.
ground_truth[n_samples - n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation
    # randn needs integer dimensions; "0.5 * n_inliers" is a float and is
    # rejected by current NumPy. The second cluster takes the remainder so
    # the two halves always add up to n_inliers.
    half_inliers = n_inliers // 2
    X1 = 0.3 * np.random.randn(half_inliers, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers - half_inliers, 2) + offset
# Share of all recorded cycles that were usage cycles
total_cycles = df["usage_cycles"] + df["non_usage_cycles"]
df["usage proportion"] = df["usage_cycles"] / total_cycles

# The two measures as separate arrays and as one two-column array
XY = df[["usage proportion", "usage_percentage"]].values
X = df["usage proportion"].values
Y = df["usage_percentage"].values

# Example settings
clusters_separation = [0]
outliers_fraction = 0.2
n_samples = df.shape[0]

# Outlier detection tool to be evaluated (robust covariance only)
classifiers = {"robust covariance estimator": EllipticEnvelope(contamination=.1)}

# Expected inlier / outlier counts under the settings above
#xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
n_outliers = int(outliers_fraction * n_samples)
n_inliers = int((1. - outliers_fraction) * n_samples)

# Fixed seed for reproducibility
np.random.seed(42)

# Stand-alone robust covariance detector with the same contamination as the
# dictionary entry above
#plt.figure(figsize=(10, 5))
clf = EllipticEnvelope(contamination=.1)
def get_elliptic_envelope(X1):
    """Plot the outlier frontiers of three Elliptic Envelope variants.

    Each variant is fitted to ``X1`` (two columns are read) and the zero
    level of its decision function is drawn as a coloured contour on
    figure 1, on top of a scatter plot of the data. The call ends in
    ``plt.show()`` and returns nothing.
    """
    # (display name, contour colour, estimator) for every frontier
    frontier_specs = [
        ("Elliptic Envelope", 'firebrick', EllipticEnvelope()),
        ("Empirical Covariance", 'gold',
         EllipticEnvelope(support_fraction=1., contamination=0.261)),
        ("Robust Covariance (Minimum Covariance Determinant)", 'mediumorchid',
         EllipticEnvelope(contamination=0.261)),
    ]

    # Multiplicative margins applied to the data range on each axis
    x_margin = [0.995, 1.001]
    y_margin = [0.9999, 1.0001]
    lower = np.min(X1, axis=0)
    upper = np.max(X1, axis=0)

    # Meshgrid on which the decision functions are evaluated (500 x 500)
    x_ticks = np.linspace(lower[0] * x_margin[0], upper[0] * x_margin[1], 500)
    y_ticks = np.linspace(lower[1] * y_margin[0], upper[1] * y_margin[1], 500)
    xx1, yy1 = np.meshgrid(x_ticks, y_ticks)

    # Fit every estimator and draw its frontier (decision value 0)
    for _label, colour, estimator in frontier_specs:
        plt.figure(1)
        estimator.fit(X1)
        grid_scores = estimator.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        Z1 = grid_scores.reshape(xx1.shape)
        plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colour)

    # Data points, title, axis limits and labels
    plt.figure(1)
    plt.title("Outlier detection on NBA Players")
    plt.scatter(X1[:, 0], X1[:, 1], color='royalblue')
    plt.xlim((xx1.min(), xx1.max()))
    plt.ylim((yy1.min(), yy1.max()))
    plt.ylabel("Latitude")
    plt.xlabel("Longitude")

    plt.show()