def predict_EllipticEnvelope(X, fraction_outlier):
    """Fit an EllipticEnvelope on ``X`` and plot its predictions.

    Two figures are written through the plotting helper ``P``: a filled
    contour of the inlier/outlier prediction over the mesh returned by
    ``get_meshgrid(X)``, and a scatter of the samples coloured by label.

    Args:
        X: 2-D feature matrix with two columns (grid points are built as
            (x, y) pairs and fed to the same estimator).
        fraction_outlier: Passed to ``EllipticEnvelope`` as ``contamination``.

    Returns:
        None.
    """
    xx, yy = get_meshgrid(X)
    x1, x2 = xx.min(), xx.max()
    y1, y2 = yy.min(), yy.max()
    # Plot margin: 10% of the x extent. The previous expression subtracted
    # x2 from itself, so the margin was always zero.
    d = (x2 - x1) * 0.1

    A = EllipticEnvelope(contamination=fraction_outlier)
    A.fit(X)
    Y = A.predict(X)

    # Predict the whole grid in one call and follow the mesh shape instead of
    # assuming a 100 x 100 grid.
    grid_points = numpy.c_[xx.flatten(), yy.flatten()]
    grid_confidence = A.predict(grid_points).astype(int).reshape(xx.shape)

    x_range = [x1 - d, x2 + d]
    y_range = [y1 - d, y2 + d]
    P.plot_contourf(X[Y > 0], X[Y <= 0], xx, yy, grid_confidence,
                    x_range=x_range, y_range=y_range,
                    filename_out='4_pred_EllipticEnvelope_density.png')
    P.plot_2D_features_multi_Y(X, -Y, x_range=x_range, y_range=y_range,
                               filename_out='4_pred_EllipticEnvelope.png')
    return
class EllipticEnvelope_Classifier:
    """Wrapper around ``EllipticEnvelope`` with its own output directory."""

    def __init__(self, save_path):
        """Create ``<save_path>/EllipticEnvelope`` and build the estimator.

        Args:
            save_path: Parent directory for this model's output folder.
        """
        # Output directory for this model; created when missing.
        self.save_path = os.path.join(save_path, 'EllipticEnvelope')
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.contamination = 0.1
        self.classifier = EllipticEnvelope(contamination=self.contamination)

    def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
        """Train the model and print an evaluation on the test set.

        Args:
            train_data_matrix: Training features; must expose ``toarray()``.
            test_data_matrix: Test features; must expose ``toarray()``.
            test_true_label: Ground-truth labels compared with the predictions.
        """
        train_data_matrix = train_data_matrix.toarray()
        test_data_matrix = test_data_matrix.toarray()
        self.classifier.fit(train_data_matrix)
        y_pred_label = self.classifier.predict(test_data_matrix)
        accuracy, classification_report, _ = sklearn_evaluation(
            test_true_label, y_pred_label)
        print('Accuracy: {} \nClassification Report:\n{}\n'.format(
            accuracy, classification_report))
        sys.stdout.flush()

    def test_model(self, test_data):
        """Score ``test_data`` with the fitted model.

        The previous version lacked ``self`` and referenced undefined names,
        so it could never run.

        Args:
            test_data: Feature matrix to score.

        Returns:
            Tuple ``(scores, labels, n_outliers)``: the decision-function
            values, the predicted labels (such as ``[1, 1, -1, ...]``) and
            the number of samples predicted as ``-1``.
        """
        scores_pred = self.classifier.decision_function(test_data)
        y_pred = self.classifier.predict(test_data)
        n_outliers = y_pred[y_pred == -1].size
        return scores_pred, y_pred, n_outliers
def compare_drift(X_src, y_src, X_new, y_new):
    """Compare a new batch with the source data.

    Fits one envelope on the source features and one on the source target,
    then reports the Wasserstein distance between source and new values and
    the share of new samples each envelope labels as ``-1``.

    Args:
        X_src: Source features; must expose ``.values``.
        y_src: Source target array.
        X_new: New features; must expose ``.values``.
        y_new: New target array.

    Returns:
        Dict with the two distances and the two outlier shares.
    """
    def _share_flagged(predicted):
        return len(predicted[predicted == -1]) / len(predicted)

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X_src)
    clf_y.fit(y_src.reshape(y_src.size, 1))

    flagged_X = clf_X.predict(X_new)
    flagged_y = clf_y.predict(y_new.reshape(-1, 1))

    return {
        'X_wasserstein_distance': wasserstein_distance(
            X_src.values.flatten(), X_new.values.flatten()),
        'y_wasserstein_distance': wasserstein_distance(
            y_src.flatten(), y_new.flatten()),
        'X_outlier_percentage': _share_flagged(flagged_X),
        'y_outlier_percentage': _share_flagged(flagged_y),
    }
def view_anomalies(df):
    """Flag price anomalies per Saturday-night class and plot them.

    Prices are split on ``srch_saturday_night_bool``; each group gets its own
    envelope (contamination 0.01). The labels are written to
    ``df['anomaly5']`` and shown as histograms and as a time series.
    The index of the caller's ``df`` is replaced with the one returned by
    ``reindex_data``.

    Args:
        df: Frame with ``srch_saturday_night_bool``, ``price_usd`` and
            ``date_time`` columns.
    """
    outliers_fraction = 0.01

    def _score_prices(prices):
        # Fit one envelope on a single price series and attach its outputs.
        model = EllipticEnvelope(contamination=outliers_fraction)
        column = prices.values.reshape(-1, 1)
        model.fit(column)
        scored = pd.DataFrame(prices)
        scored['deviation'] = model.decision_function(column)
        scored['anomaly'] = model.predict(column)
        return scored

    def _split_by_label(scored):
        # Return (normal prices, anomalous prices) for one scored group.
        normal = scored.loc[scored['anomaly'] == 1, 'price_usd']
        anomalous = scored.loc[scored['anomaly'] == -1, 'price_usd']
        return normal, anomalous

    data = reindex_data(df)
    df.index = data.index

    prices_weekday = df.loc[df['srch_saturday_night_bool'] == 0, 'price_usd']
    prices_saturday = df.loc[df['srch_saturday_night_bool'] == 1, 'price_usd']

    # Raw price distribution per class.
    fig, axs = plt.subplots(1, 2)
    prices_weekday.hist(ax=axs[0], bins=30)
    prices_saturday.hist(ax=axs[1], bins=30)

    scored_weekday = _score_prices(prices_weekday)
    scored_saturday = _score_prices(prices_saturday)

    # Price distribution per class with anomalies stacked in red.
    a0, b0 = _split_by_label(scored_weekday)
    a2, b2 = _split_by_label(scored_saturday)
    fig, axs = plt.subplots(1, 2)
    axs[0].hist([a0, b0], bins=32, stacked=True, color=['blue', 'red'])
    axs[1].hist([a2, b2], bins=32, stacked=True, color=['blue', 'red'])
    axs[0].set_title("Search Non Saturday Night")
    axs[1].set_title("Search Saturday Night")

    df['anomaly5'] = pd.concat([scored_weekday, scored_saturday])['anomaly']

    # Time series with anomalies overlaid.
    fig, ax = plt.subplots(figsize=(10, 6))
    df = df.sort_values('date_time')
    df['date_time_int'] = pd.to_datetime(df['date_time']).astype('int64')
    flagged = df.loc[df['anomaly5'] == -1, ('date_time_int', 'price_usd')]
    ax.plot(df['date_time_int'], df['price_usd'], color='blue', label='Normal')
    ax.scatter(flagged['date_time_int'], flagged['price_usd'], color='red',
               label='Anomaly')
    plt.legend()

    # Overall distribution, anomalies stacked in red.
    normal_prices = df.loc[df['anomaly5'] == 1, 'price_usd']
    anomalous_prices = df.loc[df['anomaly5'] == -1, 'price_usd']
    fig, axs = plt.subplots(figsize=(10, 6))
    axs.hist([normal_prices, anomalous_prices], bins=32, stacked=True,
             color=['blue', 'red'])
    plt.show()
def oneClassSVM(self, encoded_imgs_test): encoded_imgs_list = encoded_imgs_test.tolist() print(encoded_imgs_list) # clf = OneClassSVM(gamma='auto', nu=self.nu).fit(encoded_imgs_list) clf = EllipticEnvelope(contamination=self.nu).fit( np.array(encoded_imgs_list)) print('test: ', clf.predict(encoded_imgs_list)) return clf.predict(encoded_imgs_list) '''
def model_monitor(country="all", dev=DEV, training=True):
    """Monitor model performance on simulated samples.

    Prints the RMSE of ``model_predict`` for several simulated sample sizes,
    then measures how far simulated samples drift from the scaled features
    and the target (Wasserstein distance and envelope outlier share).

    Args:
        country: Key into the datasets returned by ``engineer_features``.
        dev: Forwarded to ``_model_train`` and ``model_predict``.
        training: When true, the model is trained before monitoring.

    Returns:
        ``pandas.DataFrame`` with one row per sample size.
    """
    def _outlier_percent(predicted):
        return np.round(1.0 - (predicted[predicted == 1].size / predicted.size), 2)

    print("Monitor Model")

    # Load the data for the requested country.
    datasets = engineer_features(training=training)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    # RMSE on simulated samples.
    for n in (10, 20, 30, 50, 60):
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        y_pred = []
        for d in dates_new:
            prediction = model_predict(year=str(d.year), month=str(d.month),
                                       day=str(d.day), country=country,
                                       verbose=False, dev=dev)
            y_pred.append(prediction["y_pred"][0].round(2))
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))

    # Drift monitoring on standardised features.
    X = StandardScaler().fit_transform(X)
    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = defaultdict(list)
    for n in (25, 50, 75, 90):
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(
            np.round(wasserstein_distance(X.flatten(), X_new.flatten()), 2))
        results['wasserstein_y'].append(
            np.round(wasserstein_distance(y, y_new), 2))
        flags_X = clf_X.predict(X_new)
        flags_y = clf_y.predict(y_new.reshape(y_new.size, 1))
        results["outlier_percent_X"].append(_outlier_percent(flags_X))
        results["outlier_percent_y"].append(_outlier_percent(flags_y))

    return pd.DataFrame(results)
def ellepticEnvelopeAnomaly(df, outliersFraction):
    """Flag anomalies per category with one envelope each and plot them.

    For each of the categories 0-3 an ``EllipticEnvelope`` is fitted on the
    ``value`` column of that category. The labels are merged back into
    ``df['anomaly22']`` as 1 (anomaly) / 0 (normal) and drawn over time.

    Args:
        df: Frame with ``categories``, ``value`` and ``time_epoch`` columns.
        outliersFraction: ``contamination`` used for every envelope.

    Returns:
        The same ``df`` with the ``anomaly22`` column added.
    """
    scored_groups = []
    for category in (0, 1, 2, 3):
        values = df.loc[df['categories'] == category, 'value']
        envelope = EllipticEnvelope(contamination=outliersFraction)
        column = values.values.reshape(-1, 1)
        envelope.fit(column)
        scored = pd.DataFrame(values)
        scored['deviation'] = envelope.decision_function(column)
        scored['anomaly'] = envelope.predict(column)
        scored_groups.append(scored)

    # Merge the per-category labels back and recode -1 -> 1, everything else -> 0.
    df['anomaly22'] = pd.concat(scored_groups)['anomaly']
    df['anomaly22'] = np.array(df['anomaly22'] == -1).astype(int)

    # Anomalies over time.
    fig, ax = plt.subplots()
    flagged = df.loc[df['anomaly22'] == 1, ['time_epoch', 'value']]
    ax.plot(df['time_epoch'], df['value'], color='blue')
    ax.scatter(flagged['time_epoch'], flagged['value'], color='red')
    ax.set_title('Elliptic Envelope Multi Clustering')
    plt.show()
    return df
def EllipticEnvelop(X):
    """Label the rows of ``X`` with an envelope fitted on ``X`` itself.

    Uses a fixed contamination of 0.0001.

    Returns:
        The array returned by ``EllipticEnvelope.predict(X)``.
    """
    from sklearn.covariance import EllipticEnvelope

    Outlier_fraction = 0.0001
    detector = EllipticEnvelope(contamination=Outlier_fraction)
    detector.fit(X)
    return detector.predict(X)
def show(samplepath):
    """Detect abnormal samples in a feature file, list them and plot them.

    Reads ``<stem>_path.txt`` (one path per line) next to ``samplepath``,
    fits an envelope (contamination 0.05) on the normalised features, prints
    every sample labelled ``-1`` and draws the zero level of the decision
    function over the first two feature columns.

    Args:
        samplepath: Path of the feature file passed to
            ``load_one_class_feature``.
    """
    sname = os.path.splitext(samplepath)[0]
    # print(...) with one pre-formatted string gives the same output on
    # Python 2 and Python 3; the former print statements only parsed on 2.
    print(sname)
    with open(sname + "_path.txt", 'r') as f:
        paths = [line.strip() for line in f]

    X = load_one_class_feature(samplepath)
    X = norm_data(X)

    clf = EllipticEnvelope(contamination=0.05)
    clf.fit(X)
    Y = clf.predict(X)
    DY = clf.decision_function(X)

    # Negative labels mark the abnormal samples.
    for k in range(len(Y)):
        if Y[k] < 0:
            print('%s , %s , %s' % (k + 1, DY[k], paths[k]))
    err = np.sum([y < 0 for y in Y])
    print('%d/%d' % (err, len(Y)))

    x1, y1 = np.meshgrid(np.linspace(-20, 20, 400), np.linspace(-20, 20, 400))
    z1 = clf.decision_function(np.c_[x1.ravel(), y1.ravel()])
    z1 = z1.reshape(x1.shape)

    # contour() takes ``colors``; the former ``color`` keyword is not one of
    # its parameters, so the boundary was not drawn in red.
    boundary = plt.contour(x1, y1, z1, levels=[0], linewidths=2, colors='r')
    plt.scatter(X[:, 0], X[:, 1], color='black')
    # NOTE(review): ``.collections`` is deprecated in recent matplotlib
    # releases - confirm against the version this project targets.
    plt.legend([boundary.collections[0]], ['test'])
    plt.show()
def outliers_detection(expr):
    """Project ``expr`` onto two principal components and label each row.

    Returns:
        The array returned by ``EllipticEnvelope.predict`` on the projection.
    """
    projected = PCA(n_components=2).fit_transform(expr)
    detector = EllipticEnvelope()
    detector.fit(projected)
    return detector.predict(projected)
def EllipticEnvelopeDetection(clm_select, all_tss, df_data, plot=False):
    """Run an EllipticEnvelope on every series of every selected column.

    Args:
        clm_select: Sequence of column names to process.
        all_tss: Mapping ``column -> {kind: samples}``; each ``samples`` is
            converted with ``np.array`` and fitted separately.
        df_data: Frame providing ``'val'`` and the selected columns for the
            optional scatter plots.
        plot: When true, draw one subplot per column coloured by prediction.

    Returns:
        Dict ``column -> list`` of predictions, concatenated over the kinds
        in iteration order.
    """
    # NOTE(review): scikit-learn documents contamination as lying in
    # (0, 0.5]; 0.6 is kept to preserve behaviour - confirm the intended value.
    outliers_fraction = 0.6

    if plot:
        plt.figure()

    ee_pred = {}
    for i, col in enumerate(clm_select):
        ee_pred[col] = []
        for kind in all_tss[col].keys():
            X = np.array(all_tss[col][kind])
            # Elliptic envelope fitted on this series only.
            clf = EllipticEnvelope(contamination=outliers_fraction)
            clf.fit(X)
            ee_pred[col].extend(clf.predict(X))

        if plot:
            subplot = plt.subplot(len(clm_select), 1, i + 1)
            subplot.scatter(df_data['val'], df_data[col], c=ee_pred[col])
            subplot.set_title('Dimension ' + col)

    if plot:
        plt.suptitle('Outlier detection with one class EllipticEnvelope')
        plt.show()
    return ee_pred
class EllipticEnvelopeOutlierStream(OutlierStream):
    """``OutlierStream`` backed by an ``EllipticEnvelope`` (contamination 0.045)."""

    def __init__(self, data, data_stream):
        OutlierStream.__init__(self, data, data_stream)
        self.model = EllipticEnvelope(contamination=0.045)
        self.DEBUG = False
        self.pca_plot = StreamPCA()

    def train_model(self, data):
        """Fit the envelope on ``data``."""
        self.model.fit(data)

    def update_model(self, data):
        """Do nothing; the model is not updated incrementally."""
        return None

    def predict_model(self, data):
        """Return the envelope's labels for ``data``."""
        return self.model.predict(data)

    def summary(self, predictions, data_stream):
        """Print inlier/outlier counts and scatter the predictions."""
        import numpy as np

        inliers = sum(1 for label in predictions if label > 0)
        outliers = sum(1 for label in predictions if label < 0)
        print("Non outliers: {}".format(inliers))
        print("Outliers: {}".format(outliers))

        total = len(predictions)
        positions = np.linspace(0, total, total)
        plt.scatter(positions, predictions)
        plt.show()
def ellipticCurve(dataset):
    """Score ``dataset`` with an envelope and list the outlier rows.

    The contamination comes from the module-level ``outlierFraction``.

    Returns:
        Tuple ``(decision scores, positions of rows labelled -1)``.
    """
    detector = EllipticEnvelope(contamination=outlierFraction)
    detector.fit(dataset)
    predScore = detector.decision_function(dataset)
    labels = detector.predict(dataset)
    outlierRows = [row for row in range(len(labels)) if labels[row] == -1]
    return predScore, outlierRows
def envelop(self):
    """Train an envelope on the genuine training data and evaluate it.

    The model is fitted on ``self.gen_tr_data`` and applied to the genuine
    and impostor test sets. Apply PCA beforehand: the estimator is very
    sensitive to the number of feature dimensions.

    Returns:
        Tuple ``(far, frr, pr)`` computed from the confusion matrix as
        ``fp / (fp + tn)``, ``fn / (fn + tp)`` and ``tp / (tp + fp)``.
    """
    detector = EllipticEnvelope(
        store_precision=True,
        assume_centered=False,
        support_fraction=0.25,
        contamination=0.1,
        # NOTE(review): a bool is passed as the seed here - confirm an
        # integer seed was intended.
        random_state=True,
    )
    detector.fit(self.gen_tr_data)

    predicted = np.concatenate((
        detector.predict(self.gen_ts_data),
        detector.predict(self.imp_ts_data),
    ))
    actual = np.concatenate(
        (self.get_gen_ts_labels(), self.get_imp_ts_labels()))

    tn, fp, fn, tp = confusion_matrix(actual, predicted).ravel()
    far = fp / (fp + tn)
    frr = fn / (fn + tp)
    pr = tp / (tp + fp)
    return far, frr, pr
def outlier(TRAIN, contam):
    """Detect outliers in ``TRAIN`` and plot them on the two leading axes.

    Args:
        TRAIN: 2-D array of samples. It is median-centred IN PLACE, so the
            caller's array is modified.
        contam: ``contamination`` passed to ``EllipticEnvelope``.

    Returns:
        Array of labels recoded to 1 (inlier) / 0 (outlier).
    """
    # Centre each column on its median, writing into the caller's array.
    TRAIN[:, :] = TRAIN - np.median(TRAIN, axis=0)

    clf = EllipticEnvelope(support_fraction=1., contamination=contam,
                           assume_centered=True)
    clf.fit(TRAIN)
    C = clf.correct_covariance(TRAIN)
    pred = clf.predict(TRAIN)

    # Eigenvectors are the COLUMNS of the returned matrix and arrive in
    # ascending eigenvalue order. The previous code took the first two ROWS
    # of an unsorted decomposition, which is not the principal plane.
    eigenvalues, eigenvectors = np.linalg.eigh(C)
    leading = np.argsort(eigenvalues)[::-1][:2]
    X_hat = np.dot(TRAIN, eigenvectors[:, leading])

    # Map the labels {-1, 1} to {0, 1}.
    pred = (pred + 1) // 2
    plotting(X_hat, pred)
    return pred
def cov(self, X_train, contamination=None, random_state=None):
    """Train an Elliptic Envelope model from scikit-learn.

    Args:
        X_train: Scaled training data.
        contamination: Expected fraction of anomalies. ``None`` falls back
            to 0.1 (scikit-learn's documented default) instead of being
            forwarded to the estimator.
        random_state: Random number seed.

    Returns:
        Tuple ``(cov_anomaly_scores, labels)``: the negated decision-function
        values passed through ``self.min_max_scaler``, and labels recoded as
        1 for outliers and 0 for inliers.
    """
    if contamination is None:
        contamination = 0.1

    model = EllipticEnvelope(contamination=contamination,
                             random_state=random_state)
    model.fit(X_train)

    # predict() yields -1 for outliers and 1 for inliers; recode against the
    # constant 1 so the mapping also holds when no inlier is present.
    labels = model.predict(X_train)
    labels = (1 - labels) // 2

    # Negate so that larger values mean "more anomalous".
    cov_anomaly_scores = model.decision_function(X_train) * -1
    cov_anomaly_scores = self.min_max_scaler(cov_anomaly_scores)
    return cov_anomaly_scores, labels
def predict_AB(train, test, result, num, sshop):
    """Assign ``sshop`` to the test rows that look like the training rows.

    An ``EllipticEnvelope`` is fitted on the training features; test rows
    predicted as inliers (label 1) get ``shop_id = sshop`` and are appended
    to ``result``.

    Args:
        train: Training frame for one shop.
        test: Test frame; must contain ``row_id``.
        result: Frame with the ``row_id`` / ``shop_id`` pairs collected so far.
        num: Unused; kept so existing callers keep working.
        sshop: Shop id written to the accepted rows.

    Returns:
        ``result`` extended with the accepted rows, ``row_id`` cast to int.
    """
    filter_feature_train = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos',
                            'wifi_id_signal', 'shop_id']
    filter_feature_test = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos',
                           'wifi_id_signal']
    train = train.drop(filter_feature_train, axis=1).fillna(-999)
    test = test.drop(filter_feature_test, axis=1).fillna(-999)
    # Same feature columns and order as the training frame, plus row_id.
    test = test[list(train.columns)].join(test['row_id'])

    model = EllipticEnvelope()
    model.fit(train)
    test['label'] = model.predict(test.drop(['row_id'], axis=1))

    print('***************************', len(test))
    print(len(test[test['label'] == 1]))
    print('***************************')

    # Copy before writing: the former chained assignment
    # ``test['shop_id'][mask] = sshop`` wrote through a filtered frame, which
    # pandas does not guarantee to propagate.
    test = test[test['label'] == 1].copy()
    test['shop_id'] = sshop

    r = test[['row_id', 'shop_id']]
    result = pd.concat([result, r])
    result['row_id'] = result['row_id'].astype('int')
    return result
class EllipticDetection(BaseEstimator, TransformerMixin):
    """Transformer that blanks out outlier rows and imputes them.

    With ``contamination == 0`` the transformer is a no-op that returns a
    deep copy of its input.
    """

    def __init__(self, contamination=0):
        self.contamination = contamination

    def fit(self, X, y=None):
        """Fit the envelope on ``X`` unless the transformer is disabled."""
        if self.contamination == 0:
            return self
        self.ell = EllipticEnvelope(contamination=self.contamination)
        fit_args = (X,) if y is None else (X, y)
        self.ell.fit(*fit_args)
        return self

    def transform(self, X_):
        """Return a copy of ``X_`` with outlier rows replaced by imputed values."""
        X = deepcopy(X_)
        if self.contamination == 0:
            return X
        outlier_rows = self.ell.predict(X) == -1
        X[outlier_rows, :] = np.nan
        # The imputer is fitted on the very data being transformed.
        return SimpleImputer().fit_transform(X)
class Baseline(ModelBase):
    """Baseline anomaly detectors selected by a short model name.

    Recognised names: ``'svm'``, ``'if'``, ``'lof'``, ``'gm'`` and ``'ee'``.
    Any other name leaves ``self.model`` unset until ``load`` is called.
    """

    def __init__(self, model_name, packet_length=1500, seq_length=1, epochs=1):
        super().__init__(packet_length, seq_length, epochs)
        self.model_name = model_name
        # Factories keep construction lazy: only the requested model is built.
        factories = {
            'svm': lambda: OneClassSVM(kernel='rbf', nu=0.05),
            'if': lambda: IsolationForest(contamination=0.05, max_features=15,
                                          random_state=0),
            'lof': lambda: LocalOutlierFactor(contamination=0.05, novelty=True),
            'gm': lambda: GaussianMixture(random_state=0),
            'ee': lambda: EllipticEnvelope(contamination=0.05, random_state=0),
        }
        build = factories.get(model_name)
        if build is not None:
            self.model = build()

    def _pickle_path(self, name):
        # File name used by save / load / exist.
        return name + '_{}.pkl'.format(self.model_name)

    def fit(self, X):
        """Fit the selected model on ``X``."""
        self.model.fit(X)

    def predict(self, X):
        """Return ``(scores, labels)`` for ``X``."""
        labels = self.model.predict(X)
        scores = self.model.score_samples(X)
        return scores, labels

    def save(self, name):
        """Dump the model next to ``name``."""
        joblib.dump(self.model, self._pickle_path(name))

    def load(self, name):
        """Load the model previously saved under ``name``."""
        self.model = joblib.load(self._pickle_path(name))

    def exist(self, name):
        """Return whether a saved model exists for ``name``."""
        return os.path.exists(self._pickle_path(name))
def robustcovariance(nparray, contamination):
    """Flag outliers with a robust covariance estimate.

    ``covariance.EllipticEnvelope`` fits a robust covariance estimate to the
    data, i.e. an ellipse around the central points, ignoring points outside
    the central mode.

    References:
        Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the minimum
        covariance determinant estimator". Technometrics 41(3), 212 (1999)

    Args:
        nparray: Data convertible to a ``pandas.DataFrame``.
        contamination: Expected fraction of outliers.

    Returns:
        Integer array with 1 for outliers and 0 for inliers.
    """
    frame = pd.DataFrame(nparray)
    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(frame)
    raw_labels = detector.predict(frame)
    # predict() gives 1 / -1; recode to 0 / 1 while keeping the dtype.
    return (raw_labels == -1).astype(raw_labels.dtype)
def ellipses_indices_of_outliers(X, contamination=0.1):
    '''
    Detect outliers using the elliptical envelope method.

    Columns with 30 or fewer distinct values are treated as categorical and
    ignored. The input array is not modified.

    Input: a 2-D array of all variables to detect outliers for
    Output: the result of ``np.where`` on the outlier mask (a tuple holding
            the index array), or None when there are fewer rows than the
            squared number of remaining columns
    '''
    from sklearn.covariance import EllipticEnvelope

    # Work on a copy so the caller's array is left untouched.
    X = X.copy()

    # Keep only the columns with more than 30 distinct values.
    non_categorical = [
        feature for feature in range(X.shape[1])
        if len(np.unique(X[:, feature])) > 30
    ]
    X = X[:, non_categorical]

    # Too few samples for the number of features.
    if X.shape[0] < X.shape[1] ** 2.:
        print('Will not perform well. Reduce the dimensionality and try again.')
        return

    outlier_detector = EllipticEnvelope(contamination=contamination)
    outlier_detector.fit(X)
    return np.where(outlier_detector.predict(X) == -1)
def filter_outliers_in_features(X):
    """Return only the rows of ``X`` that the envelope labels as inliers.

    The envelope uses ``support_fraction=1`` and ``contamination=0.2`` and is
    fitted on ``X`` itself.
    """
    detector = EllipticEnvelope(support_fraction=1, contamination=0.2)
    detector.fit(X)
    inlier_mask = detector.predict(X) == 1
    return X[inlier_mask]
def oneClassSVM():
    """Export the rows flagged as outliers for every contamination level.

    For each value in the module-level ``percents`` an envelope is fitted on
    ``clean_matrix`` and applied to ``mixed_matrix``; the matching rows of
    ``orig_df`` are written to CSV and handed to ``parseResults``.
    Despite the name, the estimator is ``EllipticEnvelope``.
    """
    for percent in percents:
        detector = EllipticEnvelope(contamination=percent)
        detector.fit(clean_matrix)
        flagged = detector.predict(mixed_matrix) == -1
        results = orig_df[flagged]
        name = 'covariance_' + str(percent)
        results.to_csv('output/' + name + '_outliers.csv')
        parseResults(results, name)
class EllipticEnvelopeFilter(BaseEstimator):
    """Pipeline step that drops the samples an envelope labels as outliers."""

    def __init__(self, assume_centered=False, support_fraction=None,
                 contamination=0.1, random_state=None):
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.contamination = contamination
        self.random_state = random_state

    def fit_pipe(self, X, y=None):
        """Fit the envelope on ``X`` and return the filtered ``(X, y)``."""
        self.elliptic_envelope_ = EllipticEnvelope(**self.get_params())
        self.elliptic_envelope_.fit(X)
        return self.transform_pipe(X, y)

    def transform_pipe(self, X, y):
        """Return ``(X, y)`` restricted to the rows predicted as inliers."""
        # XXX: sample_props are not taken care of
        keep = self.elliptic_envelope_.predict(X) == 1
        y_out = None if y is None else y[keep]
        return X[keep], y_out

    def transform(self, X, y=None):
        """Return ``X`` unchanged."""
        return X
def treate_outliers(df, action="parallel", debug=True, remove=True):
    """Locate (and optionally drop) outlier rows of ``df``.

    Args:
        df: Frame to analyse. It is modified in place: the "colective" step
            replaces the categories of the last column with integer codes,
            and rows are dropped when ``remove`` is true.
        action: ``"colective"`` (EllipticEnvelope on all columns),
            ``"individual"`` (per-column 1.5 * IQR whiskers) or
            ``"parallel"`` (both combined).
        debug: Print the positions found per column in "individual" mode.
        remove: Drop the detected rows from ``df``.

    Returns:
        List of positional row indices flagged as outliers; it may contain
        duplicates when a row is flagged more than once.

    Raises:
        ValueError: If ``action`` is not one of the three supported values.
    """
    if action == "colective":
        # Encode the categories of the last column as integers. The column is
        # assigned back explicitly instead of using an in-place replace on a
        # column selection, which pandas does not guarantee to propagate.
        label_col = df.columns[-1]
        categories = df[label_col].unique()
        encoded = df[label_col]
        for code, category in enumerate(categories):
            encoded = encoded.replace(category, code)
        df[label_col] = encoded

        detection = EllipticEnvelope().fit(df).predict(df)
        outlier_positions_mah = [
            pos for pos, flag in enumerate(detection) if flag == -1
        ]
        if remove:
            df.drop(df.index[outlier_positions_mah], inplace=True)
        return outlier_positions_mah

    if action == "individual":
        columns = df.columns
        _, bp = pd.DataFrame.boxplot(df, return_type='both')
        fliers = [flier.get_ydata() for flier in bp["fliers"]]
        # Computed once: df is no longer modified while scanning the columns.
        stats = df.describe()

        all_outliers_positions_box = []
        for i, prop_outliers in enumerate(fliers):
            if prop_outliers.size == 0:
                continue
            col = columns[i]
            q1 = stats[col]["25%"]
            q3 = stats[col]["75%"]
            IQR = q3 - q1
            low, high = q1 - (1.5 * IQR), q3 + (1.5 * IQR)
            outlier_positions_box = [
                pos for pos, value in enumerate(df[col].values)
                if value < low or value > high
            ]
            all_outliers_positions_box += outlier_positions_box
            if debug:
                print("outliers for variable ['" + str(col) + "'] = " +
                      str(outlier_positions_box))

        # Drop every flagged row once, after all columns were scanned. The
        # former code dropped per-column positions, so later positions no
        # longer matched the rows they were computed for.
        if remove and all_outliers_positions_box:
            df.drop(df.index[sorted(set(all_outliers_positions_box))],
                    inplace=True)
        return all_outliers_positions_box

    if action == "parallel":
        outlier_positions_mah = treate_outliers(df, action="colective",
                                                remove=False)
        outlier_positions_box = treate_outliers(df, action="individual",
                                                remove=False)
        outliers_position = list(
            np.sort(outlier_positions_mah + outlier_positions_box))
        if remove:
            df.drop(df.index[outliers_position], inplace=True)
        return outliers_position

    raise ValueError("unknown action: {!r}".format(action))
def calcu2(mppt):
    """Return column 108 of the rows flagged as outliers.

    The envelope (contamination 0.01) is fitted on columns 0-105 of ``mppt``.
    """
    features = mppt.iloc[:, 0:106]
    detector = EllipticEnvelope(contamination=0.01)
    detector.fit(features)
    flagged = detector.predict(features) == -1
    return mppt[flagged].iloc[:, 108]
def detectAnomalies(X, model_params):
    """
    Detects the anomalies with two nested EllipticEnvelope passes.

    The first envelope labels every window. A second envelope is then fitted
    only on the windows the first pass labelled 1, and its outputs overwrite
    the first-pass results for exactly those windows.

    Arguments:
        X {2d numpy.array} -- features of the windowed sequences
        model_params {dictionary} -- SSG-LUGIA model configuration

    Returns:
        yp {numpy.array} -- binary prediction of the anomalies
        ys {numpy.array} -- decision-function value of each window
    """

    def _run_level(samples, contamination_key, support_key):
        # One detection pass: fit, then return (labels, decision values).
        detector = EllipticEnvelope(
            contamination=model_params[contamination_key],
            support_fraction=model_params[support_key],
            random_state=3)
        detector.fit(samples)
        return detector.predict(samples), detector.decision_function(samples)

    yp, ys = _run_level(X, 'contamination_model1', 'support_fraction_model1')

    # Positions predicted native by level 1. Captured before yp is
    # overwritten so both updates address the same windows.
    native = np.where(yp == 1)
    yp2, ys2 = _run_level(X[native], 'contamination_model2',
                          'support_fraction_model2')

    ys[native] = ys2  # level-2 decision values
    yp[native] = yp2  # level-2 binary predictions
    return (yp, ys)
def test_elliptic_envelope():
    """Check the EllipticEnvelope API on 100 random 10-D samples."""
    X = np.random.RandomState(0).randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)

    # Both prediction entry points must refuse to run before fit().
    for unfitted_call in (clf.predict, clf.decision_function):
        with pytest.raises(NotFittedError):
            unfitted_call(X)

    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    distances = clf.mahalanobis(X)
    assert_array_almost_equal(scores, -distances)
    assert_array_almost_equal(distances, clf.dist_)

    n_flagged = y_pred[y_pred == -1].size
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - n_flagged) / 100.)
    assert sum(y_pred == -1) == sum(decisions < 0)
def elliptic_envelope_out(self, contamination):
    """Return the index labels of the training rows flagged as outliers.

    The envelope is fitted on the numerical columns of ``self.training``;
    the step name is appended to ``self.report``.
    """
    self.report.append('elliptic_envelope_out')
    ds = self.training[self.numerical_var]

    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(ds)
    labels = pd.Series(detector.predict(ds), index=ds.index)
    return labels[labels == -1].index
def plot(X, y):
    """Scatter a t-SNE projection of the inlier samples.

    Args:
        X: Feature matrix.
        y: Per-sample values passed to ``scatter`` with the projection;
            must support array indexing.
    """
    proj = TSNE().fit_transform(X)

    # Outlier detection on the original features.
    e = EllipticEnvelope(assume_centered=True, contamination=.25)
    e.fit(X)
    good = np.where(e.predict(X) == 1)

    # Filter the projection together with the labels. Previously only y was
    # filtered, so the two arguments no longer had the same length.
    scatter(proj[good], y[good])
def transform(features, labels):
    """Reduce ``features`` with PCA (30 components) and min-max scale them.

    An ``EllipticEnvelope`` is also fitted on the reduced features and its
    predictions are printed; they do not influence the returned values.

    Returns:
        Tuple ``(features, labels)``; ``labels`` is passed through unchanged.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements only parsed on Python 2.
    print("transforming features via pca")
    pca = PCA(n_components=30)
    features = pca.fit_transform(features)

    envelope = EllipticEnvelope()
    envelope.fit(features)
    print(envelope.predict(features))

    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)
    return features, labels
def test_outlier_detection():
    """Check raw Mahalanobis decisions and the score on random data."""
    n_samples = 100
    X = np.random.RandomState(0).randn(n_samples, 10)

    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    raw_decision = clf.decision_function(X, raw_mahalanobis=True)
    assert_array_almost_equal(raw_decision, clf.mahalanobis(X - clf.location_))

    n_flagged = y_pred[y_pred == -1].size
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - n_flagged) / 100.0)
def transform(features, labels):
    """Reduce ``features`` with PCA (30 components) and min-max scale them.

    An ``EllipticEnvelope`` is also fitted on the reduced features and its
    predictions are printed; they do not influence the returned values.

    Returns:
        Tuple ``(features, labels)``; ``labels`` is passed through unchanged.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements only parsed on Python 2.
    print("transforming features via pca")
    pca = PCA(n_components=30)
    features = pca.fit_transform(features)

    envelope = EllipticEnvelope()
    envelope.fit(features)
    print(envelope.predict(features))

    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)
    return features, labels
def test_outlier_detection():
    """Check raw and transformed decisions of a fitted EllipticEnvelope."""
    X = np.random.RandomState(0).randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)

    # Calling the estimator before fit() must fail.
    for unfitted_call in (clf.predict, clf.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    clf.fit(X)
    y_pred = clf.predict(X)
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    distances = clf.mahalanobis(X)
    assert_array_almost_equal(decision, distances)
    assert_array_almost_equal(distances, clf.dist_)

    n_flagged = y_pred[y_pred == -1].size
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - n_flagged) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
def ellipticenvelope(data, fraction=0.02):
    """Return the 1-based positions of the samples flagged as anomalies.

    The previous implementation could not run: it stacked arrays of
    incompatible shapes, tested ``y == 1`` on a whole array and wrote the
    result of the deletion to the wrong variable. This version keeps the
    evident intent, which is to drop every position labelled 1 (inlier).

    Args:
        data: Samples to fit and label.
        fraction: ``contamination`` passed to ``EllipticEnvelope``.

    Returns:
        Integer array of shape ``(k, 1)`` with the 1-based positions of the
        samples whose label is not 1.
    """
    elenv = EllipticEnvelope(contamination=fraction)
    elenv.fit(data)
    score = elenv.predict(data)

    # Column vector 1..n, matching the shape the original code built.
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)
    return numeration[score != 1]
def test_elliptic_envelope():
    """Check the EllipticEnvelope API on 100 random 10-D samples."""
    X = np.random.RandomState(0).randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)

    # Calling the estimator before fit() must fail.
    for unfitted_call in (clf.predict, clf.decision_function):
        assert_raises(NotFittedError, unfitted_call, X)

    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    distances = clf.mahalanobis(X)
    assert_array_almost_equal(scores, -distances)
    assert_array_almost_equal(distances, clf.dist_)

    n_flagged = y_pred[y_pred == -1].size
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - n_flagged) / 100.)
    assert sum(y_pred == -1) == sum(decisions < 0)
def anomaly_detection(features, labels):
    """Try to identify POIs with an anomaly detector (multivariate Gaussian).

    Non-POI samples (label 0) are split into train / test / cross-validation
    parts; POI samples (label 1) are split between test and cross-validation.
    The envelope is fitted on the training part and its predictions and score
    on the cross-validation part are printed. Non-POIs are labelled -1 and
    POIs 1 in the evaluation sets.

    Args:
        features: Feature array, one row per sample.
        labels: Array of 0 / 1 labels aligned with ``features``.
    """
    # Every print uses one pre-formatted string so the output is identical on
    # Python 2 and Python 3 (the former print statements only parsed on 2).
    non_pois = features[labels == 0]
    pois = features[labels == 1]
    print("non poi size %s %s %s" % (non_pois.shape, pois.shape, features.shape))

    # Split the data into train, test and cross-validation sets.
    split1 = produce_spliting_array(non_pois.shape[0], .75)
    X_train = non_pois[split1 == 1]
    X_intermediate = non_pois[split1 == 0]
    print("size intermediate %s" % (X_intermediate.shape,))

    split2 = produce_spliting_array(X_intermediate.shape[0], .5)
    X_test = X_intermediate[split2 == 1]
    X_cv = X_intermediate[split2 == 0]
    # ``np.int`` no longer exists in current NumPy; the builtin int is the
    # same dtype.
    label_test = np.full(X_test.shape[0], -1, dtype=int)
    label_cv = np.full(X_cv.shape[0], -1, dtype=int)

    split3 = produce_spliting_array(pois.shape[0], .5)
    pois_test = pois[split3 == 1]
    pois_cv = pois[split3 == 0]
    # Label counts come from the selected rows themselves, so they always
    # match the stacked features.
    X_test = np.vstack((X_test, pois_test))
    label_test = np.hstack((label_test, np.ones(pois_test.shape[0], dtype=int)))
    X_cv = np.vstack((X_cv, pois_cv))
    label_cv = np.hstack((label_cv, np.ones(pois_cv.shape[0], dtype=int)))

    print("size X_train %s" % (X_train.shape,))
    print("size test data %s %s" % (X_test.shape, label_test.shape))
    print("size cv data %s %s" % (X_cv.shape, label_cv.shape))
    print("size splits %s %s %s" % (len(split1), len(split2), len(split3)))

    from sklearn.covariance import EllipticEnvelope
    # NOTE(review): scikit-learn documents contamination as lying in
    # (0, 0.5]; .85 is kept to preserve behaviour - confirm the intended value.
    detector = EllipticEnvelope(contamination=.85)
    detector.fit(X_train)
    pred_cv = detector.predict(X_cv)
    print(pred_cv)
    print(label_cv)
    print(detector.score(X_cv, label_cv))
def CovEstOD(data, classifier=None, N=1, **kw):
    """Covariance-estimate outlier detection.

    Args:
        data: 2-D array of samples.
        classifier: Object with ``fit`` / ``predict``. When omitted, an
            ``EllipticEnvelope`` with ``support_fraction=1`` and a
            contamination of ``N / len(data)`` is used.
        N: Expected number of outliers; only used for the default classifier.
        **kw: ``with_decision_boundary=True`` additionally returns the
            decision-boundary ellipse (two-dimensional data only).

    Returns:
        The indices of the samples predicted as ``-1``; with the decision
        boundary requested, a tuple ``(indices, Ellipse)``.
    """
    if classifier is None:
        from sklearn.covariance import EllipticEnvelope
        # float() guards against integer division on Python 2.
        contamination = float(N) / data.shape[0]
        classifier = EllipticEnvelope(support_fraction=1.,
                                      contamination=contamination)

    classifier.fit(data)
    clipix, = np.where(classifier.predict(data) == -1)

    wdb = kw.pop('with_decision_boundary', False)
    if not wdb:
        return clipix

    # TODO: A better way of finding the decision boundary
    # The previous code referenced an undefined name ``clf`` here.
    # T holds the eigenvectors of the precision matrix: the transformation
    # between the principal axes and the data coordinates (x - <x> = T z).
    # In that basis the precision matrix is diagonal with entries w, so the
    # boundary is z^T diag(w) z = threshold. The former ``dot(Ti, P) * T``
    # multiplied element-wise and did not yield that diagonal.
    w, T = np.linalg.eigh(classifier.precision_)

    # NOTE(review): older scikit-learn exposed ``threshold`` / ``threshold_``;
    # current releases document ``offset_`` with
    # decision_function = score_samples - offset_, so the boundary is at
    # -offset_. Confirm against the installed version.
    threshold = getattr(classifier, 'threshold',
                        getattr(classifier, 'threshold_', None))
    if threshold is None:
        threshold = -classifier.offset_

    a, b = np.sqrt(threshold / w)  # semi-major & semi-minor axes
    theta = np.degrees(np.arccos(T[0, 0]))  # T is an (im)proper rotation matrix
    # det(T) = -1 means an improper rotation (one of the axes is inverted).
    theta = np.linalg.det(T) * theta
    decision_boundary = Ellipse(classifier.location_, 2 * a, 2 * b,
                                angle=theta, color='m')
    return clipix, decision_boundary
# print(Y)

# Find outliers in the interaction rate data
# Step 1 - Convert the dataset into pandas series
util = Utility.SeriesUtility()
datasetFileName = "fans_change_taylor_swift.csv"
series = util.convertDatasetsToSeries(datasetFileName)
series = util.resampleSeriesSum(series, "D")
numberOfPoints = series.data.shape[0]
X = series.values.flatten().reshape(numberOfPoints, 1)

det.fit(X)
predicted = det.predict(X)

# Report from the predictions already computed. The previous loop called
# det.predict(X[i]) on a 1-D sample, where the estimator expects a 2-D array,
# and repeated work that `predicted` already holds.
for outputClass in predicted:
    if outputClass == -1:
        print("Outlier detected...")
try: return float(val) except ValueError: return np.nan cytos = ['VEGF','IL-1beta','G-CSF','EGF','IL-10','HGF','FGF-basic', 'IFN-alpha','IL-6','IL-12','Rantes','Eotaxin','IL-13','IL-15', 'IL-17','MIP-1alpha','GM-CSF','MIP-1beta','MCP-1','IL-5', 'IFN-gamma','TNF-alpha','IL-RA','IL-2','IL-7', 'IP-10','IL-2R','MIG','IL-4','IL-8'] for col in cytos: data[col] = data[col].map(safe_float) try: env = EllipticEnvelope().fit(data[col].dropna().values.reshape(-1,1)) mask = env.predict(data[col].values.reshape(-1,1)) data[col][mask == -1] = np.nan except: pass #print mask #break # <codecell> pos = dict(zip('ABCDEFGH', range(8))) def xpos(val): _, p = val.split('(') return pos[p.split(',')[1][0]]
# Derive, row by row, the working angle (a newAngle of 0 is stored as 180) and
# the Euclidean distance of (newX, newY) from the origin.
# NOTE(review): the block nesting below was reconstructed from a collapsed
# source line - confirm it against the original file.
for i in range(0,len(SectionData)):
    if SectionData['newAngle'][i]==0:
        SectionData['angle'][i]=180
    else:
        SectionData['angle'][i]=SectionData['newAngle'][i]
    x=SectionData['newX'][i]
    y=SectionData['newY'][i]
    SectionData['Distance'][i]=math.sqrt((x*x)+(y*y))
#fit the outlier detector to the data and predict
X=SectionData[['angle','newX','newY']]
outlier_detector = EllipticEnvelope(contamination=0.14).fit(X.values)
outliers = outlier_detector.predict(X.values)
#finds outliers
# Store every label in the OUTLIER column and report the rows labelled -1.
for i in range(0,len(outliers)):
    SectionData['OUTLIER'][i]=outliers[i]
    if outliers[i]==-1:
        print 'outlier at: ',SectionData['center'][i]
fig = plt.figure(figsize=(20,20))
#plotting the section map
#outliers indicated on map with larger circles
for i in range(0,len(SectionData)):
    if SectionData['OUTLIER'][i]==-1:
        plt.scatter(SectionData['X'][i],SectionData['Y'][i],s=40)
        # Label the point with its gradient angle rounded to an integer,
        # placed 5 units above the marker.
        plt.annotate(str(int(round(SectionData['gradient_angle'][i],0))),(SectionData['X'][i],SectionData['Y'][i]+5))
def search_outliers_EllipticEnvelope(X):
    """Label the rows of ``X`` with an envelope fitted on ``X`` itself.

    Uses a fixed contamination of 0.2.

    Returns:
        The array returned by ``EllipticEnvelope.predict(X)``.
    """
    detector = EllipticEnvelope(contamination=0.2)
    detector.fit(X)
    return detector.predict(X)
'Age', 'HAART-Naive', 'HAART-Non-Adherent', 'HAART-Off', 'HAART-On', 'Hepatitis C status (HCV)'] for col in tranfer_cols: _, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis = 0) cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)'] # <codecell> for col in cytos: env = EllipticEnvelope(contamination=0.05) env.fit(cyto_data[col].dropna().values.reshape(-1, 1)) mask = env.predict(cyto_data[col].values.reshape(-1,1)) cyto_data[col][mask==-1] = np.nan # <codecell> fig, axs = plt.subplots(11,3, figsize = (10,20)) for ax, col in zip(axs.flatten(), cytos): boxes = [] mus = [] stds = [] for trop in trops: mask = cyto_data['Tropism'] == trop #mask &= cyto_data['Keep']