def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
    clf2 = EllipticEnvelope().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
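# A minimal, self-contained sketch of the relationships exercised by the test above,
# assuming a scikit-learn version that exposes score_samples, decision_function and
# offset_ on EllipticEnvelope (as that test does):
import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.randn(200, 2)
clf = EllipticEnvelope(contamination=0.1).fit(X)

# score_samples is the raw outlier score (the negative squared Mahalanobis distance);
# decision_function shifts it by offset_ so that 0 becomes the inlier/outlier boundary.
assert np.allclose(clf.score_samples(X), clf.decision_function(X) + clf.offset_)
assert np.allclose(clf.score_samples(X), -clf.mahalanobis(X))

# Points with a negative decision_function value are the ones predict() flags as -1.
print((clf.predict(X) == -1).sum(), "points flagged as outliers out of", len(X))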
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
def filter_remove_outlayers(self, flat, minimum_value=0):
    """
    Remove outliers using the elliptic envelope from scikit-learn.

    :param flat: 2D array; NaN entries are ignored, detected outliers are set to NaN
    :param minimum_value: decision-function threshold below which a point is treated as an outlier
    :return: the input array with outliers replaced by NaN
    """
    from sklearn.covariance import EllipticEnvelope
    flat0 = flat.copy()
    flat0[np.isnan(flat)] = 0
    x, y = np.nonzero(flat0)
    # print np.prod(flat.shape)
    # print len(y)
    z = flat[(x, y)]
    data = np.asarray([x, y, z]).T

    clf = EllipticEnvelope(contamination=.1)
    clf.fit(data)
    y_pred = clf.decision_function(data)
    out_inds = y_pred < minimum_value

    flat[(x[out_inds], y[out_inds])] = np.NaN
    return flat
def outlier_removal2(features, samples, cv_predict):
    outliers_fraction = 0.1
    print cv_predict.shape
    print samples.shape
    test = np.column_stack((cv_predict, samples))
    #clf = EllipticEnvelope(contamination=.1)
    clf = EllipticEnvelope(contamination=.1)
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                      kernel="rbf", gamma=0.1)
    clf.fit(test)
    y_pred = clf.decision_function(test).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred_new = y_pred > threshold
    print y_pred_new
    #print samples[y_pred_new]
    print samples.shape
    print samples[y_pred_new].shape
    print features.shape
    print features[y_pred_new].shape
    return features[y_pred_new], samples[y_pred_new]
def clean_series(self, token, discard=5):
    """
    Remove outliers from the ratio series for a token.

    Args:
        token (str): The token whose ratio series is cleaned.
        discard (int): Drop the most outlying X% of the data.

    Returns: OrderedDict{year: wpm}
    """
    series = self.ratios[token]

    X = np.array(list(series.values()))[:, np.newaxis]

    env = EllipticEnvelope()
    env.fit(X)

    # Score each data point.
    y_pred = env.decision_function(X).ravel()

    # Get the discard threshold.
    threshold = stats.scoreatpercentile(y_pred, discard)

    return OrderedDict([
        (year, ratio)
        for (year, ratio), pred in zip(series.items(), y_pred)
        if pred > threshold
    ])
def find_outlier_test_homes(df, all_homes, appliance, outlier_features, outliers_fraction=0.1):
    from scipy import stats
    from sklearn import svm
    from sklearn.covariance import EllipticEnvelope
    clf = EllipticEnvelope(contamination=.1)
    try:
        X = df.ix[all_homes[appliance]][outlier_features].values
        clf.fit(X)
    except:
        try:
            X = df.ix[all_homes[appliance]][outlier_features[:-1]].values
            clf.fit(X)
        except:
            try:
                X = df.ix[all_homes[appliance]][outlier_features[:-2]].values
                clf.fit(X)
            except:
                print "outlier cannot be found"
                return df.ix[all_homes[appliance]].index.tolist()
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return df.ix[all_homes[appliance]][~y_pred].index.tolist()
def filterOut(x):
    x = np.array(x)
    outliers_fraction = 0.05
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1)
    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(x)
    y_pred = clf.decision_function(x).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return y_pred
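# Many snippets in this collection follow the same idiom: score points with
# decision_function, take the (100 * outliers_fraction) percentile of the scores as
# a threshold, and keep everything above it. With contamination set, recent
# scikit-learn versions apply an equivalent cut internally, so predict() yields
# essentially the same inlier mask. A small comparison sketch on synthetic data
# (names and data are illustrative only):
import numpy as np
from scipy import stats
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(42)
X = rng.randn(300, 2)
outliers_fraction = 0.05

clf = EllipticEnvelope(contamination=outliers_fraction).fit(X)

# manual idiom: threshold the scores at the contamination percentile
scores = clf.decision_function(X).ravel()
threshold = stats.scoreatpercentile(scores, 100 * outliers_fraction)
manual_inliers = scores > threshold

# built-in equivalent: predict() labels inliers as +1 and outliers as -1
builtin_inliers = clf.predict(X) == 1

# on the training data the two masks should agree (or very nearly so)
print((manual_inliers == builtin_inliers).mean())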
def test_outlier_detection():
    """ """
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(clf.decision_function(X, raw_mahalanobis=True),
                              clf.mahalanobis(X - clf.location_))
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.0)
def module4(self):
    '''
    Detect outliers in the input one-dimensional array using anomaly detection.
    '''
    # get data
    img = cv2.imread('../saliency_detection/image/pearl.png')
    b, g, r = cv2.split(img)
    B, G, R = map(lambda x, y, z: x*1. - (y*1. + z*1.)/2., [b, g, r], [r, r, g], [g, b, b])
    Y = (r*1. + g*1.)/2. - np.abs(r*1. - g*1.)/2. - b*1.
    # clip negative values to zero
    R[R < 0] = 0
    G[G < 0] = 0
    B[B < 0] = 0
    Y[Y < 0] = 0
    rg = cv2.absdiff(R, G)
    by = cv2.absdiff(B, Y)
    img1 = rg
    img2 = by
    rg, by = map(lambda x: x.reshape((len(b[0])*len(b[:, 0]), 1)), [rg, by])
    data = np.hstack((rg, by))
    data = data.astype(np.float64)
    data = np.delete(data, range(0, len(data[:, 0]), 2), 0)

    # grid
    xx1, yy1 = np.meshgrid(np.linspace(-10, 300, 500), np.linspace(-10, 300, 500))

    # fit the model and learn the decision boundary
    # (a larger contamination gives a smaller ellipse)
    clf = EllipticEnvelope(support_fraction=1, contamination=0.01)
    print 'data.shape =>', data.shape
    print 'learning...'
    clf.fit(data)  # fit (zeros in the data may cause problems)
    print 'complete learning!'

    # classify the data with the fitted model and draw the ellipse
    z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
    z1 = z1.reshape(xx1.shape)
    plt.contour(xx1, yy1, z1, levels=[0], linewidths=2, colors='r')

    # plot
    plt.scatter(data[:, 0], data[:, 1], color='black')
    plt.title("Outlier detection")
    plt.xlim((xx1.min(), xx1.max()))
    plt.ylim((yy1.min(), yy1.max()))
    plt.pause(.001)
    # plt.show()
    cv2.imshow('rg', img1/np.amax(img1))
    cv2.imshow('by', img2/np.amax(img2))
def test_outlier_detection():
    """ """
    np.random.RandomState(0)
    X = np.random.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(clf.decision_function(X, raw_mahalanobis=True),
                              clf.mahalanobis(X - clf.location_))
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert (sum(y_pred == -1) == sum(decisions < 0))
def labelValidSkeletons(skel_file, valid_index, trajectories_data, fit_contamination=0.05):
    # calculate valid widths if they were not used
    calculate_widths(skel_file)

    # calculate classifier for the outliers
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=fit_contamination)
    clf.fit(X4fit)

    # calculate outliers using the fitted classifier
    X = nodes2Array(skel_file)  # use all the indexes
    y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

    # labeled rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = ((y_pred > 0).astype(np.int)) * wlab['GOOD_SKE']  # + wlab['BAD']*np.isnan(y_prev)

    saveLabelData(skel_file, trajectories_data)
def __init__(self, M, samples, filtermode, threshold, projdir, seed):
    # outlier detection
    clf = LocalOutlierFactor(n_neighbors=20, contamination=threshold)
    y_pred = clf.fit_predict(M)

    cee = EllipticEnvelope(contamination=threshold, random_state=seed)
    cee.fit(M)
    scores_pred = cee.decision_function(M)
    y_pred2 = cee.predict(M)

    cif = IsolationForest(contamination=threshold, random_state=seed)
    cif.fit(M)
    scores_pred = cif.decision_function(M)
    y_pred3 = cif.predict(M)

    outlier_methods = ["lof", "ee", "if"]
    ol_df = DataFrame(np.column_stack((y_pred, y_pred2, y_pred3)),
                      index=samples[0].tolist(), columns=outlier_methods)

    keep_samples, drop_samples, drop_indices = ([] for i in range(3))
    omnibus_methods = ["any", "any2", "all"]
    if filtermode in omnibus_methods:
        dft = ol_df.sum(axis=1)
        dft = DataFrame(dft)
        if filtermode == "any":
            drop_samples = dft[dft[0] != 3].index.values.tolist()
            keep_samples = dft[dft[0] == 3].index.values.tolist()
        elif filtermode == "any2":
            drop_samples = dft[dft[0] <= -1].index.values.tolist()
            keep_samples = dft[dft[0] > -1].index.values.tolist()
        elif filtermode == "all":
            drop_samples = dft[dft[0] == -3].index.values.tolist()
            keep_samples = dft[dft[0] != -3].index.values.tolist()
    elif filtermode in outlier_methods:
        drop_samples = ol_df[ol_df[filtermode] == -1].index.values.tolist()
        keep_samples = ol_df[ol_df[filtermode] == 1].index.values.tolist()

    drop_bool = np.isin(samples[0], drop_samples)
    drop_indices = np.where(drop_bool)[0].tolist()

    self.keep = keep_samples
    self.drop = drop_samples
    self.drop_indices = drop_indices
def outlierDetector(clf_name, rng, X_train):
    outliers_fraction = 0.04
    if clf_name == 'RobustCovariance':
        clf_ell = EllipticEnvelope(contamination=outliers_fraction)
        clf_ell.fit(X_train)
        anomaly_score = clf_ell.decision_function(X_train)
        outliers = clf_ell.predict(X_train)
    if clf_name == 'IsolationForest':
        clf_iforest = IsolationForest(n_estimators=100, random_state=rng,
                                      contamination=outliers_fraction)
        clf_iforest.fit(X_train)
        anomaly_score = clf_iforest.decision_function(X_train)
        outliers = clf_iforest.predict(X_train)
    return outliers
def labelValidSkeletons(skel_file):
    calculate_widths(skel_file)

    # get valid rows using the trajectory displacement and the skeletonization success
    valid_index, trajectories_data = getValidIndexes(skel_file)

    # calculate classifier for the outliers
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(X4fit)

    # calculate outliers using the fitted classifier
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

    # labeled rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = ((y_pred > 0).astype(np.int)) * wlab['GOOD_SKE']  # + wlab['BAD']*np.isnan(y_prev)

    saveLabelData(skel_file, trajectories_data)
def res_ee(features, contamination=0.1, score=False):
    '''
    Use LOESS curve residuals and an elliptic envelope to identify outliers.

    Parameters
    ----------
    features: dataframe
        dataframe of features
    contamination: decimal
        proportion of outliers expected in data
    score: boolean
        return binary prediction and outlier scores

    Returns
    -------
    list
        Binary series with same length as input TS. A value of -1 indicates
        the corresponding value in TS is an outlier.
    list
        Outlier score; series with same length as input TS.
        Only returned if score = True.
    '''
    #res = np.asarray(res).reshape(-1,1)

    # instantiate and fit the elliptic envelope on the features
    model = EllipticEnvelope(assume_centered=True, store_precision=False,
                             contamination=contamination, random_state=888)
    model.fit(features)
    y_pred = model.predict(features)

    if score == False:
        return (y_pred.tolist())

    # outlier scores
    scores = model.decision_function(features)
    return (y_pred.tolist(), scores.tolist())
def labelValidSkeletons_old(skeletons_file, good_skel_row, fit_contamination=0.05):
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')

    print_flush(base_name + ' Filter Skeletons: Starting...')

    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']

    if good_skel_row.size > 0:
        # nothing to do if there are no valid skeletons left.
        print_flush(base_name + ' Filter Skeletons: Reading features for outlier identification.')

        # calculate classifier for the outliers
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
                    ['/' + name_width_fun(part) for part in worm_partitions]
        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))
        #%%
        print_flush(base_name + ' Filter Skeletons: Fitting elliptic envelope. Total time:' +
                    progress_timer.getTimeStr())
        # TODO: there is a problem with singular covariance matrices that I need to figure out how to solve
        clf = EllipticEnvelope(contamination=fit_contamination)
        clf.fit(X4fit)

        print_flush(base_name + ' Filter Skeletons: Calculating outliers. Total time:' +
                    progress_timer.getTimeStr())
        # calculate outliers using the fitted classifier
        X = nodes2Array(skeletons_file, nodes4fit)  # use all the indexes
        y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

        print_flush(base_name + ' Filter Skeletons: Labeling valid skeletons. Total time:' +
                    progress_timer.getTimeStr())
        # labeled rows of valid individual skeletons as GOOD_SKE
        trajectories_data['is_good_skel'] = (y_pred > 0).astype(np.int)

    # Save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)
    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' + progress_timer.getTimeStr())
def outlier_detection(datframe, vis=0):
    """
    identify and remove outliers with EllipticEnvelope
    visualize with PCA if desired
    """
    dat = datframe[datframe.columns[:14]]
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(dat)
    y_pred = clf.decision_function(dat).ravel()
    outliers_fraction = 0.25
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    datframe['detect'] = y_pred
    datframe = datframe[datframe.detect > threshold]
    if vis == 1:
        pca_visualize(datframe[datframe.columns[:14]])
    return datframe
def detect_outliers(X, station):
    if station == 'hoerning':
        outlierfraction = 0.0015
        classifier = svm.OneClassSVM(nu=0.95 * outlierfraction + 0.05,
                                     kernel='rbf', gamma=0.1)
        Xscaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
        X_scaled = Xscaler.transform(X)
        classifier.fit(X_scaled)
        svcpred = classifier.decision_function(X_scaled).ravel()
        threshold = stats.scoreatpercentile(svcpred, 100 * outlierfraction)
        inlierpred = svcpred > threshold
    else:
        outlierfraction = 0.0015
        classifier = EllipticEnvelope(contamination=outlierfraction)
        classifier.fit(X)
        gausspred = classifier.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(gausspred, 100 * outlierfraction)
        inlierpred = gausspred > threshold
    return inlierpred
def outlier_removal(features, samples):
    outliers_fraction = 0.1
    #clf = EllipticEnvelope(contamination=.1)
    clf = EllipticEnvelope(contamination=.1)
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                      kernel="rbf", gamma=0.1)
    clf.fit(features, samples)
    y_pred = clf.decision_function(features).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred_new = y_pred > threshold
    print y_pred_new
    #print samples[y_pred_new]
    #print samples.shape
    print samples[y_pred_new].shape
    print features.shape
    print features[y_pred_new].shape
    return features[y_pred_new], samples[y_pred_new]
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    # Returns outliers, inliers
    X = ser[ser > min_units].reshape(-1, 1)
    #is_normal_data = is_normal(ser)
    # FOR NOW only using Robust estimator of Covariance
    is_normal_data = True
    if is_normal_data:
        # Use robust estimator of covariance
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)
    else:
        # Data is not normally distributed, use OneClassSVM based outlier detection
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                              kernel="rbf", gamma=0.1)
    from scipy import stats
    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return ser[ser > min_units][~y_pred], ser[ser > min_units][y_pred]
## Local Outlier Factor
lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
lof.fit(data_scaled_means)
answerLOF_proba = lof.decision_function(data_scaled_means)
answerLOF_proba = 1 - ((answerLOF_proba - answerLOF_proba.min()) /
                       (answerLOF_proba.max() - answerLOF_proba.min()))
answerLOF_proba = pd.DataFrame({'target': answerLOF_proba})
pickle.dump(lof, open("../../data/model/LocalOutlierFactor", "wb"))

## Elliptic Envelope
ee = EllipticEnvelope()
ee.fit(data_scaled_means)
answerEE_proba = ee.decision_function(data_scaled_means)
answerEE_proba = 1 - (answerEE_proba - 3 * answerEE_proba.min()) * 10**12
answerEE_proba = pd.DataFrame({'target': answerEE_proba})
pickle.dump(ee, open("../../data/model/EllipticEnvelope", "wb"))

##############
### Soft voting
voting_answer = pd.DataFrame({
    'target': ((answerIF_proba * 2 + answerLOF_proba * 1 + answerEE_proba * 2) / 5)
    .T.apply(lambda x: -1 if x.values[0] > 0.4 else 1)
})
##############
xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)

# Fit the problem with varying cluster separation
np.random.seed(42)
# Data generation

# Fit the model with the One-Class SVM
#plt.figure(figsize=(10, 5))
clf = EllipticEnvelope(contamination=.1)
# fit the data and tag outliers
clf.fit(XY)
y_pred = clf.decision_function(XY).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
y_pred = y_pred > threshold

# plot the levels lines and the points
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
subplot = ax[i]
subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                 cmap=plt.cm.Blues_r)
a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
b = subplot.scatter(XY[:-n_outliers, 0], XY[:-n_outliers, 1], c='white')
c = subplot.scatter(XY[-n_outliers:, 0], XY[-n_outliers:, 1], c='white')
def run_model(train_data: np.ndarray, predict_data: np.ndarray):
    clf = EllipticEnvelope()
    clf.fit(train_data.reshape(-1, 1))
    outlier = clf.decision_function(predict_data.reshape(-1, 1))
    return outlier
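# A possible usage sketch for run_model above, with made-up data: the envelope is
# fitted on one 1-D series and a second series is scored; lower (and especially
# negative) decision-function values indicate likely outliers.
import numpy as np

train = np.random.RandomState(0).normal(loc=10.0, scale=1.0, size=500)
new = np.array([9.5, 10.2, 25.0])

scores = run_model(train, new)
print(scores)        # the last score should be far lower than the first two
print(scores < 0)    # negative values are the ones EllipticEnvelope would flag as outliers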
x = np.concatenate((no_mod_train, mod_train))

## extract good features
fitter = umap.UMAP().fit(x.reshape((len(x)), 60))
test_data = fitter.transform(np.concatenate((no_mod[number:], mod[number:])))

model_EllipticEnvelope = EllipticEnvelope(contamination=0.05, support_fraction=1)
model_EllipticEnvelope.fit(fitter.embedding_[:number])

# selected extra-outliers
decision = model_EllipticEnvelope.decision_function(test_data)
index = []
for i in range(len(decision)):
    if decision[i] < 0.00 and decision[i] > threshold:
        index.append(i)

# get rid of non-confident ones
mod_training_filtered = np.delete(test_data, index, axis=0)
#f.write('Number of signals left '+str(len(mod_training_filtered))+'\n')
median_number.append(len(mod_training_filtered))

labels = np.concatenate((np.ones(100), np.repeat(-1, 100)))
labels = np.delete(labels, index, axis=0)
prediction = model_EllipticEnvelope.predict(mod_training_filtered)
median_false_posit.append(len(prediction[:100][prediction[:100] == -1]))
#f.write('testing accuracy with threshold '+str(accuracy(labels, prediction))+'\n')
def anomaly_detection_ex8_ng(): """Run anomaly detection. Example from Andrew Ng's coursera course """ # ===================== # load data dataset = loadmat('data/ex8data1.mat') # dataset = loadmat('data/ex8data2.mat') print(dataset.keys()) X = dataset['X'] print('X:', X.shape, X[0, :]) # 307x2 Xval = dataset['Xval'] print('X_val:', Xval.shape, Xval[0, :]) # 307x2 yval = dataset['yval'] print('y_val:', yval.shape, yval[0, :]) # 307x1 # ===================== # display fig = plt.figure(facecolor='white') fig1 = fig.add_subplot(2, 2, 1) plt.scatter(X[:, 0], X[:, 1], c='k') plt.title("Outlier detection") plt.xlabel('Latency (ms)') plt.ylabel('Throughput (mb/s)') # ===================== # detecting outliers in a Gaussian distributed dataset. clf = EllipticEnvelope() clf.fit(X) # Calculate the decision function and use threshold to determine outliers y_pred = clf.decision_function(X).ravel() # print('y pred', y_pred) # ===================== # find best threshold for outlier detection if False: samples = np.linspace(0.1, 10.0, num=100) best_f1 = 0.0 best_perc = 0.0 for sample in samples: Xval_pred = clf.decision_function(Xval) perc = sample th = np.percentile(Xval_pred, perc) outl = Xval_pred < th f1score = f1_score(yval, outl) print('f1 score (', sample, '):', f1score) if best_f1 < f1score: best_f1 = f1score best_perc = perc print('best f1:', best_f1, ', best perc:', best_perc) # set threshold for outlier detection percentile = 1.9 # 5.1 # 1.9 #best_perc # 1.9607843 threshold = np.percentile(y_pred, percentile) outliers = y_pred < threshold # print('outliers:', X[outliers]) # ===================== # plot contours fig.add_subplot(2, 2, 2) # create the grid for plotting if False: xx, yy = np.meshgrid(np.linspace(0, 25, 200), np.linspace(0, 30, 200)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='blue', linestyles='dotted') threshold = np.percentile(y_pred, 1.0) plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='blue', linestyles='dotted') threshold = np.percentile(y_pred, 0.5) plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='blue', linestyles='dotted') # plot outliers plt.scatter(X[:, 0], X[:, 1], c='k') plt.scatter(X[outliers, 0], X[outliers, 1], c='r') print('num outliers:', sum(outliers)) # samples_idx = yval == 1 # print(yval[samples_idx]) # print('X_val:', Xval.shape, Xval[0, :]) # 307x2 # print(Xval[samples_idx]) plt.show()
def make_subplot_again(X, c, ax, pcX=0, pcY=1, fontSize=24, fontName='sans serif', ms=20, leg=True, title=None): outliers_fraction = 0.30 clf = EllipticEnvelope(contamination=outliers_fraction) x = X['DK salary'].values y = X['points_per_dollar'].values.reshape(-1, 1) Xn = X X = X.values buff = 0.02 bufferX = buff * (X[:, pcX].max() - X[:, pcX].min()) bufferY = buff * (X[:, pcY].max() - X[:, pcY].min()) mm = [(X[:, pcX].min() - bufferX, X[:, pcX].max() + bufferX), (X[:, pcY].min() - bufferY, X[:, pcY].max() + bufferY)] xx, yy = np.meshgrid(np.linspace(mm[0][0], mm[0][1], 500), np.linspace(mm[1][0], mm[1][1], 500)) # fit the data and tag outliers clf.fit(X) y_pred = clf.decision_function(X).ravel() threshold = scoreatpercentile(y_pred, 100 * outliers_fraction) y_pred = y_pred > threshold print y_pred Xn['pred'] = y_pred # plot the levels lines and the points Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ax.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r) a = ax.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='burlywood') ax.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange') ax.axis('tight') care_about = Xn[Xn['points_per_dollar'] > 3.5] care_about_false = care_about[care_about['pred'] == False] x_c_f = care_about_false['DK salary'] y_c_f = care_about_false['points_per_dollar'] ax.scatter(x_c_f, y_c_f, alpha=0.5, lw=2, edgecolor='k', s=50, marker='d', c='#5DC541', label='Great Value') dont_care_about = Xn[Xn['points_per_dollar'] <= 3.5] dont_care_about_false = dont_care_about[dont_care_about['pred'] == False] x_d_f = dont_care_about_false['DK salary'] y_d_f = dont_care_about_false['points_per_dollar'] ax.scatter(x_d_f, y_d_f, alpha=0.5, lw=2, s=70, marker='+', c='#6F0D73', label='Bad Value') Xn_true = Xn[Xn['pred'] == True] x_true = Xn_true['DK salary'] y_true = Xn_true['points_per_dollar'] ax.scatter(x_true, y_true, alpha=0.5, marker='o', c='#BD4864', label='Normal Value') ax.annotate('Ben\nRoethlisburger\nWeek 8 2014', fontsize=20, xy=(5800, 8.237931), xytext=(7300, 7), arrowprops=dict(facecolor='black', shrink=0.05)) ax.annotate('Tom Brady\nWeek 11 2014', fontsize=20, xy=(9800, 1.640816), xytext=(7000, -0.5), arrowprops=dict(facecolor='black', shrink=0.05)) ## axes for tick in ax.xaxis.get_major_ticks(): tick.label.set_fontsize(fontSize - 2) for tick in ax.yaxis.get_major_ticks(): tick.label.set_fontsize(fontSize - 2) ax.set_xlabel('Salary', fontsize=fontSize, fontname=fontName) ax.set_ylabel('Points per $1000', fontsize=fontSize, fontname=fontName) plt.locator_params(axis='x', nbins=5) ax.set_aspect(1. / ax.get_data_ratio()) ax.set_xlim(3000, 10000) ax.set_ylim(mm[1]) ax.axhline(3.5, c='r', label='Threshold') box = ax.get_position() ax.set_position([box.x0 + box.width * 0.2, box.y0, box.width, box.height]) ax.legend(loc='center right', bbox_to_anchor=(-0.2, 0.4), fontsize=20, scatterpoints=3, frameon=True) if title: ax.set_title(title, fontsize=fontSize + 2, fontname=fontName)
import pickle
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

data = []
with open('newdata.csv', 'rb') as f:
    rdr = csv.reader(f)
    for row in rdr:
        data.append([int(row[1]), int(row[2])])

data = np.array(data)
# print(data)

outliers_fraction = 0.05
# est=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1)
est = EllipticEnvelope(contamination=.1)
# est=KMeans(n_clusters=3)
est.fit(data)
# labels=est.labels_
y_pred = est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
labels = [(2 if y > threshold else 1) for y in y_pred]
# labels=est.labels_
print(labels)
plt.scatter(data[:, 0], data[:, 1], c=labels, lw=0)
plt.show()
# plot the temperature repartition by categories
fig, axs = plt.subplots(2, 2)
df_class0.hist(ax=axs[0, 0], bins=32)
df_class1.hist(ax=axs[0, 1], bins=32)
df_class2.hist(ax=axs[1, 0], bins=32)
df_class3.hist(ax=axs[1, 1], bins=32)

# In[ ]:

# apply EllipticEnvelope (gaussian distribution) to each category
envelope = EllipticEnvelope(contamination=outliers_fraction)
X_train = df_class0.values.reshape(-1, 1)
envelope.fit(X_train)
df_class0 = pd.DataFrame(df_class0)
df_class0['deviation'] = envelope.decision_function(X_train)
df_class0['anomaly'] = envelope.predict(X_train)

envelope = EllipticEnvelope(contamination=outliers_fraction)
X_train = df_class1.values.reshape(-1, 1)
envelope.fit(X_train)
df_class1 = pd.DataFrame(df_class1)
df_class1['deviation'] = envelope.decision_function(X_train)
df_class1['anomaly'] = envelope.predict(X_train)

envelope = EllipticEnvelope(contamination=outliers_fraction)
X_train = df_class2.values.reshape(-1, 1)
envelope.fit(X_train)
df_class2 = pd.DataFrame(df_class2)
df_class2['deviation'] = envelope.decision_function(X_train)
df_class2['anomaly'] = envelope.predict(X_train)
# Fit the model
clf = EllipticEnvelope(support_fraction=1., contamination=contamination)
clf.fit(data)

# Perform outlier detection
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = inlier_predicted_data.shape[0]
num_outliers_predicted = outlier_predicted_data.shape[0]

# Plot decision function values
xr = np.linspace(-2, 2, 500)
yr = np.linspace(-2, 2, 500)
xx, yy = np.meshgrid(xr, yr)
zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
zz = zz.reshape(xx.shape)
scores = clf.decision_function(data)
threshold = stats.scoreatpercentile(scores, 100 * contamination)
plt.contourf(xx, yy, zz, levels=np.linspace(zz.min(), threshold, 7),
             cmap=plt.cm.Blues_r)  # Outlier
plt.contour(xx, yy, zz, levels=np.array([threshold]), linewidths=2,
            colors="red")  # The frontier
plt.contourf(xx,
#plt.figure(15)
#for l in set(L):
    #p=(L==l)
    #if l==-1:
        #color='r'
    #else:
        #color=colors[l]
    #plt.plot(rcp[p,0],rcp[p,1],'o',c=color,markersize=10)
#plt.show()

# -17- #
from sklearn.covariance import EllipticEnvelope

anom_perc = 20
clf = EllipticEnvelope(contamination=.1)
clf.fit(rcp)
clf.decision_function(rcp).ravel()
pred = clf.decision_function(rcp).ravel()
threshold = stats.scoreatpercentile(pred, anom_perc)
Anom = pred > threshold
print(Anom)

Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
waitforEnter()

#plt.figure(17)
#plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),cmap=plt.cm.Blues_r)
#plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
#plt.plot(rcp[:, 0], rcp[:, 1], 'ko')
#plt.show()
#waitforEnter()

# -18- #
def find_outliers(datestart, dateend, plot=False, cut=-0.05): numtopics = 84 di = datetime2str2(datestart) dfin = datetime2str2(dateend) #print di,dfin if dfin < di: temp = dfin dfin = di di = temp #print di,dfin afile = "/home/ubuntu/mysql_insightwiki_auth.txt" a = open(afile) passwd = a.readline().rstrip() a.close() host = 'localhost' user = '******' db = 'wikidata' con = mdb.connect(host, user, passwd, db) #,port=3307) with con: curt = con.cursor() #sql="SELECT COUNT(*) FROM `topics` " sql = "SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;" curt.execute(sql) topics = [[0, 'nothing', 'Filler to match index']] for topic in curt: topics.append(topic) data = {} df = range(numtopics + 1) with con: curt = con.cursor() sql = "SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;" curt.execute(sql) for row in curt: cur = con.cursor() sql = '''SELECT `page_views`.`dateonly` AS `vd`, AVG(`page_views`.`count`) AS `vc`, `topics`.`topic_label`,`topics`.`topic_string` FROM `topics` INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id` WHERE `topic_id`=%s GROUP BY `page_views`.`dateonly` ''' data[row[1]] = read_sql(sql, con, params=[row[0]]) df[row[0]] = data[row[1]] topicdata = df d = topicdata[topics[3][0]] p = d[(d['vd'] > di) & (d['vd'] < dfin)]['vc'].values topicdata = df #initializing array to hold the rows to cluster #the 0th position is fake so that my index matches the sql index clusinp = [] clusinp.append(gen_feat([0, 0, 0, 0, 0])) chinaoff = 6000 #populating my array to go into my Kmean for index, topic in enumerate(topics): #topic=list(topics[index]) if topic[0] != 0: d = topicdata[topic[0]] ppre = d[(d['vd'] > di) & (d['vd'] < dfin)]['vc'].values p = gen_feat(ppre) if topic[0] == 52: p = gen_feat( [x - chinaoff if x - chinaoff >= 0 else 0 for x in ppre]) clusinp.append(p) #cleaning up my array making it numpy to go into my kmean clusinp = np.array(clusinp) clusinp[0] = clusinp[ 5] #making sure my through away first row matches in size #contam=0.325 contamfix = 0.1 colors = ['m', 'g', 'b'] X1 = clusinp xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500), np.linspace(-1.5, 1.5, 500)) ee = EllipticEnvelope(support_fraction=1., contamination=contamfix) #ee=OneClassSVM(nu=contam2, gamma=0.05,kernel='rbf') ee.fit(clusinp) outliers = ee.decision_function(X1, raw_values=False) if plot == True: print "here" get_ipython().magic(u'matplotlib inline') Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()]) Z1 = Z1.reshape(xx1.shape) legend1 = plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[1]) plt.scatter(X1[:, 0], X1[:, 1], color='black') plt.xlim((xx1.min(), xx1.max())) plt.ylim((yy1.min(), yy1.max())) plt.show() out = [] for index, outlier in enumerate(outliers): row = [ index, outlier, topics[index][1], int(np.round(clusinp[index][0])), int(np.round(100 * clusinp[index][1])) ] #row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),clusinp[index][1]] if outlier < cut and index != 0 and row[3] > 8: out.append(row) #print index,outlier,topics[index][2],clusinp[index][0],clusinp[index][1] #out=sorted(out,operator.itemgetter(4)) #out.sort() out = sorted(out, key=lambda x: -x[4]) return out
def outliers_from_ellipticEnvelope():
    from sklearn.covariance import EllipticEnvelope
    env = EllipticEnvelope()
    env.fit(features_pca)
    outlier_pred = env.decision_function(features_pca).ravel()
    return outlier_pred
def perform_robust_covariance_novelty_detection(data): ''' With the five patterns' counts, this method performs Robust Covariance that can help concentrate on a relevant cluster when outlying points exist. The experimentation is performed with different time chunks and number of sequences. ''' # Importing necessary libraries from sklearn.covariance import EllipticEnvelope from sklearn.model_selection import train_test_split X = data.iloc[:, 0:5].values pca = PCA(n_components=2) X = pca.fit(StandardScaler().fit_transform(X)).transform( StandardScaler().fit_transform(X)) # Spliting the observations into 75% training and 25% testing X_train, X_test = train_test_split(X, test_size=0.25, random_state=42) # Robust Covariance classifier intialization and generate results classifier = EllipticEnvelope(contamination=0.25) classifier.fit(X_train) Y_pred_train = classifier.predict(X_train) Y_pred_test = classifier.predict(X_test) n_error_train = Y_pred_train[Y_pred_train == -1].size n_error_test = Y_pred_test[Y_pred_test == -1].size error_train = n_error_train / Y_pred_train.shape[0] * 100 error_novel = n_error_test / Y_pred_test.shape[0] * 100 # Visualization plt.clf() myFig = plt.figure(figsize=[10, 8]) xx, yy = np.meshgrid(np.linspace(-4.5, 8.5, 500), np.linspace(-4.5, 4.5, 500)) Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') s = 60 b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='gold', s=s, edgecolors='k') plt.axis('tight') plt.legend([a.collections[0], b1, b2], [ "Learned Frontier", "Training Observations", "New Regular Observations" ], loc="best", prop=matplotlib.font_manager.FontProperties(size=14)) plt.xlabel("Error Train: %.2f%% and Error Novel Regular: %.2f%%" % (error_train, error_novel), fontsize=13, weight="bold") plt.yticks(fontsize=14) plt.xticks(fontsize=14) plt.title( 'Novelty Detection using Robust Covariance of Ransomware Families\'\nAll Sequence Counts from 15 minutes of IRP Logs', fontsize=14, weight='bold') plt.show() # Save figure myFig.savefig( 'sequence_mining_analysis/Results/novelty_detection/Robust_Covariance/15_mins_sequences_all.png', format='png', dpi=150) myFig.savefig( 'sequence_mining_analysis/Results/novelty_detection/Robust_Covariance/15_mins_sequences_all.eps', format='eps', dpi=1200)
    'timestamp', 'Sample Number', 'Seconds', 'Minutes', 'Hours', 'Date', 'Month'
], axis=1)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
#num2 = scaler.fit_transform(data.drop(['timestamp'],axis=1))
num2 = scaler.fit_transform(data)
num2 = pd.DataFrame(num2, columns=data.columns)

# %% [code] {"scrolled":true}
from sklearn.covariance import EllipticEnvelope
clf = EllipticEnvelope(contamination=.1, random_state=0)
clf.fit(num2)
ee_scores = pd.Series(clf.decision_function(num2))
ee_predict = clf.predict(num2)
ee_predict = pd.Series(ee_predict).replace([-1, 1], [1, 0])

# %% [code] {"scrolled":true}
print(ee_scores)
print(ee_predict)

# %% [markdown]
# * ee_scores contains the decision-function values (shifted negative Mahalanobis distances).<br>
# * ee_predict contains labels: after the replace above, 1 marks an outlier and 0 an inlier. <br>
# * Labels are derived from the sign of the decision function, i.e. from clf.offset_ and ee_scores.

# %% [code] {"scrolled":true}
anomaly_ind = ee_predict[ee_predict == 1].index
anomaly_ind
plt.savefig("svm_oneclass.pdf") plt.show() ################################## # Robust covariance ################################## ## Train robust covariance classifier outliers_fraction = 0.05 robust_classifier = EllipticEnvelope(contamination=outliers_fraction) robust_classifier.fit(X) ## Create a grid to draw the classifier xx, yy = np.meshgrid(np.linspace(-20, 25, 500), np.linspace(-10, 35, 500)) Z = robust_classifier.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) ## Draw the boundary plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 10), cmap=plt.cm.PuBu) a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred") plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred") ## Draw the data points b1 = plt.scatter(X_good[:, 0], X_good[:, 1], c="blueviolet", edgecolors="k") b2 = plt.scatter(X_bad[:, 0], X_bad[:, 1], c="gold", edgecolors="k") plt.title("Robust Covariance on PCA") plt.xlabel("Principal component 1") plt.ylabel("Principal component 2") plt.legend(
import ms.version
ms.version.addpkg('numpy', '1.14.2')
ms.version.addpkg('scipy', '1.0.0')
ms.version.addpkg('sklearn', '0.19.1')

import sys
import time

import numpy as np
from sklearn.covariance import EllipticEnvelope

import util

outlier_frac = 0.05
ell = EllipticEnvelope(contamination=outlier_frac)

while True:
    X_train = util.receive_point_list_from_stdin()
    X_predict = util.receive_point_list_from_stdin()
    X_train, X_predict = util.nomalize_train_evaluate_data(X_train, X_predict)

    ell.fit(X_train)
    pred = ell.predict(X_predict)
    bools = pred == -1
    decisions = ell.decision_function(X_predict)

    util.send_bool_list_to_stdout(bools)
    util.send_double_list_to_stdout(decisions)
##y_pred_outliers = clf.predict(X_outliers)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
error_train = n_error_train / X_train1.shape[0]
error_test = n_error_test / X_test1.shape[0]
print("train: {:.3f}, test: {:.3f}".format(error_train, error_test))

rc_clf = EllipticEnvelope(contamination=0.05)
rc_clf.fit(X_train1)
y_pred_train_rc = rc_clf.predict(X_train1)
y_pred_test_rc = rc_clf.predict(X_test1)
scores_pred_train_rc = rc_clf.decision_function(X_train1)
scores_pred_test_rc = rc_clf.decision_function(X_test1)
##y_pred_outliers = clf.predict(X_outliers)
n_error_train_rc = y_pred_train_rc[y_pred_train_rc == -1].size
n_error_test_rc = y_pred_test_rc[y_pred_test_rc == -1].size
error_train_rc = n_error_train_rc / X_train1.shape[0]
error_test_rc = n_error_test_rc / X_test1.shape[0]
print("train: {:.3f}, test: {:.3f}".format(error_train_rc, error_test_rc))

"""
if_clf = IsolationForest(max_samples='auto', contamination=0.05, random_state=rng)
if_clf.fit(X_train1)
def fuse_to_get_results(self, weights, num_comp):
    if weights[0] != 0:
        self.apply_pca(num_comp)
        # Make sure you apply PCA before using the Envelope -- it is very sensitive to the feature dimensions
        clf_een = EllipticEnvelope(store_precision=True, assume_centered=False,
                                   support_fraction=0.25, contamination=0.1,
                                   random_state=True)
        # Fitting the model on reduced dimensionality
        clf_een.fit(self.gen_tr_data)
        # The anomaly score of the input samples. The lower, the more abnormal.
        pred_gen_scores_ee = clf_een.decision_function(self.gen_ts_data)
        pred_imp_scores_ee = clf_een.decision_function(self.imp_ts_data)
        pred_scores_ts_ee = np.concatenate((pred_gen_scores_ee, pred_imp_scores_ee))
        norm_scores_ee = self.mymm_scaler(pred_scores_ts_ee)
    else:
        norm_scores_ee = self.fill_sc_with_zero(
            np.concatenate((self.get_gen_ts_labels(), self.get_imp_ts_labels())))

    if weights[1] != 0:
        # Make sure you apply PCA before using the envelope -- it is very sensitive to the feature dimensions
        clf_if = IsolationForest(max_samples="auto", contamination=0.2, random_state=True)
        # Fitting the model on reduced dimensionality
        clf_if.fit(self.gen_tr_data)
        # The anomaly score of the input samples. The lower, the more abnormal.
        pred_gen_scores_if = clf_if.decision_function(self.gen_ts_data)
        pred_imp_scores_if = clf_if.decision_function(self.imp_ts_data)
        # print('pred_gen_scores_if', self.mymm_scaler(pred_gen_scores_if))
        # print(clf_if.predict(self.gen_ts_data))
        # print('pred_imp_scores_if', self.mymm_scaler(pred_imp_scores_if))
        # print(clf_if.predict(self.imp_ts_data))
        pred_scores_ts_if = np.concatenate((pred_gen_scores_if, pred_imp_scores_if))
        norm_scores_if = self.mymm_scaler(pred_scores_ts_if)
        # print('norm_scores_if', norm_scores_if)
        # print('plabel', np.concatenate((clf_if.predict(self.gen_ts_data), clf_if.predict(self.imp_ts_data))))
    else:
        norm_scores_if = self.fill_sc_with_zero(
            np.concatenate((self.get_gen_ts_labels(), self.get_imp_ts_labels())))

    if weights[2] != 0:
        num_neighbors = 35
        clf_lof = LocalOutlierFactor(n_neighbors=num_neighbors, metric='l2', contamination=0.25)
        X = np.concatenate((self.gen_tr_data, self.gen_ts_data))
        X_all = np.concatenate((X, self.imp_ts_data))
        pred_all_score = clf_lof.fit_predict(X_all)
        # print('pred_all_score')
        # print(pred_all_score)
        pred_scores_ts_lof = pred_all_score[range(len(self.gen_tr_data), len(pred_all_score)), ]
        norm_scores_lof = self.mymm_scaler(pred_scores_ts_lof)
    else:
        norm_scores_lof = self.fill_sc_with_zero(
            np.concatenate((self.get_gen_ts_labels(), self.get_imp_ts_labels())))

    if weights[3] != 0:
        # Make sure you apply PCA before using the envelope -- it is very sensitive to the feature dimensions
        clf_svm1c = svm.OneClassSVM(kernel='rbf', degree=3, gamma=0.001, coef0=0.0,
                                    tol=0.00001, nu=0.001, shrinking=True,
                                    cache_size=200, verbose=False, max_iter=-1,
                                    random_state=True)
        # Fitting the model on reduced dimensionality
        clf_svm1c.fit(self.gen_tr_data)
        # The anomaly score of the input samples. The lower, the more abnormal.
        pred_gen_scores_svm = clf_svm1c.decision_function(self.gen_ts_data)
        pred_imp_scores_svm = clf_svm1c.decision_function(self.imp_ts_data)
        pred_scores_ts_svm = np.concatenate((pred_gen_scores_svm, pred_imp_scores_svm))
        norm_scores_svm = self.mymm_scaler(pred_scores_ts_svm)
    else:
        norm_scores_svm = self.fill_sc_with_zero(
            np.concatenate((self.get_gen_ts_labels(), self.get_imp_ts_labels())))

    # Score level fusion
    pred_ts_labels = []
    fused_scores = []
    for ees, ifs, lofs, svms in zip(norm_scores_ee, norm_scores_if, norm_scores_lof, norm_scores_svm):
        cfscore = (weights[0] * ees + weights[1] * ifs + weights[2] * lofs + weights[3] * svms) / sum(weights)
        fused_scores.append(cfscore)
        if cfscore < self.threshold:
            pred_ts_labels.append(-1)
        else:
            pred_ts_labels.append(1)

    act_ts_labels = np.concatenate((self.get_gen_ts_labels(), self.get_imp_ts_labels()))
    tn, fp, fn, tp = confusion_matrix(act_ts_labels, pred_ts_labels).ravel()
    far = fp / (fp + tn)
    frr = fn / (fn + tp)
    pr = tp / (tp + fp)
    final_score_table = [norm_scores_ee, norm_scores_if, norm_scores_lof,
                         norm_scores_svm, fused_scores, act_ts_labels]
    # EE scores
    print(norm_scores_ee)
    # IF scores
    print(norm_scores_if)
    # LOF values are 0/1 labels
    print(norm_scores_lof)
    # SVM scores
    print(norm_scores_svm)
    # the fused values are also scores
    print(fused_scores)
    # labels
    print(act_ts_labels)
    return far, frr, pr, final_score_table
plt.figure(15)
for l in set(L):
    p = (L == l)
    if l == -1:
        color = 'r'
    else:
        color = colors[l]
    plt.plot(rcp_concat[p, 0], rcp_concat[p, 1], 'o', c=color, markersize=10)
plt.show()

# -17- #
anom_perc = 20  # original 20
clf = EllipticEnvelope(contamination=.1)
clf.fit(rcp_concat)
clf.decision_function(rcp_concat).ravel()
pred = clf.decision_function(rcp_concat).ravel()
threshold = stats.scoreatpercentile(pred, anom_perc)
Anom = pred > threshold
print(Anom)

Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(16)
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.plot(rcp_concat[:, 0], rcp_concat[:, 1], 'ko')
plt.show()
plt.savefig("../imagens/anomaly/ex17_20.png")

# End
wait_for_enter("END!")
# Compare given classifiers under given settings
#xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)

# Fit the problem with varying cluster separation
np.random.seed(42)
# Data generation

# Fit the model with the One-Class SVM
#plt.figure(figsize=(10, 5))
clf = EllipticEnvelope(contamination=.1)
# fit the data and tag outliers
clf.fit(XY)
y_pred = clf.decision_function(XY).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
y_pred = y_pred > threshold

# plot the levels lines and the points
#Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
#Z = Z.reshape(xx.shape)

df_outlier = df[~y_pred]
df_feedback = df_outlier[
    (df_outlier["usage proportion"] > df["usage proportion"].median())
    & (df_outlier["usage_percentage"] > df["usage_percentage"].median())]
feedback_homes = df_feedback["home"].values
extra_pred = np.setdiff1d(feedback_homes, submetered_homes_feedback)
missed = np.setdiff1d(submetered_homes_feedback, feedback_homes)
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import numpy as np
from sklearn.covariance import EllipticEnvelope
import matplotlib.pyplot as plt

X1 = np.loadtxt('slocbool.txt')

ee = EllipticEnvelope(support_fraction=1., contamination=0.02)
xx, yy = np.meshgrid(np.linspace(0, 1500000, 542), np.linspace(0, 15000, 542))

ee.fit(X1)
Z = ee.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.title("Outlier detection: SLOC vs BOOL")
plt.scatter(X1[:, 0], X1[:, 1], color='black')
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='m')
plt.ylabel("count of boolean expressions")
plt.xlabel("count of source lines of code")
plt.show()
# visualize the cluster
df_class.hist(bins=32)
# add
df_classes.append(df_class)

#%%
# apply EllipticEnvelope (gaussian distribution) to each category
df_classesAnom = []
fig, ax = plt.subplots()
for c in df_classes:
    envelope = EllipticEnvelope(contamination=outliers_fraction)
    X_train = c.values.reshape(-1, 1)
    envelope.fit(X_train)
    c = pd.DataFrame(c)
    c['deviation'] = envelope.decision_function(X_train)
    c['anomaly'] = envelope.predict(X_train)
    a0 = c.loc[c['anomaly'] == 1, dataValues]
    b0 = c.loc[c['anomaly'] == -1, dataValues]
    ax.hist([a0, b0], bins=32, stacked=True, color=['blue', 'red'])
    df_classesAnom.append(c)

#%%
# add the data to the main df
df_class = pd.concat(df_classesAnom)
df_temp['anomaly22'] = df_class['anomaly']
df_temp['anomaly22'] = np.array(df_temp['anomaly22'] == -1).astype(int)

#%% [markdown]
# Let's visualize the tagged anomaly points throughout time
x, y = find_boundary(X_transformed[kclusters == i, 0],
                     X_transformed[kclusters == i, 1], 5)
plt.plot(x, y, '-k', lw=2., color=cluster_color)

# create a mesh to plot in
h = .02  # step size in the mesh
x_min, x_max = X_transformed[kclusters == i, 0].min() - 1, X_transformed[kclusters == i, 0].max() + 1
y_min, y_max = X_transformed[kclusters == i, 1].min() - 1, X_transformed[kclusters == i, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

clf = EllipticEnvelope(contamination=.1)
clf.fit(X_transformed[kclusters == i])
pred = clf.decision_function(X_transformed[kclusters == i]).ravel()
threshold = stats.scoreatpercentile(pred, 100 * outliers_fraction)
print("INFO: Cluster: ", i, " Threshold: ", threshold)

Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# plt.contour(xx, yy, Z,
#             levels=[threshold],
#             linewidths=2,
#             linestyles='solid',
#             colors=(cluster_color,))
# plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],
#              colors='orange')