test = alt_df.iloc[14 + os:24 + os]
test = test.append(rand_signs)
#print(test)
#test=pd.DataFrame(test.append(alt_df.iloc[888]))
#for i in range(1,21):
#    test=test.append(alt_df.iloc[i*30])
test = test.drop("Writer_no", axis=1)
test = test.drop("Sample_no", axis=1)
print(test.shape)

# In[376]:

clf = svm.OneClassSVM(nu=best_nu, kernel="rbf", gamma=best_gamma)
clf.fit(data)
preds = clf.predict(test)
print(preds)

pdf1 = clf.decision_function(test[0:10])
pdf2 = clf.decision_function(test[10:])
pdf = clf.decision_function(test)

# ## Probability Density Function of Real Signatures

# In[377]:

pd.DataFrame(pdf1).plot(kind="density", figsize=(5, 5))
plt.show()
# read binary data
feature_folder = os.listdir(filepath)
length = len(feature_folder)
feature_folder.sort(key=lambda i: int(re.match(r'(\d+)', i).group()))
for id in feature_folder:
    filepath_ = os.path.join(filepath, id)
    f = open(filepath_, "rb")
    # read all bytes into a string
    s = f.read()
    f.close()
    (n, c, l, h, w) = array.array("i", s[:20])
    feature_vec = np.array(array.array("f", s[20:]))
    li.append(feature_vec)
X_train = np.array(li)

# fit the model
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train)
# predictions are -1 (abnormal) or 1 (normal): 1 if the sample belongs to the cluster, -1 otherwise
y_pred_train = clf.predict(X_train)
print(y_pred_train)
normal = X_train[y_pred_train == 1]
abnormal = X_train[y_pred_train == -1]
print(normal)
print(abnormal)
print(normal.shape)
print(abnormal.shape)
print("labels_true")
print(labels_true)
plt.plot(normal[:, 0], normal[:, 1], 'bx')
from sklearn import svm

xx, yy = np.meshgrid(np.linspace(-10, 10, 500), np.linspace(-10, 10, 500))

# Generate train data
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 5, X - 5]
X_train1 = np.r_[X, X]
# Generate some regular novel observations
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 5, X - 5]
X_test1 = np.r_[X, X]
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = svm.OneClassSVM(nu=0.1, kernel="linear", gamma=0.1)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_test1 = clf.predict(X_test1)
y_pred_outliers = clf.predict(X_outliers)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# plot the line, the points, and the nearest vectors to the plane
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
def train(train_set):
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='auto')
    clf.fit(train_set)
    return clf
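# A minimal, self-contained usage sketch for the train() helper above. The
# 2-D Gaussian blob is illustrative only and stands in for real training
# features; it is not part of the original code.
import numpy as np
from sklearn import svm

normal_data = np.random.randn(200, 2)          # hypothetical inlier features
clf = train(normal_data)                       # fit the one-class SVM
preds = clf.predict(normal_data)               # +1 = inlier, -1 = outlier
print("fraction flagged as outliers:", (preds == -1).mean())  # roughly nu = 0.1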
import matplotlib.font_manager
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1),
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2)
# split X_train into normal samples and outliers
X_train_normal = X_train[X_train['label_filled'] == 0].drop("label_filled", axis=1, inplace=False)
#X_train_outliers = X_train[X_train['label_filled'] == 1].drop("label_filled", axis=1, inplace=False)
X_test = X_test.drop("label_filled", axis=1, inplace=False)
X_train = X_train.drop("label_filled", axis=1, inplace=False)
print("Load data done.")
#print(X_train.shape, X_train_normal.shape, X_train_outliers.shape)

# fit model
clf = svm.OneClassSVM(nu=0.05, kernel="rbf", gamma="auto")  # nu is the expected fraction of outliers
clf.fit(X_train_normal)

# predict
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print(y_test[:10])
print(y_pred_test[:10])

# map predicted labels (1, -1) to (0, 1)
y_pred_train = np.where(y_pred_train > 0, 0, 1)
y_pred_test = np.where(y_pred_test > 0, 0, 1)

# print result
print("train data classification report: ")
def OC_SVM_linear(dataset, model_type, class_number, hyper_para):
    _, _, relu, mean, cov, imagenet_mean, imagenet_std, _ = get_fuv(hyper_para, model_type)

    if(hyper_para.verbose==True):
        print('Loading dataset '+dataset+'...')
    train_data, test_data, test_label = load_dataset(dataset, class_number, imagenet_mean, imagenet_std, hyper_para)
    if(hyper_para.verbose==True):
        print(dataset+' dataset loaded.')
    no_train_data = np.shape(train_data.numpy())[0]
    no_test_data = np.shape(test_data.numpy())[0]

    ### choose one network which produces D dimensional features
    model = choose_network(model_type, hyper_para.pre_trained_flag)

    ### training on gpu
    if(hyper_para.gpu_flag):
        relu.cuda()
        model.cuda()
    model.eval()
    relu.eval()

    if(hyper_para.verbose==True):
        print('Extracting training features...')
    train_features = np.memmap('../../temp_files/train_features_temp.bin', dtype='float32', mode='w+',
                               shape=(no_train_data, hyper_para.D))
    train_features = torch.from_numpy(train_features)
    for i in range(no_train_data):
        train_features[i:(i+1)] = (model(torch.autograd.Variable(
            train_data[i:(i+1)].cuda().contiguous().float(), volatile=True)).float()).data.cpu()
    train_data = None
    if(hyper_para.verbose==True):
        print('Features extracted.')

    if(hyper_para.verbose==True):
        print('Training one class SVM with linear kernel...')
    # train one-class svm
    oc_svm_clf = svm.OneClassSVM(kernel='linear', nu=float(hyper_para.N))
    # oc_svm_clf.fit(train_features)
    oc_svm_clf.fit(train_features.numpy())
    if(hyper_para.verbose==True):
        print('One class SVM with linear kernel trained.')

    ## test on the test set
    test_features = np.memmap('../../temp_files/test_features_temp.bin', dtype='float32', mode='w+',
                              shape=(no_test_data, hyper_para.D))
    test_scores = np.memmap('../../temp_files/test_scores_temp.bin', dtype='float32', mode='w+',
                            shape=(no_test_data, 1))
    test_features = torch.from_numpy(test_features)
    k = 0
    mean_kwn = np.zeros((no_test_data, 1))
    for j in range(no_test_data):
        temp = (model(torch.autograd.Variable(
            test_data[j:(j+1)].cuda().contiguous().float(), volatile=True)).float())
        test_features[k:(k+1)] = temp.data.cpu()
        temp = np.reshape((temp).data.cpu().numpy(), (1, hyper_para.D))
        test_scores[k:(k+1)] = oc_svm_clf.decision_function(temp)[0]
        k = k+1
    test_features = test_features.numpy()
    train_features = train_features.numpy()

    fpr, tpr, thresholds = metrics.roc_curve(test_label, test_scores)
    area_under_curve = metrics.auc(fpr, tpr)

    joblib.dump(oc_svm_clf, '../../save_folder/saved_models/'+dataset+'/classifier/'+str(class_number)
                +'/'+model_type+'_OCSVMlin_'+str(hyper_para.N)+'.pkl')
    scipy.io.savemat('../../save_folder/results/'+dataset+'/'+str(class_number)+'/'
                     +model_type+'_OCSVMlin_'+str(hyper_para.N)+'.mat',
                     {'train_features': train_features,
                      'test_features': test_features,
                      'test_label': test_label,
                      'test_scores': test_scores})

    return area_under_curve
import numpy as np
from sklearn import svm
import cv2
import calHistogramOpticalFlow as chof

train_path = "/home/kun/data/UCSD/UCSDped1/Train/train_encoder_feature/train_encoder_patch_104.npy"
test_path = "/home/kun/data/UCSD/UCSDped1/Test/test_encoder_feature/test_encoder_patch_104.npy"
# train_path = "UCSD/UCSDped1/train/train_feature/train_patch_90.npy"
# test_path = "UCSD/UCSDped1/test/test_feature/test_patch_90.npy"

train_x = np.load(train_path)
test_x = np.load(test_path)

clf = svm.OneClassSVM(nu=0.01, kernel='rbf', gamma=0.6)
clf.fit(train_x)
y_pred_train = clf.predict(train_x)
y_pred_test = clf.predict(test_x)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
print(n_error_train)

# for i in range(y_pred_test.shape[0]):
#     if y_pred_test[i] == -1:
#         file = i / 195 + 1
#         frame = i % 195 + 1
#         image_path = "/home/kun/data/UCSD/UCSDped1/Test/Test%03d/%03d.tif" % (file, frame)
X = X.transpose()

# Remove rows with 0 (NA) for wetCode
X_train = X[X[:, -1] != 0]
# Remove non-finite values
X_train = X_train[np.isfinite(X_train).all(axis=1)]
# Split into variables (X) and class (y)
y_train = X_train[:, -1]
X_train = X_train[:, 0:-1]

# Train one-class SVM classifier
print('define clf')
# clf = svm.OneClassSVM(kernel='rbf', nu=0.2, gamma='auto', verbose=False)
clf = svm.OneClassSVM(kernel='poly', nu=0.1, gamma='auto', verbose=True)
print('fit clf')
clf.fit(X_train, y_train)

# Set NaN values to 0
X = np.where(np.isfinite(X), X, 0)

# Apply classification
print('apply classification')
predictClass = clf.predict(X[:, 0:-1])

# Write out data to RAT
print('write RAT')
rat.writeColumn(ratDataset, 'predictClass', predictClass)
ratDataset = None
Xtest.append(dataset_task1[int(0.6*dataset_task1.shape[0])+1:dataset_task1.shape[0]])
Xtrain.append(dataset_task2[0:int(0.6*dataset_task2.shape[0])])
Xtest.append(dataset_task2[int(0.6*dataset_task2.shape[0])+1:dataset_task2.shape[0]])

for param_kernel in kernel:
    save_path = f'{path}/{param_kernel}/'
    for param_nu in nu:
        for param_gamma in gamma:
            if(param_kernel != 'poly'):
                print(f'kernel={param_kernel} - gamma={param_gamma} - nu={param_nu}')
                clfs = []
                for i in range(2):
                    clfs.append(svm.OneClassSVM(nu=param_nu, kernel=param_kernel, gamma=param_gamma))
                    clfs[i].fit(Xtrain[i])
                    pkl_filename = f'{save_path}/svm_model_3seq_T{i}.pkl'
                    with open(pkl_filename, 'wb') as file:
                        pickle.dump(clfs[i], file)
                conf_matrix = compute_confusion_matrix(clfs)
                fig = plt.gcf()
                ax = plt.subplot()
                sns.heatmap(conf_matrix, annot=True, ax=ax, cmap="YlGnBu")
                ax.xaxis.set_ticklabels(['T1', 'T2'])
                ax.yaxis.set_ticklabels(['T1', 'T2'])
                plt.savefig(f'{save_path}/FINAL_n{param_nu}_r{param_gamma}.png')
                plt.clf()
            else:
                for param_degree in degree:
                    print(f'kernel={param_kernel} - gamma={param_gamma} - nu={param_nu} - d={param_degree}')
                    clfs = []
def GetLabel(self, X):
    '''------------------OSVM--------------------------'''
    from sklearn import svm
    # use the same dataset
    clf = svm.OneClassSVM(nu=0.05, kernel="rbf", gamma=0.1)
    clf.fit(X)
    osvm = clf.predict(X)  # inliers are labeled 1, outliers are labeled -1
    normal = X[osvm == 1]
    abnormal = X[osvm == -1]

    '''---------------------IForest--------------------------'''
    from sklearn.ensemble import IsolationForest
    data = pd.DataFrame(X, columns=["Price", "Time"])
    # train isolation forest
    model = IsolationForest(contamination=0.1)
    model.fit(data)
    data['IForest'] = pd.Series(model.predict(data))
    # visualization

    '''---------------------KNN--------------------------'''
    # train kNN detector
    from pyod.models.knn import KNN
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X)
    # get the prediction labels and outlier scores of the training data
    ss = clf.labels_  # binary labels (0: inliers, 1: outliers)
    #y_train_scores = clf.decision_scores_  # raw outlier scores
    data['OSVM'] = osvm
    data['KNN'] = ss
    # Recode the KNN labels to match the other detectors (1: inlier, -1: outlier)
    data.loc[(data.KNN == 0), 'KNN'] = '1'
    data.loc[(data.KNN == 1), 'KNN'] = '-1'
    data['KNN'] = data['KNN'].astype(int)
    data['OSVM'] = data['OSVM'].astype(int)
    data['IForest'] = data['IForest'].astype(int)
    # Majority vote across the three detectors
    data['RES'] = data.OSVM + data.IForest + data.KNN
    # Operation to map the vote in RES to a binary label
    data.loc[data.RES == 1, 'RES'] = 0
    data.loc[data.RES == 3, 'RES'] = 1
    data.loc[data.RES == -3, 'RES'] = 0
    data.loc[data.RES == -1, 'RES'] = 0

    x = data.iloc[:, [0, 1]].values
    y = data.iloc[:, [5]].values
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        x, y, test_size=0.3, random_state=100)
    return self.X_train, self.X_test, self.y_train, self.y_test
def sample_svdd(x_train,
                outlier_fraction=0.001,
                kernel_s=2,
                maxiter=1000,
                sample_size=10,
                resample_n=3,
                stop_tol=1e-6,
                n_iter=30,
                iter_history=True,
                seed=2513646):
    """
    Perform sampling-based approximate SVDD.

    Input Parameters:
        x_train: input data to train on, must be a two-dimensional numpy array
        kernel_s: the bandwidth for the Gaussian kernel; the kernel is assumed
            to be of the form exp(-||x - y||^2 / (2 * kernel_s^2))
        sample_size: the size of each random sample
        resample_n: take this many samples in each iteration and merge the union
            of their support vectors with the master; the method documented in
            the paper corresponds to resample_n = 1
        stop_tol: the tolerance value used to detect convergence
        n_iter: the radius and center must stay close to each other for this many
            consecutive iterations before convergence is declared
        iter_history: flag determining whether the convergence history is stored
        seed: seed value for the random number generator

    Output:
        The output is a named tuple. If the output is denoted by res, then:
        res.IterHist: a named tuple containing the iteration history
            res.IterHist.niter_: number of iterations until convergence
            res.IterHist.radius_history_: the iteration history of the radius
            res.IterHist.center_history_: the iteration history of the center
            res.IterHist.converged_: convergence status flag
        res.Params: a named tuple containing the output parameters of the fitted SVDD
            res.Params.sv_: the indices of the fitted support vectors
            res.Params.center_: final center point
            res.Params.radius_: final radius
        res.OneClassSVM: a sklearn.svm.OneClassSVM instance corresponding to the
            result; can be used for scoring.
    """
    # Only matrix input allowed
    if len(x_train.shape) != 2:
        print("ERROR: invalid x_train input found, expecting a matrix")
        raise ValueError

    # sanity checks
    if maxiter <= 0:
        print("ERROR: maxiter must be a positive integer")
        raise ValueError

    nobs = x_train.shape[0]
    if nobs <= sample_size:
        print("ERROR: sample size must be strictly smaller than the number of observations in the input data")
        raise ValueError

    # convert kernel_s to gamma
    gamma, nu = 0.5 / (kernel_s * kernel_s), outlier_fraction
    if np.isfinite(gamma) != True or np.isfinite(nu) != True or (nu < 0) or (nu > 1):
        print("ERROR: invalid kernel_s or outlier_fraction input")
        raise ValueError

    # if a negative seed is provided, use a system-chosen seed
    np.random.seed(seed=seed if seed >= 0 else None)

    if iter_history:
        radius_history, center_history = np.empty(maxiter + 1), list()

    clf = None
    sv_ind_prev, radius_prev, center_prev = _do_one_class_svm_random(
        gamma, nu, x_train, sample_size)
    if iter_history:
        radius_history[0] = radius_prev
        center_history.append(center_prev)

    i, converged, iter_n = 0, 0, 0
    while i < maxiter:
        if converged:
            break
        sv_ind_local = _do_one_class_svm_random(gamma, nu, x_train, sample_size,
                                                compute_rc=False)
        for dummy1 in range(resample_n - 1):
            sv_ind_locals = _do_one_class_svm_random(gamma, nu, x_train, sample_size,
                                                     compute_rc=False)
            sv_ind_local = np.union1d(sv_ind_locals, sv_ind_local)

        sv_ind_merge = np.union1d(sv_ind_local, sv_ind_prev)
        sv_ind_master, radius_master, center_master = _do_one_class_svm_sample(
            gamma, nu, x_train, sv_ind_merge)

        if iter_history:
            radius_history[i + 1] = radius_master
            center_history.append(center_master)

        iter_n = iter_n + 1 if np.fabs(radius_master - radius_prev) <= stop_tol * np.fabs(radius_prev) else 0
        if iter_n >= n_iter:
            converged = 1
        else:
            sv_ind_prev, center_prev, radius_prev = sv_ind_master, center_master, radius_master
        i += 1

    if iter_history:
        radius_history = radius_history[0:i + 1]
    niter = i + 1
    SampleSVDDRes = namedtuple("SampleSVDDRes", "Params IterHist OneClassSVM")
    SampleSVDDParams = namedtuple("SampleSVDDParams", "sv_ center_ radius_")
    SampleSVDDIterHist = namedtuple(
        "SampleSVDDIterHist", "niter_ radius_history_ center_history_ converged_")
    params = SampleSVDDParams(sv_ind_master, center_master, radius_master)
    iterhist = None
    if iter_history:
        iterhist = SampleSVDDIterHist(niter, radius_history, center_history, converged)

    nsv = sv_ind_master.shape[0]
    clf = svm.OneClassSVM(gamma=gamma, nu=nu if nu * nsv > 1 else 1. / nsv)
    clf.fit(x_train[sv_ind_master, ...])

    return SampleSVDDRes(params, iterhist, clf)
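# A brief usage sketch for sample_svdd(), under the assumption that the private
# helpers it calls (_do_one_class_svm_random, _do_one_class_svm_sample) are
# defined elsewhere in this module. The toy data is illustrative only.
import numpy as np

toy = np.random.randn(500, 2)                        # hypothetical 2-D training data
res = sample_svdd(toy, outlier_fraction=0.01, kernel_s=1.0, sample_size=10)
print("converged:", res.IterHist.converged_)
print("radius:", res.Params.radius_)
labels = res.OneClassSVM.predict(toy)                # score with the fitted OneClassSVM
print("flagged outliers:", np.sum(labels == -1))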
def run_main():
    import matplotlib.pyplot as plt
    import time

    # create donut-shaped data
    def one_donut(rmin, rmax, origin, nobs):
        """
        rmin: inner radius
        rmax: outer radius
        origin: origin
        nobs: number of observations in the data
        """
        r = np.sqrt(rmin * rmin + (rmax - rmin) * (rmax + rmin) * np.random.ranf(nobs))
        theta = 2 * np.pi * np.random.ranf(nobs)
        res = np.array([(r_ * np.cos(theta_), r_ * np.sin(theta_))
                        for r_, theta_ in zip(r, theta)])
        return res + origin

    seed = 24215125
    np.random.seed(seed)

    # store the time taken by the two methods
    tsample, tfull = list(), list()
    # run the method over data sets of these sizes
    dsize_list = [5000, 10000, 100000, 500000, 1000000, 1250000, 2000000]

    # this will take about 10 minutes to run
    for ndat in dsize_list:
        # parameters of the two donuts
        r_min1, r_max1, origin1, nobs1 = 3, 5, (0, 0), int(np.floor(0.75 * ndat))
        r_min2, r_max2, origin2, nobs2 = 2, 4, (10, 10), ndat - nobs1

        # create the training data
        test_data = np.append(one_donut(r_min1, r_max1, origin1, nobs1),
                              one_donut(r_min2, r_max2, origin2, nobs2),
                              axis=0)
        print('the test data has {0} observations'.format(test_data.shape[0]))

        # parameters of the training SVDD; tweak for performance/accuracy
        outlier_fraction, kernel_s = 0.0001, 1.3
        sample_size, resample_n, n_iter = 10, 1, 10
        stop_tol, maxiter = 1e-4, 5000

        # train using sampling SVDD
        start = time.time()
        result = sample_svdd(test_data,
                             outlier_fraction=outlier_fraction,
                             kernel_s=kernel_s,
                             resample_n=resample_n,
                             maxiter=maxiter,
                             sample_size=sample_size,
                             stop_tol=stop_tol,
                             n_iter=n_iter,
                             iter_history=True,
                             seed=seed)
        end = time.time()
        tsample.append(end - start)
        print("sample svdd took {0} seconds to train, iteration history stored".format(end - start))
        radius_history = result.IterHist.radius_history_
        sv_indices = result.Params.sv_

        # train using full SVDD
        start = time.time()
        clf1 = svm.OneClassSVM(
            nu=(outlier_fraction
                if test_data.shape[0] * outlier_fraction > 1
                else 1. / test_data.shape[0]),
            kernel="rbf",
            gamma=0.5 / (kernel_s * kernel_s))
        clf1.fit(test_data)
        end = time.time()
        tfull.append(end - start)
        print("full svdd took {0} seconds to train".format(end - start))

        # plot the support vectors
        plt.figure(1)
        plt.grid(True)
        plt.title('Support Vectors (Sampling Method)')
        plt.scatter(test_data[sv_indices, 0], test_data[sv_indices, 1])
        plt.show()

        plt.figure(2)
        plt.grid(True)
        plt.title('Support Vectors (Full SVDD)')
        plt.scatter(clf1.support_vectors_[..., 0], clf1.support_vectors_[..., 1])
        plt.show()

        plt.figure(3)
        plt.title('Iteration History for Sampling Method')
        plt.plot(radius_history)
        plt.show()

        # create a 200 x 200 grid on the bounding rectangle of the training data for scoring
        ngrid = 200
        max_x, max_y = np.amax(test_data, axis=0)
        min_x, min_y = np.amin(test_data, axis=0)
        x_ = np.linspace(min_x, max_x, ngrid)
        y_ = np.linspace(min_y, max_y, ngrid)
        x, y = np.meshgrid(x_, y_)
        score_data = np.array([(x1, y1) for x1, y1 in zip(x.ravel(), y.ravel())])

        # the OneClassSVM result corresponding to the sampling method
        clf2 = result.OneClassSVM
        scores1 = clf1.predict(score_data)
        scores2 = clf2.predict(score_data)

        # plot the scored data
        plt.figure(4)
        p2 = np.where(scores2 == 1)
        plt.grid(True)
        plt.title("Scoring Results: Inside Points Colored Green (using sampling svdd)")
        plt.scatter(score_data[p2, 0], score_data[p2, 1], color='g', s=0.75)
        plt.show()

        plt.figure(5)
        p1 = np.where(scores1 == 1)
        plt.grid(True)
        plt.title("Scoring Results: Inside Points Colored Green (using full svdd)")
        plt.scatter(score_data[p1, 0], score_data[p1, 1], color='g', s=0.75)
        plt.show()

    plt.figure(6)
    plt.grid(True)
    plt.title("Sampling SVDD Performance. Sample Size {0}".format(sample_size))
    plt.xlabel("Input Data Size")
    plt.ylabel("Time Taken (in seconds)")
    plt.plot(dsize_list, tsample)

    plt.figure(7)
    plt.grid(True)
    plt.title("Full SVDD Performance")
    plt.xlabel("Input Data Size")
    plt.ylabel("Time Taken (in seconds)")
    plt.plot(dsize_list, tfull)
##############
data.to_csv('with_nbhd.csv', index=False)

##############
# MODEL
##############
sample_columns = ['DAY', 'HOUR', 'WEATHER', 'NBHD', 'SEVERITYCODE']
sample_data = data[sample_columns]

# OneClassSVM
svm_data = sample_data[sample_data.SEVERITYCODE == 3].drop('SEVERITYCODE', axis=1)
model = svm.OneClassSVM()
%time model.fit(svm_data)
%time pd.value_counts(model.predict(svm_data))
# drop SEVERITYCODE before predicting so the feature set matches the training data
%time pd.value_counts(model.predict(sample_data[sample_data.SEVERITYCODE < 3].drop('SEVERITYCODE', axis=1)))

preds = model.predict(sample_data[sample_data.SEVERITYCODE < 3].drop('SEVERITYCODE', axis=1))
predicted_dangerous = data[data.SEVERITYCODE < 3][preds == 1]
predicted_dangerous.groupby('DAY').size()
predicted_dangerous.groupby('NBHD').size()
pd_percent = predicted_dangerous.groupby('HOUR').size() / predicted_dangerous.groupby('HOUR').size().sum() * 100
sns.lineplot(x=list(range(1, 23)), y=pd_percent)
def osvmClassification(nu, x_train_p, x_test, y_train, y_test):
    clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma=0.1)
    clf.fit(x_train_p)
    y_pred = clf.predict(x_test)
    accuracy = np.sum(y_pred == y_test) / len(y_pred)
    return clf, accuracy
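# An illustrative call of osvmClassification() above: y_test must use the same
# +1/-1 convention that OneClassSVM.predict returns, otherwise the accuracy
# comparison is meaningless. The data below is a hypothetical stand-in.
import numpy as np

x_train_p = np.random.randn(100, 3)                 # inlier-only training set
x_test = np.vstack([np.random.randn(40, 3),         # 40 inliers...
                    np.random.randn(10, 3) + 6])    # ...and 10 obvious outliers
y_test = np.array([1] * 40 + [-1] * 10)             # +1 = inlier, -1 = outlier
clf, acc = osvmClassification(0.1, x_train_p, x_test, None, y_test)
print("accuracy:", acc)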
y_pred_train_reshape, y_pred_test_reshape, y_train_reshape, y_test_reshape = train_model(
    model_svm, to_remove, x_class, y_class)
get_scores(y_pred_train_reshape, y_pred_test_reshape, y_train_reshape, y_test_reshape)

model_rfc = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')
y_pred_train_reshape, y_pred_test_reshape, y_train_reshape, y_test_reshape = train_model(
    model_rfc, to_remove, x_class, y_class)
get_scores(y_pred_train_reshape, y_pred_test_reshape, y_train_reshape, y_test_reshape)

# Now let's do unsupervised learning: it needs neither subsampling nor a train/test split.
# one-class SVM anomalies
clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1)
fit_model(clf, x_class, y_class)
fit_model(clf, x, y)

# Split into anomaly and normal examples
clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1)
fit_novelty_model(clf, x_class, y_class)
clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1)
fit_novelty_model(clf, x, y)

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
fit_model_loc(clf, x_class, y_class)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
fit_model_loc(clf, x, y)
def _build_classifiers(self):
    # Train on the first 20% of images and create empty lists of features
    to_analyse = int(len(self.all_imgs) * 0.2)
    hu_feas = []
    areas = []
    lengths = []
    initial_areas = []
    initial_lengths = []
    initial_hu_feas = []

    # Create empty lists of colour features if using colour
    if self.use_colour:
        colors_r = []
        colors_g = []
        colors_b = []

    # For all images and masks
    for index, (mask, img_f) in enumerate(list(zip(self.panel_masks[:], self.all_imgs[:]))):
        # Read the respective image and create empty lists of features for this image
        img = imread(img_f)
        hu_feas_labelled = []
        areas_labelled = []
        lengths_labelled = []

        # Crop the image to just the panel, get region properties of objects in the panel
        img = self.panel.get_bbox_image(img)
        c_label, c_rprops = simple_label_next_frame(self.panel_labels, self.panel_regionprops, mask)

        # For each seed found in the panel
        for idx, rp in enumerate(c_rprops):
            # If colour is being used, append the respective colour channel lists with
            # RGB values from the colour histogram function
            if self.use_colour:
                r, g, b = self.generate_color_histogram(img, rp)
                colors_r.append(r)
                colors_g.append(g)
                colors_b.append(b)
            # Append the features of the seed (Hu moments, area, major axis length,
            # minor axis length, minor/major axis length ratio) to a list of seed features
            hu_feas.append(rp.moments_hu)
            hu_feas_labelled.append(np.hstack((rp.moments_hu, rp.label)))
            areas.append(rp.area)
            areas_labelled.append([rp.area, rp.label])
            lengths.append([
                rp.minor_axis_length,
                rp.major_axis_length,
                float(rp.minor_axis_length + 1.0) / float(rp.major_axis_length + 1.0)
            ])
            lengths_labelled.append(np.hstack(([
                rp.minor_axis_length,
                rp.major_axis_length,
                float(rp.minor_axis_length + 1.0) / float(rp.major_axis_length + 1.0)
            ], rp.label)))

        # Append the list of seed features for this image to a list of all images' seed features
        initial_areas.append(np.array(areas_labelled))
        initial_lengths.append(np.array(lengths_labelled))
        initial_hu_feas.append(np.array(hu_feas_labelled))

    areas = np.vstack(areas)
    hu_feas = np.vstack(hu_feas)
    lengths = np.vstack(lengths)

    if self.use_delta:
        self.delta_area = np.zeros((areas.shape[0], 1))
        self.delta_hu_feas = np.zeros((hu_feas.shape[0], 7))
        self.delta_lengths = np.zeros((lengths.shape[0], 3))
        counter = 0
        # For i in total number of images
        for i in range(len(initial_areas)):
            # For j in largest seed label
            for j in range(int(np.max(initial_areas[i][:, 1]))):
                # If first image
                if i == 0:
                    # If seed label is present in the current image array
                    if np.isin(j + 1, initial_areas[i][:, 1]):
                        id = j + 1
                        if np.isin(id, initial_areas[i + 1][:, 1]):
                            curr_arr = initial_areas[i][:, 1]
                            curr = np.argwhere(curr_arr == id)
                            next_arr = initial_areas[i + 1][:, 1]
                            next = np.argwhere(next_arr == id)
                            # As the delta for the first image is undefined, set it to the
                            # difference between the first and second image
                            self.delta_area[counter, 0] = np.abs(
                                initial_areas[i + 1][next, 0] - initial_areas[i][curr, 0])
                            self.delta_lengths[counter, :] = np.abs(
                                initial_lengths[i + 1][next, :3] - initial_lengths[i][curr, :3])
                            self.delta_hu_feas[counter, :] = np.abs(
                                initial_hu_feas[i + 1][next, :7] - initial_hu_feas[i][curr, :7])
                            counter += 1
                else:
                    # If seed label is present in the current image array
                    if np.isin(j + 1, initial_areas[i][:, 1]):
                        id = j + 1
                        # Get indices of the same seed in the previous and current image arrays
                        curr_arr = initial_areas[i][:, 1]
                        curr = np.argwhere(curr_arr == id)
                        prev_arr = initial_areas[i - 1][:, 1]
                        prev = np.argwhere(prev_arr == id)
                        if curr.size != prev.size:
                            # If a seed disappears or is new, set its delta to the mean of the other seeds
                            self.delta_area[counter, 0] = np.mean(self.delta_area[0:counter, 0])
                            self.delta_lengths[counter, :] = np.mean(self.delta_lengths[0:counter, :])
                            self.delta_hu_feas[counter, :] = np.mean(self.delta_hu_feas[0:counter, :])
                            counter += 1
                        else:
                            # Create delta features, i.e. seed feature from this image minus
                            # seed feature from the previous image
                            self.delta_area[counter, 0] = np.abs(
                                initial_areas[i][curr, 0] - initial_areas[i - 1][prev, 0])
                            self.delta_lengths[counter, :] = np.abs(
                                initial_lengths[i][curr, :3] - initial_lengths[i - 1][prev, :3])
                            self.delta_hu_feas[counter, :] = np.abs(
                                initial_hu_feas[i][curr, :7] - initial_hu_feas[i - 1][prev, :7])
                            counter += 1

    # Get the number of seeds to train on
    to_analyse = np.sum(item.shape[0] for item in initial_areas[:to_analyse])

    # Create an array containing seed features from all images
    if self.use_delta:
        self.all_data = np.hstack([
            hu_feas, self.delta_hu_feas, areas, self.delta_area, lengths, self.delta_lengths
        ])
    else:
        self.all_data = np.hstack([hu_feas, areas, lengths])

    # Create training data for the one-class SVM
    if self.use_delta:
        hu_feas = np.hstack([
            hu_feas[:to_analyse, :], self.delta_hu_feas[:to_analyse, :],
            areas[:to_analyse, :], self.delta_area[:to_analyse, :],
            lengths[:to_analyse, :], self.delta_lengths[:to_analyse, :]
        ])  # added in area and delta area
    else:
        hu_feas = np.hstack([
            hu_feas[:to_analyse, :], areas[:to_analyse, :], lengths[:to_analyse, :]
        ])

    if self.use_colour:
        color_feas = np.hstack([
            np.vstack(colors_r), np.vstack(colors_g), np.vstack(colors_b)
        ])

    # Normalise the Hu features and the delta mean, i.e. z = (x - mu) / sigma
    self.hu_feas_mu = hu_feas.mean(axis=0)
    self.hu_feas_stds = hu_feas.std(axis=0)
    hu_feas = (hu_feas - self.hu_feas_mu) / (self.hu_feas_stds + 1e-9)

    # Train a one-class SVM on the Hu features
    self.clf_hu = svm.OneClassSVM(nu=0.03, kernel="rbf", gamma=0.001)
    self.clf_hu.fit(hu_feas)

    # If using colour, normalise the colour histograms, i.e. z = (x - mu) / sigma
    if self.use_colour:
        self.color_feas_mu = color_feas.mean(axis=0)
        self.color_feas_stds = color_feas.std(axis=0)
        color_feas = (color_feas - self.color_feas_mu) / (self.color_feas_stds + 1e-9)
        # Train a one-class SVM on the colour features
        self.clf_color = svm.OneClassSVM(nu=0.03, kernel="rbf", gamma=0.001)
        self.clf_color.fit(color_feas)
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3], **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
def choose_classifier(dataset, class_number, model_type, model, classifier, D, hyper_para,
                      train_data, test_data, test_label, no_train_data, no_test_data,
                      inm, relu, m, s):
    if(hyper_para.verbose==True):
        print('Extracting features.....')
    train_features = np.memmap('../../temp_files/train_features_temp.bin', dtype='float32', mode='w+',
                               shape=(no_train_data, hyper_para.D))
    train_features = torch.from_numpy(train_features)
    for i in range(no_train_data):
        temp = model(torch.autograd.Variable(train_data[i:(i+1)].cuda().contiguous().float())).float()
        temp = temp.view(1, 1, hyper_para.D)
        temp = inm(temp)
        temp = relu(temp.view(hyper_para.D))
        train_features[i:(i+1)] = temp.data.cpu()
    train_data = None
    if(hyper_para.verbose==True):
        print('Features extracted.')

    ## test on the test set
    test_features = np.memmap('../../temp_files/test_features_temp.bin', dtype='float32', mode='w+',
                              shape=(no_test_data, hyper_para.D))
    test_scores = np.memmap('../../temp_files/test_scores_temp.bin', dtype='float32', mode='w+',
                            shape=(no_test_data, 1))
    test_features = torch.from_numpy(test_features)

    if(hyper_para.verbose==True):
        print('Computing test scores and AUC....')
    area_under_curve = 0.0
    if(hyper_para.classifier_type=='OC_CNN'):
        test_scores = torch.from_numpy(test_scores)
        k = 0
        print(np.shape(test_features))
        start = time.time()
        for j in range(no_test_data):
            temp = model(AddNoise(torch.autograd.Variable(
                test_data[j:(j+1)].cuda().contiguous().float()), hyper_para.sigma1)).float()
            temp = temp.view(1, 1, hyper_para.D)
            temp = inm(temp)
            temp = temp.view(hyper_para.D)
            test_features[k:(k+1)] = temp.data.cpu()
            test_scores[k:(k+1)] = classifier(relu(temp)).data.cpu()[1]
            # print(classifier(relu(temp)).data.cpu())
            k = k+1
        end = time.time()
        print(end-start)
        test_scores = test_scores.numpy()
        test_features = test_features.numpy()
        train_features = train_features.numpy()
        test_scores = (test_scores-np.min(test_scores))/(np.max(test_scores)-np.min(test_scores))
    elif(hyper_para.classifier_type=='OC_SVM_linear'):
        # train one-class svm
        oc_svm_clf = svm.OneClassSVM(kernel='linear', nu=float(hyper_para.N))
        oc_svm_clf.fit(train_features.numpy())
        k = 0
        mean_kwn = np.zeros((no_test_data, 1))
        for j in range(no_test_data):
            temp = model(torch.autograd.Variable(test_data[j:(j+1)].cuda().contiguous().float())).float()
            temp = temp.view(1, 1, hyper_para.D)
            temp = inm(temp)
            temp = temp.view(hyper_para.D)
            test_features[k:(k+1)] = temp.data.cpu()
            temp = np.reshape(relu(temp).data.cpu().numpy(), (1, hyper_para.D))
            test_scores[k:(k+1)] = oc_svm_clf.decision_function(temp)[0]
            k = k+1
        test_features = test_features.numpy()
        train_features = train_features.numpy()
        joblib.dump(oc_svm_clf, '../../save_folder/saved_models/'+dataset+'/classifier/'+str(class_number)
                    +'/'+model_type+'_OCCNNlin'+'_'+str(hyper_para.iterations)+'_'+str(hyper_para.lr)
                    +'_'+str(hyper_para.sigma)+'_'+str(hyper_para.N)+'.pkl')

    fpr, tpr, thresholds = metrics.roc_curve(test_label, test_scores)
    area_under_curve = metrics.auc(fpr, tpr)
    if(hyper_para.verbose==True):
        print('Test scores and AUC computed.')

    return area_under_curve, train_features, test_scores, test_features
def getTrainedSVM(trainingData, nuu, g):
    model = svm.OneClassSVM(nu=nuu, kernel='rbf', gamma=g)
    #trainingData = np.reshape(trainingData, (1, len(trainingData)))
    model.fit(trainingData)
    return model
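# A hedged usage sketch for getTrainedSVM() above; the data and parameter
# values are illustrative only.
import numpy as np

trainingData = np.random.randn(50, 8)             # hypothetical feature matrix
model = getTrainedSVM(trainingData, nuu=0.05, g=0.1)
scores = model.decision_function(trainingData)    # >0 inside the boundary, <0 outside
print(scores[:5])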
print(np.array(NegativeTest).shape)
print("data preprocessing complete")

start = 72
# gamma = 0.001, 0.01, 0.1, 1, 10, 100
gamma = 100
for nu in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print(" ")
    print(" ")
    print("nu = {}".format(nu))
    print(" ")
    print(" ")
    # training
    # clf = SVC(kernel="rbf", C=1.0, gamma="auto")
    clf = svm.OneClassSVM(kernel="rbf", gamma=gamma, nu=nu)
    clf.fit(PositiveTrain)
    Modalpath = modal_path + os.sep + r"model" + str(gamma) + "n" + str(nu) + r".plk"
    joblib.dump(clf, Modalpath)
    TestPredict = clf.predict(TestData)
    print("training complete")

    # evaluation
    print("mean accuracy on the given test set and labels")
    n = 0
    for i in range(len(TestPredict)):
        if TestLabel[i] == TestPredict[i]:
            n += 1
    accuracy = n / (len(TestPredict))
    print(accuracy)
    print("confusion matrix:")
X_misaligned = misaligned_blobs(samples=n_inliers, sd=cluster_sd)

## 6: Whole dataset
datasets3D = [X_lin, X_hex, X_sph, X_gau, X_misaligned]

# define the data labels: y_true
y_true = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)  # label 1 as inliers, -1 as outliers

# Define algorithms to be compared -------------------------------
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    (
        "One-Class SVM",
        svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma="scale"),
    ),
    (
        "Isolation Forest",
        IsolationForest(
            n_estimators=500,
            behaviour="new",
            contamination=outliers_fraction,
            random_state=42,
        ),
    ),
    (
        "Local Outlier Factor",
        LocalOutlierFactor(
            n_neighbors=35, contamination=outliers_fraction, novelty=False
        ),
def runDetection(outliers, inliers, X, outs, plot=True, outliersNb=10.):
    outliers_fraction = outliersNb / X.shape[0]
    rng = np.random.RandomState(69)
    clusters_separation = [0]  #, 1, 2]

    # the different anomaly detection tools
    classifiers = {
        "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction, kernel="rbf", gamma=0.1),
        "Isolation Forest": IsolationForest(n_estimators=500,
                                            max_samples='auto',
                                            bootstrap=False,
                                            contamination=outliers_fraction,
                                            random_state=rng)
    }
    if (plot):
        classifiers["Robust covariance"] = EllipticEnvelope(contamination=outliers_fraction)

    # Compare given classifiers under given settings
    xx, yy = np.meshgrid(np.linspace(-0.2, 1.3, 100), np.linspace(-0.2, 1.9, 100))

    # Fit the problem with varying cluster separation
    for i, offset in enumerate(clusters_separation):
        np.random.seed(69)
        # Fit the model
        plt.figure(figsize=(10.8, 3.6))
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            # fit the data and tag outliers
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
            y_pred = clf.predict(X)
            X_out_idx = np.where(y_pred == -1)[0]
            print(clf_name)
            #if (plot):
            print("True outliers :", outs)
            print("Outliers detected :", X_out_idx)

            # Compute the confusion matrix by hand
            FP = len(np.intersect1d(outs, X_out_idx))
            FN = len(X_out_idx) - FP
            V = X.shape[0] - len(X_out_idx)
            VN = len(outs) - FP
            VP = V - VN
            n_errors = (VN + FN)
            print("Confusion matrix")
            print(" _________________________________")
            print("| P\\R      Outliers     Inliers  |")
            print("| ------------------------------- |")
            print("| Outliers    ", FP, "        ", FN, "    |")
            print("| ------------------------------- |")
            print("| Inliers     ", VN, "       ", VP, "   |")
            print("|_________________________________|")

            if (plot):
                # plot the level lines and the points
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
                Z = Z.reshape(xx.shape)
                subplot = plt.subplot(1, 3, i + 1)
                subplot.contourf(xx, yy, Z,
                                 levels=np.linspace(Z.min(), threshold, 7),
                                 cmap=plt.cm.Blues_r)
                a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
                subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
                b = subplot.scatter(inliers[:, 0], inliers[:, 1], c='white')
                c = subplot.scatter(outliers[:, 0], outliers[:, 1], c='black')
                subplot.axis('tight')
                subplot.legend(
                    [a.collections[0], b, c],
                    ['learned decision function', 'true inliers', 'true outliers'],
                    prop=matplotlib.font_manager.FontProperties(size=11),
                    loc='upper left')
                subplot.set_title("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
                subplot.set_xlim((-0.2, 1.3))
                subplot.set_ylim((-0.2, 1.9))
        if (plot):
            plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)
    if (plot):
        plt.show()
def train(self, GridSearch=True, **kwargs):
    if self.data._X_train.ndim > 2:
        X_train_shape = self.data._X_train.shape
        X_train = self.data._X_train.reshape(X_train_shape[0], np.prod(X_train_shape[1:]))
    else:
        X_train = self.data._X_train

    print("Starting training...")
    self.start_clock()

    if self.loss == 'SVC':
        if self.kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
            self.get_kernel_matrix(kernel=self.kernel, which_set='train', **kwargs)
            self.svm.fit(self.K_train, self.data._y_train)
        else:
            self.svm.fit(X_train, self.data._y_train)

    if self.loss == 'OneClassSVM':
        if self.kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
            self.get_kernel_matrix(kernel=self.kernel, which_set='train', **kwargs)
            self.svm.fit(self.K_train)
        else:
            if GridSearch and self.kernel == 'rbf':
                # use grid search cross-validation to select gamma
                print("Using GridSearchCV for hyperparameter selection...")

                # sample a small hold-out set from the test set for hyperparameter
                # selection and save it as the validation set
                n_val_set = int(0.1 * self.data.n_test)
                n_test_out = 0
                n_test_norm = 0
                n_val_out = 0
                n_val_norm = 0
                while (n_test_out == 0) | (n_test_norm == 0) | (n_val_out == 0) | (n_val_norm == 0):
                    perm = np.random.permutation(self.data.n_test)
                    self.data._X_val = self.data._X_test[perm[:n_val_set]]
                    self.data._y_val = self.data._y_test[perm[:n_val_set]]
                    # only accept the split if AUC can be computed on both the val and test set
                    n_test_out = np.sum(self.data._y_test[perm[:n_val_set]])
                    n_test_norm = np.sum(self.data._y_test[perm[:n_val_set]] == 0)
                    n_val_out = np.sum(self.data._y_test[perm[n_val_set:]])
                    n_val_norm = np.sum(self.data._y_test[perm[n_val_set:]] == 0)
                self.data._X_test = self.data._X_test[perm[n_val_set:]]
                self.data._y_test = self.data._y_test[perm[n_val_set:]]
                self.data.n_val = len(self.data._y_val)
                self.data.n_test = len(self.data._y_test)

                self.diag['val']['scores'] = np.zeros((len(self.data._y_val), 1))
                self.diag['test']['scores'] = np.zeros((len(self.data._y_test), 1))

                cv_auc = 0.0
                cv_acc = 0
                for gamma in np.logspace(-10, -1, num=10, base=2):
                    # train with the selected gamma
                    self.cv_svm = svm.OneClassSVM(kernel='rbf', nu=Cfg.svm_nu, gamma=gamma)
                    self.cv_svm.fit(X_train)

                    # predict on the small hold-out set
                    self.predict(which_set='val')

                    # save the model if the AUC on the hold-out set improved
                    if self.diag['val']['auc'] > cv_auc:
                        self.svm = self.cv_svm
                        self.nu = Cfg.svm_nu
                        self.gamma = gamma
                        cv_auc = self.diag['val']['auc']
                        cv_acc = self.diag['val']['acc']

                # save the results of the best cv run
                self.diag['val']['auc'] = cv_auc
                self.diag['val']['acc'] = cv_acc
            else:
                # if rbf kernel, re-initialize the svm with the gamma minimizing
                # the numerical error
                if self.kernel == 'rbf':
                    gamma = 1 / (np.max(pairwise_distances(X_train)) ** 2)
                    self.svm = svm.OneClassSVM(kernel='rbf', nu=Cfg.svm_nu, gamma=gamma)

                self.svm.fit(X_train)
                self.nu = Cfg.svm_nu
                self.gamma = gamma

    self.stop_clock()
    self.train_time = self.clocked
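# The fallback branch above picks gamma = 1 / max_ij ||x_i - x_j||^2, which
# scales the RBF kernel so that even the two farthest training points keep a
# kernel value of exp(-1) ~ 0.37, avoiding numerically vanishing kernel
# entries. A tiny illustration of the heuristic (toy data, not from the
# original code):
import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0., 0.], [3., 4.]])                 # farthest pair is 5 apart
gamma = 1 / (np.max(pairwise_distances(X)) ** 2)   # 1 / 25 = 0.04
print(gamma, np.exp(-gamma * 25))                  # kernel between the farthest pair: e^-1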
# In[6]:

params = np.array(df.values[:, 1:], dtype="float64")
params = scale(params)

# In[7]:

X = PCA(n_components=2).fit_transform(params)
num = X.shape[0]
OUTLIER_FRACTION = 0.01

# In[ ]:

# In[8]:

clf = svm.OneClassSVM(kernel="rbf")
clf.fit(X)

# In[9]:

dist_to_border = clf.decision_function(X).ravel()
threshold = stats.scoreatpercentile(dist_to_border, 100 * OUTLIER_FRACTION)
is_inlier = dist_to_border > threshold

# In[10]:

xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - OUTLIER_FRACTION) * num)
n_outliers = int(OUTLIER_FRACTION * num)
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
def plot_species_distribution(species=["bradypus_variegatus_0", "microryzomys_minutus_0"]):
    """
    Plot the species distribution.
    """
    if len(species) > 2:
        print("Note: when more than two species are provided, only "
              "the first two will be used")

    t0 = time()

    # Load the compressed data
    data = fetch_species_distributions()

    # Set up the data grid
    xgrid, ygrid = construct_grids(data)

    # The grid in x,y coordinates
    X, Y = np.meshgrid(xgrid, ygrid[::-1])

    # create a bunch for each species
    BV_bunch = create_species_bunch(species[0], data.train, data.test,
                                    data.coverages, xgrid, ygrid)
    MM_bunch = create_species_bunch(species[1], data.train, data.test,
                                    data.coverages, xgrid, ygrid)

    # background points (grid coordinates) for evaluation
    np.random.seed(13)
    background_points = np.c_[
        np.random.randint(low=0, high=data.Ny, size=10000),
        np.random.randint(low=0, high=data.Nx, size=10000)].T

    # We'll make use of the fact that coverages[6] has measurements at all
    # land points. This will help us decide between land and water.
    land_reference = data.coverages[6]

    # Fit, predict, and plot for each species.
    for i, species in enumerate([BV_bunch, MM_bunch]):
        print("_" * 80)
        print("Modeling distribution of species '%s'" % species.name)

        # Standardize features
        mean = species.cov_train.mean(axis=0)
        std = species.cov_train.std(axis=0)
        train_cover_std = (species.cov_train - mean) / std

        # Fit OneClassSVM
        print(" - fit OneClassSVM ... ", end="")
        clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
        clf.fit(train_cover_std)
        print("done.")

        # Plot map of South America
        pl.subplot(1, 2, i + 1)
        if basemap:
            print(" - plot coastlines using basemap")
            m = Basemap(projection='cyl',
                        llcrnrlat=Y.min(), urcrnrlat=Y.max(),
                        llcrnrlon=X.min(), urcrnrlon=X.max(),
                        resolution='c')
            m.drawcoastlines()
            m.drawcountries()
        else:
            print(" - plot coastlines from coverage")
            pl.contour(X, Y, land_reference, levels=[-9999],
                       colors="k", linestyles="solid")
            pl.xticks([])
            pl.yticks([])

        print(" - predict species distribution")

        # Predict species distribution using the training data
        Z = np.ones((data.Ny, data.Nx), dtype=np.float64)

        # We'll predict only for the land points.
        idx = np.where(land_reference > -9999)
        coverages_land = data.coverages[:, idx[0], idx[1]].T

        pred = clf.decision_function((coverages_land - mean) / std)[:, 0]
        Z *= pred.min()
        Z[idx[0], idx[1]] = pred

        levels = np.linspace(Z.min(), Z.max(), 25)
        Z[land_reference == -9999] = -9999

        # plot contours of the prediction
        pl.contourf(X, Y, Z, levels=levels, cmap=pl.cm.Reds)
        pl.colorbar(format='%.2f')

        # scatter training/testing points
        pl.scatter(species.pts_train['dd long'], species.pts_train['dd lat'],
                   s=2**2, c='black', marker='^', label='train')
        pl.scatter(species.pts_test['dd long'], species.pts_test['dd lat'],
                   s=2**2, c='black', marker='x', label='test')
        pl.legend()
        pl.title(species.name)
        pl.axis('equal')

        # Compute AUC with respect to background points
        pred_background = Z[background_points[0], background_points[1]]
        pred_test = clf.decision_function((species.cov_test - mean) / std)[:, 0]
        scores = np.r_[pred_test, pred_background]
        y = np.r_[np.ones(pred_test.shape), np.zeros(pred_background.shape)]
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)
        pl.text(-35, -70, "AUC: %.3f" % roc_auc, ha="right")
        print("\n Area under the ROC curve : %f" % roc_auc)

    print("\ntime elapsed: %.2fs" % (time() - t0))
def test_oneclass_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf = svm.OneClassSVM(gamma=1).fit(X_train)
    assert_array_equal(clf.score_samples([[2., 2.]]),
                       clf.decision_function([[2., 2.]]) + clf.offset_)
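# The test above relies on the documented identity in scikit-learn:
# score_samples(X) == decision_function(X) + offset_, i.e. decision_function
# is the raw score shifted so that 0 marks the learned boundary. A quick
# illustration with the same toy data:
from sklearn import svm

clf = svm.OneClassSVM(gamma=1).fit([[1, 1], [1, 2], [2, 1]])
print(clf.score_samples([[2., 2.]]))                      # unshifted score
print(clf.decision_function([[2., 2.]]) + clf.offset_)    # identical value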
tX[d, int(w)] = float(cnts[n]) / total

# count the total number of anomalies
Danom = 0.0
for a0, anomlbl in enumerate(anomlist):
    a = [1 for x in lbllist if anomlbl in x]
    Danom += float(len(a))

nulist = np.arange(1e-5, 0.4, 0.05)
F1score = np.zeros(len(nulist))

fpres = open('results_indv.txt', 'w')
fpres.write('')
fpres.close()

for n1, nu in enumerate(nulist):
    # train svm
    clf = svm.OneClassSVM(nu=nu, kernel="linear")
    clf.fit(trX)

    # test svm
    #pred_test = clf.predict(tX)
    anom_score = clf.decision_function(tX)[:, 0]
    anom_sorted = np.argsort(anom_score)

    # compute recall and precision
    recall = np.zeros(TopN)
    precision = np.zeros(TopN)
    tp = 0.0
    for i, ind in enumerate(anom_sorted[0:TopN]):
        doclbl = lbllist[ind]
def main(args):
    path = '/media/joshua/Data/python_codes/fingerprinting/internship_experiments/AnomalyDetectionUsingAutoencoder-master/data/'  ## single/all
    #path = '/media/joshua/Data/python_codes/fingerprinting/internship_experiments/AnomalyDetectionUsingAutoencoder-master/ID_data/single/'
    intruder = [0]  # 4
    intr = [0]
    snr = '0_1db'

    X0, y0 = load_image(path + snr, args.size, args.comp_vector, args.cartesian,
                        args.window, args.trainProp)
    #X0, y0 = shuffle(X0, y0)
    #unique_elements, counts_elements = np.unique(y0, return_counts=True)
    #print(np.asarray((unique_elements, counts_elements)))
    [X_train, y_train], [X_val, y_val], [X_tes, y_tes] = data_split(X0, y0, args.trainProp, intruder)

    X_test = np.concatenate(
        (X_tes[np.where(np.in1d(y_tes, np.array(0)))[0]],
         X_tes[np.where(np.in1d(y_tes, np.array(1)))[0]],
         X_tes[np.where(np.in1d(y_tes, np.array(2)))[0]],
         X_tes[np.where(np.in1d(y_tes, np.array(3)))[0]],
         X_tes[np.where(np.in1d(y_tes, np.array(4)))[0]],
         X_tes[np.where(np.in1d(y_tes, np.array(5)))[0]]),
        axis=0)
    y_test = np.concatenate(
        (y_tes[np.where(np.in1d(y_tes, np.array(0)))[0]],
         y_tes[np.where(np.in1d(y_tes, np.array(1)))[0]],
         y_tes[np.where(np.in1d(y_tes, np.array(2)))[0]],
         y_tes[np.where(np.in1d(y_tes, np.array(3)))[0]],
         y_tes[np.where(np.in1d(y_tes, np.array(4)))[0]],
         y_tes[np.where(np.in1d(y_tes, np.array(5)))[0]]),
        axis=0)

    scaler = StandardScaler()
    if args.dim == 2:
        s0, s1, s2 = X_train.shape[0], X_train.shape[1], X_train.shape[2]
        X_train = X_train.reshape(s0 * s1, s2)
        X_train = scaler.fit_transform(X_train)
        X_train = X_train.reshape(s0, s1, s2)

        s0, s1, s2 = X_test.shape[0], X_test.shape[1], X_test.shape[2]
        X_test = X_test.reshape(s0 * s1, s2)
        X_test = scaler.transform(X_test)
        X_test = X_test.reshape(s0, s1, s2)

        s0, s1, s2 = X_val.shape[0], X_val.shape[1], X_val.shape[2]
        X_val = X_val.reshape(s0 * s1, s2)
        X_val = scaler.fit_transform(X_val)
        X_val = X_val.reshape(s0, s1, s2)
    elif args.dim == 1:
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.fit_transform(X_val)
        X_test = scaler.transform(X_test)

    print(X_train.min(), X_train.max(), X_test.min(), X_test.max())

    ## Take PCA to reduce feature space dimensionality
    ##pca = PCA(n_components=3, whiten=True)
    ##pca = pca.fit(X_train)
    ##print('Explained variance percentage = %0.2f' % sum(pca.explained_variance_ratio_))
    ##X_train = pca.transform(X_train)
    ##X_test = pca.transform(X_test)
    ###xval = pca.transform(xval)

    ## Train classifiers and obtain predictions for OC-SVM
    oc_svm_clf = svm.OneClassSVM(gamma=0.001, kernel='rbf', nu=0.08)  # Obtained using grid search
    if_clf = IsolationForest(contamination=0.08, max_features=1.0,
                             max_samples=0.4, n_estimators=40)  # Obtained using grid search

    oc_svm_clf.fit(X_train)
    if_clf.fit(X_train)
    oc_svm_preds = oc_svm_clf.predict(X_test)
    if_preds = if_clf.predict(X_test)

    # calculate accuracy metrics
    print("OC-SVM accuracy: ", accuracy_score(y_test, oc_svm_preds))
    print("IF accuracy: ", accuracy_score(y_test, if_preds))

    df = pd.DataFrame({'Labels': np.ravel(y_test), 'Clusters': np.ravel(oc_svm_preds)})
    df2 = pd.DataFrame({'Labels': np.ravel(y_test), 'Clusters': np.ravel(if_preds)})
    ct = pd.crosstab(df['Labels'], df['Clusters'])
    ct2 = pd.crosstab(df2['Labels'], df2['Clusters'])
    print(ct)
    print(ct2)
    print(classification_report(df['Clusters'], df['Labels'],
                                target_names=['anomaly', 'normal']))
    print(classification_report(df2['Clusters'], df2['Labels'],
                                target_names=['anomaly', 'normal']))

    conf_matrix = confusion_matrix(df.Clusters, df.Labels)
    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix,
                xticklabels=LABELS,
                yticklabels=LABELS,
                annot=True,
                fmt="d")
    plt.title("One-class SVM confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

    conf_matrix = confusion_matrix(df2.Clusters, df2.Labels)
    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix,
                xticklabels=LABELS,
                yticklabels=LABELS,
                annot=True,
                fmt="d")
    plt.title("Isolation Forest confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()
print(train_data.shape)
print(test_data.shape)

# train_target = np.ones([train_data.shape[0]])
test_target = np.append(np.ones([test_good_data.shape[0]], dtype=int),
                        -np.ones([test_bad_data.shape[0]], dtype=int))

best_nu = 0
best_gamma = 0
best_auc = 0
best_model = 0
for i in range(1, 20):
    for j in range(1, 20):
        nu = i * 0.01
        gamma = j * 0.01
        model = svm.OneClassSVM(nu=nu, kernel='rbf', gamma=gamma)
        model.fit(train_data)

        # values_preds = model.predict(train_data)
        # values_targs = train_target
        # f1_train = 100 * metrics.f1_score(values_targs, values_preds)

        values_preds = model.predict(test_data)
        values_targs = test_target
        auc_test = 100 * metrics.roc_auc_score(values_targs, values_preds)
        print("nu = %.2f, gamma = %.2f, auc = %.2f" % (nu, gamma, auc_test))
        if best_auc < auc_test:
            best_nu = nu
            best_gamma = gamma
            best_auc = auc_test
            best_model = model
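# After the grid search above, best_model holds the OneClassSVM with the
# highest test AUC. A hedged follow-up sketch that only reuses the variables
# defined above; nothing new is assumed beyond them:
final_preds = best_model.predict(test_data)
print("best nu = %.2f, best gamma = %.2f, best auc = %.2f"
      % (best_nu, best_gamma, best_auc))
print("outliers flagged on the test set:", (final_preds == -1).sum())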