class SVMDetector:
    # Only training() differs from the other detectors; everything else is unchanged.
    def __init__(self, subjects):
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects

    def training(self):
        self.clf = OneClassSVM(kernel='rbf', gamma=26)
        self.clf.fit(self.train)

    def testing(self):
        # Negate the decision scores so that higher means more anomalous.
        self.u_scores = list(-self.clf.decision_function(self.test_genuine))
        self.i_scores = list(-self.clf.decision_function(self.test_imposter))

    def evaluate(self):
        eers = []
        for subject in self.subjects:
            genuine_user_data = data.loc[data.subject == subject,
                                         "H.period":"UD.l.Return"]
            imposter_data = data.loc[data.subject != subject, :]
            # Train on the last genuine record only.
            self.train = genuine_user_data[-1:]
            self.test_genuine = genuine_user_data[:20]
            # First 20 records of every imposter subject.
            self.test_imposter = imposter_data.groupby("subject"). \
                head(20).loc[:, "H.period":"UD.l.Return"]
            self.training()
            self.testing()
            eers.append(evaluateEER(self.u_scores, self.i_scores))
            break  # evaluate only the first subject
        return np.mean(eers)
def odd_evaluate():
    X_test = RTM.y_sampler.X_test
    X_train = RTM.y_sampler.X_train
    label_test = RTM.y_sampler.label_test

    # One-class SVM
    clf = OneClassSVM(gamma='auto').fit(X_train)
    score_svm = clf.decision_function(X_test)  # lower means more abnormal
    pr_oneclassSVM = precision_at_K(score_svm, label_test)

    # Isolation Forest
    clf = IsolationForest()
    clf.fit(X_train)
    score_if = clf.decision_function(X_test)  # lower means more abnormal
    pr_iso_forest = precision_at_K(score_if, label_test)

    # Roundtrip
    py = RTM.estimate_py_with_IS(X_test, epoch, sd_y=best_sd, scale=best_scale,
                                 sample_size=sample_size, log=True, save=False)
    pr_Roundtrip = precision_at_K(py, label_test)

    print("The precision at K of Roundtrip model is %.4f" % pr_Roundtrip)
    print("The precision at K of One-class SVM is %.4f" % pr_oneclassSVM)
    print("The precision at K of Isolation forest is %.4f" % pr_iso_forest)
class SVMDetector(Detector):
    def training(self):
        self.clf = OneClassSVM(kernel='rbf', gamma=26)
        self.clf.fit(self.train)

    def testing(self):
        self.user_scores = list(-self.clf.decision_function(self.test_genuine))
        self.imposter_scores = list(-self.clf.decision_function(self.test_imposter))
def get_predictions(X, y, ktype):
    '''Use a one-class SVM to get irregularity predictions from the inputs.

    Parameters:
        X : DataFrame
            input features
        y : Series
            labels
        ktype : str
            SVM kernel type

    Returns:
        result : DataFrame
            containing the centre, its predicted label using the built-in
            decision function, the predictive score, and the true label
    '''
    svm = OneClassSVM(kernel=ktype, gamma='auto').fit(X)
    y_pred = svm.predict(X)
    y_pred = ((1 - y_pred) / 2).astype(int)  # map {1, -1} to {0, 1}
    y_score = -svm.decision_function(X)
    result = pd.DataFrame({
        'centre': X.index,
        'pred': y_pred,
        'score': y_score,
        'anomalous': y
    })
    return result
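# Hedged usage sketch (not from the original source; the data and column
# names are made up): exercising get_predictions on a small DataFrame.
# 'pred' is 1 where the SVM flags an outlier; 'score' is higher for more
# anomalous rows.
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randn(100, 3), columns=['f1', 'f2', 'f3'])
X.iloc[:5] += 8                        # plant five obvious outliers
y = pd.Series([1] * 5 + [0] * 95)      # true labels: 1 = anomalous
result = get_predictions(X, y, 'rbf')
print(result.head())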
class OCSVM():
    def __init__(self, kernel='rbf', nu=.01, gamma=.01):
        self.svm = OneClassSVM(nu=nu, gamma=gamma, kernel=kernel)

    def predict(self, X):
        X_prime = self.mapping(X)
        return self.svm.predict(X_prime)

    def fit(self, X):
        X_prime = self.mapping(X)
        return self.svm.fit(X_prime)

    def decision_function(self, X):
        X_prime = self.mapping(X)
        return self.svm.decision_function(X_prime)

    def mapping(self, X):
        X = np.array(X)
        clutter1_com = X[:, -3:]
        clutter2_com = X[:, -6:-3]
        obj_com = X[:, -12:-9]
        gripper_com = X[:, -15:-12]
        X_prime = np.hstack((clutter1_com, clutter2_com, obj_com, gripper_com))
        return X_prime
def _raw_ocsvm_experiment(dataset_load_fn, dataset_name, single_class_ind):
    (x_train, y_train), (x_test, y_test) = dataset_load_fn()
    x_train = x_train.reshape((len(x_train), -1))
    x_test = x_test.reshape((len(x_test), -1))

    x_train_task = x_train[y_train.flatten() == single_class_ind]
    if dataset_name in ['cats-vs-dogs']:
        # OC-SVM is quadratic in the number of examples, so subsample the training set
        subsample_inds = np.random.choice(len(x_train_task), 5000, replace=False)
        x_train_task = x_train_task[subsample_inds]

    pg = ParameterGrid({'nu': np.linspace(0.1, 0.9, num=9),
                        'gamma': np.logspace(-7, 2, num=10, base=2)})
    results = Parallel(n_jobs=6)(
        delayed(_train_ocsvm_and_score)(d, x_train_task,
                                        y_test.flatten() == single_class_ind,
                                        x_test)
        for d in pg)

    best_params, best_auc_score = max(zip(pg, results), key=lambda t: t[-1])
    best_ocsvm = OneClassSVM(**best_params).fit(x_train_task)
    scores = best_ocsvm.decision_function(x_test)
    labels = y_test.flatten() == single_class_ind

    res_file_name = '{}_raw-oc-svm_{}_{}.npz'.format(
        dataset_name,
        get_class_name_from_index(single_class_ind, dataset_name),
        datetime.now().strftime('%Y-%m-%d-%H%M'))
    res_file_path = os.path.join(RESULTS_DIR, dataset_name, res_file_name)
    save_roc_pr_curve_data(scores, labels, res_file_path)
def test_score_samples_estimators():
    """Check the values of score_samples methods derived from sklearn.

    Check that the values are the same as sklearn's decision_function
    methods. This only concerns OCSVM and IsolationForest.
    """
    X = np.random.randn(50, 2)

    clf1 = IsolationForest(random_state=88)
    clf1.fit(X)
    clf2 = ensemble.IsolationForest(random_state=88)
    clf2.fit(X)
    assert_array_equal(clf1.score_samples(X), clf2.decision_function(X))

    nu = 0.4
    sigma = 3.0
    gamma = 1. / (2. * sigma ** 2)
    clf1 = OCSVM(sigma=sigma, nu=nu)
    clf1.fit(X)
    clf2 = OneClassSVM(gamma=gamma, nu=nu)
    clf2.fit(X)
    assert_array_equal(clf1.score_samples(X),
                       clf2.decision_function(X).ravel())
def dist_ocsvm(X_train, X_test, gamma=0.1):
    """
    Calculation of data density by OCSVM

    Parameters
    ----------
    X_train : array-like, shape = [n_samples, n_features]
        X training data
    X_test : array-like, shape = [n_samples, n_features]
        X test data
    gamma : float
        RBF kernel coefficient

    Returns
    -------
    dens : array-like, shape = [n_samples]
        data density calculated by OCSVM
    """
    clf = OneClassSVM(nu=0.003, kernel="rbf", gamma=gamma)
    clf.fit(X_train)
    func = clf.decision_function(X_test).ravel()
    dens = abs(func - max(func))
    # Normalization: dens in [0, 1]
    dens = dens / max(dens)
    return dens
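# Hedged usage sketch (synthetic data, not from the original source): the
# most typical test point maps to 0 and the most atypical to 1 under the
# normalization above, so despite the name, larger values mark less typical
# points.
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_train = rng.randn(200, 2)
X_test = np.vstack([rng.randn(10, 2), 10 + rng.randn(5, 2)])  # 5 far outliers
dens = dist_ocsvm(X_train, X_test, gamma=0.1)
print(dens.round(2))  # the last five values should sit near 1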
class OCSVM():
    def __init__(self, kernel='rbf', nu=.01, gamma=.01):
        self.svm = OneClassSVM(nu=nu, gamma=gamma, kernel=kernel)

    def predict(self, X):
        X_prime = self.mapping(X)
        return self.svm.predict(X_prime)

    def fit(self, X):
        X_prime = self.mapping(X)
        return self.svm.fit(X_prime)

    def decision_function(self, X):
        X_prime = self.mapping(X)
        return self.svm.decision_function(X_prime)

    def mapping(self, X):
        X = np.array(X)
        clutter1_com = X[:, -3:]
        clutter2_com = X[:, -6:-3]
        obj_com = X[:, -12:-9]
        gripper_com = X[:, -15:-12]
        diff1 = clutter1_com - obj_com
        diff2 = clutter2_com - obj_com
        norm1 = np.linalg.norm(diff1, axis=1)
        norm2 = np.linalg.norm(diff2, axis=1)
        X_prime = np.array([norm1, norm2]).T
        X_prime = np.hstack((X_prime, gripper_com))
        return X_prime
class OCSVM(AnomalyDetector):
    """
    Anomaly detector based on one-class SVM
    """

    def __init__(self, kernel="rbf"):
        self._model = OneClassSVM(gamma='scale', kernel=kernel)
        # TODO: also try gamma="auto"

    def learn(self, data):
        self._model.fit(data)

    def get_score(self, data, epoch=None):
        assert len(data) == 1, "len(data) = " + str(len(data))
        return self._model.decision_function(data)

    def anomalies_have_high_score(self):
        return True

    def predict(self, data, obs):
        return self._model.predict(obs) == -1

    def get_memory_size(self):
        return 0

    def save(self, filename):
        joblib.dump(self._model, filename)

    def load(self, filename):
        self._model = joblib.load(filename)
def support_vectors(self, X, n, **kwargs):
    model = OneClassSVM(gamma=X.shape[1], nu=1 / X.shape[0])
    model.fit(X)
    sv = model.support_vectors_
    distance_from_hyperplane = model.decision_function(sv).reshape(-1)
    # Keep the n support vectors closest to the separating hyperplane.
    idx = np.argsort(np.abs(distance_from_hyperplane))[:n]
    return sv[idx, :]
def occ_onehot(pokemon, combats):
    training, labels = get_occ_feature_matrix_onehot(pokemon, combats)
    training = np.array(training)
    labels = np.array(labels)

    kf = KFold(n_splits=5)
    clf = OneClassSVM()
    all_scores = list()
    for train_index, test_index in kf.split(training):
        X_train, X_test = training[train_index], training[test_index]
        Y_test = np.ones(len(test_index))
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        # Min-max normalise the decision scores.
        prob_pos = clf.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
        y_pred = [0 if x == -1 else 1 for x in y_pred]
        Y_test = [int(x) for x in Y_test]
        scores = [
            precision_score(Y_test, y_pred),
            recall_score(Y_test, y_pred),
            f1_score(Y_test, y_pred)
        ]
        all_scores.append(scores)
    print(all_scores)
    pickle.dump(all_scores, open('scores_occ_onehot.pickle', 'wb'))
class OCSVM(object):
    def __init__(self, file_name, config):
        self.dataset = config.dataset
        self.file_name = file_name
        self.x_dim = config.x_dim
        self.kernel = config.kernel
        self.degree = config.degree
        self.gamma = config.gamma
        self.coef0 = config.coef0
        self.tol = config.tol
        self.nu = config.nu
        self.pid = config.pid
        self.model = OneClassSVM(kernel=self.kernel, degree=self.degree,
                                 gamma=self.gamma, coef0=self.coef0,
                                 tol=self.tol, nu=self.nu)

    def fit(self, train_input, train_label, test_input, test_label):
        # Fit on the training input and return labels for it:
        # -1 for outliers and 1 for inliers.
        y_pred = self.model.fit_predict(train_input)
        decision_function = self.model.decision_function(train_input)
        ocsvm_output = OCSVMOutput(y_hat=y_pred,
                                   decision_function=decision_function)
        return ocsvm_output
def ocsvm(self, X_train, kernel=None, gamma=None, nu=None):
    """
    Train OCSVM model from Sklearn

    Parameters
    __________
    X_train: scaled training data
    kernel: kernel function: 'linear', 'poly', 'rbf' or 'sigmoid'
    gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    nu: regularization parameter in [0, 1]

    Returns
    ________
    Anomaly scores and outlier labels
    """
    model = OCSVM(kernel=kernel, gamma=gamma, nu=nu)
    model.fit(X_train)

    # Outlier labels (-1 or 1), rescaled to 1: outliers, 0: inliers.
    labels = model.predict(X_train)
    labels = (labels.max() - labels) // 2
    # Raw anomaly scores, negated so that higher means more anomalous.
    ocsvm_anomaly_scores = model.decision_function(X_train) * -1
    ocsvm_anomaly_scores = self.min_max_scaler(ocsvm_anomaly_scores)
    return ocsvm_anomaly_scores, labels
def oneClassSVM(dataset):
    classifier = OneClassSVM(nu=outlierFraction, gamma=0.03)
    classifier.fit(dataset)
    # ravel() flattens the (n, 1) scores returned by older sklearn versions.
    predScore = classifier.decision_function(dataset).ravel()
    pred = classifier.predict(dataset)
    outlierRows = [i for i in range(len(pred)) if pred[i] == -1]
    return predScore, outlierRows
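# Hedged usage sketch: outlierFraction is a module-level constant in the
# original snippet; the value below is an assumption for the demo.
import numpy as np
from sklearn.svm import OneClassSVM

outlierFraction = 0.05
dataset = np.random.RandomState(3).randn(300, 4)
predScore, outlierRows = oneClassSVM(dataset)
print(len(outlierRows), "rows flagged as outliers")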
def _cae_ocsvm_experiment(dataset_load_fn, dataset_name, single_class_ind, gpu_q):
    # gpu_to_use = gpu_q.get()
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu_to_use

    (x_train, y_train), (x_test, y_test) = dataset_load_fn()
    print('data_shape', x_train.shape)
    n_channels = x_train.shape[get_channels_axis()]
    input_side = x_train.shape[2]  # channel side will always be at shape[2]
    enc = conv_encoder(input_side, n_channels)
    dec = conv_decoder(input_side, n_channels)

    x_in = Input(shape=x_train.shape[1:])
    x_rec = dec(enc(x_in))
    cae = Model(x_in, x_rec)
    cae.compile('adam', 'mse')

    x_train_task = x_train[y_train.flatten() == single_class_ind]
    # Held out only for visual monitoring of the reconstruction loss.
    x_test_task = x_test[y_test.flatten() == single_class_ind]
    cae.fit(x=x_train_task, y=x_train_task, batch_size=128, epochs=200,
            validation_data=(x_test_task, x_test_task))

    x_train_task_rep = enc.predict(x_train_task, batch_size=128)
    if dataset_name in LARGE_DATASET_NAMES:
        # OC-SVM is quadratic in the number of examples, so subsample the training set
        subsample_inds = np.random.choice(len(x_train_task_rep), 2500, replace=False)
        x_train_task_rep_temp = x_train_task_rep[subsample_inds]
    else:
        x_train_task_rep_temp = x_train_task_rep

    x_test_rep = enc.predict(x_test, batch_size=128)
    pg = ParameterGrid({
        'nu': np.linspace(0.1, 0.9, num=9),
        'gamma': np.logspace(-7, 2, num=10, base=2)
    })
    results = Parallel(n_jobs=PARALLEL_N_JOBS)(
        delayed(_train_ocsvm_and_score)(d, x_train_task_rep_temp,
                                        y_test.flatten() == single_class_ind,
                                        x_test_rep)
        for d in pg)

    best_params, best_auc_score = max(zip(pg, results), key=lambda t: t[-1])
    print(best_params)
    best_ocsvm = OneClassSVM(**best_params).fit(x_train_task_rep)
    scores = best_ocsvm.decision_function(x_test_rep)
    labels = y_test.flatten() == single_class_ind

    res_file_name = '{}_cae-oc-svm_{}_{}.npz'.format(
        dataset_name,
        get_class_name_from_index(single_class_ind, dataset_name),
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M'))
    res_file_path = os.path.join(RESULTS_DIR, dataset_name, res_file_name)
    save_roc_pr_curve_data(scores, labels, res_file_path)
def find_anomaly(label1, label2, winsize):
    print("Find anomaly in channel", label1 + '-' + label2 + '...', file=sys.stderr)
    print("-" * 80)
    print("Channel [" + label1 + '-' + label2 + ']')
    print("-" * 80)

    # find difference
    electrode1 = eeg.chan_lab.index(label1)
    electrode2 = eeg.chan_lab.index(label2)
    wave = eeg.X[electrode1] - eeg.X[electrode2]

    print("Splitting into windows...", file=sys.stderr)
    wave_windows = np.array_split(wave, int(len(wave) / eeg.sample_rate / winsize))

    print("Extracting features...", file=sys.stderr)

    def extract_features(wave_window):
        max_val = max(wave_window)
        min_val = min(wave_window)
        stdev = np.std(wave_window)
        sum_val = sum(wave_window)
        sum_pos_val = sum([x for x in wave_window if x > 0])
        sum_abs_val = sum([abs(x) for x in wave_window])
        return [max_val, min_val, stdev, sum_val, sum_pos_val, sum_abs_val]

    Examples = np.array([extract_features(w) for w in wave_windows])

    print("Training model, assuming no more than", CONTAMINATION, "anomaly...",
          file=sys.stderr)
    od = OneClassSVM(nu=CONTAMINATION, kernel='poly', gamma=0.05, max_iter=100000)
    od.fit(Examples)
    decisions = od.decision_function(Examples).ravel()

    print("Most likely windows with anomaly:")
    # find most likely windows, in descending order
    largest_indices = np.argsort(-np.absolute(decisions))[:20]
    for large_index in largest_indices:
        print(large_index * winsize / 60, "min (score:", decisions[large_index], ")")
    sys.stdout.flush()
def ocsvm(features_train, features_test):
    # One-Class Support Vector Machine
    # fit the model
    ocsvm = OneClassSVM().fit(features_train)

    # predict
    start = time.time()
    anomalyScores = ocsvm.decision_function(features_test)
    test_runtime = time.time() - start
    return anomalyScores, test_runtime
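# Hedged usage sketch (synthetic data, not from the original project): the
# returned scores follow sklearn's convention, so LOWER values are MORE
# anomalous here, since this variant does not negate decision_function.
import time
import numpy as np
from sklearn.svm import OneClassSVM

train = np.random.RandomState(0).randn(500, 8)
test = np.vstack([train[:50], 8 + np.random.RandomState(1).randn(10, 8)])
scores, runtime = ocsvm(train, test)
print(scores[-10:].mean() < scores[:50].mean())  # True: outliers score lower
print("scored %d samples in %.4fs" % (len(test), runtime))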
def main():
    print('------------01')
    iris = load_iris()
    pca = PCA(n_components=2)
    data = pca.fit_transform(iris.data)
    print(type(data))
    print(data)

    # nu sets the expected fraction of outliers; predict() returns
    # 1 for inliers and -1 for outliers.
    ocsvm = OneClassSVM(nu=0.1, gamma="auto")
    ocsvm.fit(data)
    preds = ocsvm.predict(data)
    print(preds)
    plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu)
    plt.show()

    print('------------02A')
    x = np.linspace(-5, 5, 500)
    y = np.linspace(-1.5, 1.5, 250)
    X, Y = np.meshgrid(x, y)
    print('X.ravel():')
    print(X.ravel())
    print(X.shape)                        # (250, 500)
    print(Y.shape)                        # (250, 500)
    z1 = np.array([X.ravel(), Y.ravel()])
    print(z1.shape)                       # (2, 125000)
    z2 = ocsvm.decision_function(np.array([X.ravel(), Y.ravel()]).T)
    print(z2.shape)                       # (125000,)
    print(z2.reshape(X.shape).shape)      # (250, 500)

    df = ocsvm.decision_function(np.array([X.ravel(), Y.ravel()]).T).reshape(X.shape)
    plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu, alpha=0.8)
    r = max([abs(df.min()), abs(df.max())])

    print('------------02B')
    print(df.min())
    print(max([abs(df.min()), abs(df.max())]))
    print(df)
    plt.contourf(X, Y, df, 10, vmin=-r, vmax=r, cmap=plt.cm.RdBu, alpha=.5)
    plt.show()
def __generate_probas(self, samples, resolution, affinity_matrix, number_of_questions):
    print(
        f"📞 Looks like there's a probability distribution ({self.name}) that wants "
        f"to phone in an expert (that's you)\n"
    )
    clf = OneClassSVM(kernel='precomputed')
    samples_and_weights = {0: 0.5}
    for nq in range(number_of_questions):
        indices = list(samples_and_weights.keys())
        if nq == 0:
            idx = np.random.choice(range(1, len(samples)))
        else:
            preds = clf.decision_function(affinity_matrix[:, indices])
            # Ask about the lowest-scoring sample not yet labelled.
            idx = [i for i, _ in sorted(enumerate(preds), key=lambda x: x[1])
                   if i not in samples_and_weights][0]
        sample = samples[idx]

        print('Score the sample below with a number between 0 and 1 (higher is better)\n')
        print(sample)
        weight = float(input('Score: '))
        assert 0 <= weight <= 1
        samples_and_weights[idx] = weight

        indices = list(samples_and_weights.keys())
        clf.fit(
            affinity_matrix[indices, :][:, indices],
            sample_weight=list(samples_and_weights.values())
        )

    indices = list(samples_and_weights.keys())
    preds = clf.decision_function(affinity_matrix[:, indices])
    scores = KernelDiscretizedMethod.discretized_scores(
        resolution, samples, affinity_matrix,
        lambda mask, _idx: preds[mask].mean())
    Z = logsumexp([s for s in scores.values()])
    return {idx: s - Z for idx, s in scores.items()}
def run(opt):
    output = {}
    cname = opt.cname
    datatrain, test_loader = get_loader(opt, classname=opt.cname)
    model = create_model(opt)
    opt.load = False
    model = vgg_face_dag(weights_path='vgg_face_dag.pth')
    model.eval()

    # Extract features for the training set.
    f = []
    cnt = 0
    for data, lbl in datatrain:
        cnt += 1
        code, c = model(data)
        if cnt == 1:
            f = code.view(code.size(0), -1).detach().cpu().numpy().tolist()
        else:
            f += code.view(code.size(0), -1).detach().cpu().numpy().tolist()
    output['train'] = f

    # Extract features and labels for the test set.
    g = []
    tlbl = []
    cnt = 0
    for data, lbl in test_loader:
        cnt += 1
        code, c = model(data)
        if cnt == 1:
            tlbl = lbl.cpu().numpy().tolist()
            g = code.view(code.size(0), -1).detach().cpu().numpy().tolist()
        else:
            g += code.view(code.size(0), -1).detach().cpu().numpy().tolist()
            tlbl += lbl.cpu().numpy().tolist()
    output['test'] = g
    output['lbl'] = tlbl
    print(len(tlbl))
    print(len(g))

    clf = OneClassSVM(gamma='auto')
    clf.fit(output['train'])
    scores = clf.decision_function(output['test'])
    print(sklearn.metrics.roc_auc_score(output['lbl'], scores))
def one_class_svm_core(x_train, x_test, y_test, x_test_names, version=0):
    clf = OneClassSVM()
    print('svm begin...')
    start = time.time()
    clf.fit(x_train)
    joblib.dump(clf, 'one_class_model.m')
    print('begin compute', time.time() - start)
    y_distance = clf.decision_function(x_test)
    # Map distances into [0, 1] scores.
    y_score = (np.clip(y_distance, -1, 1) + 1) / 2
    print('svm complete..')
    show_result(y_score, y_test, x_test_names, version=version)
class OneClsSVM(AbstractDetector):
    name = "OneClassSVM"
    data_type = "REAL"

    def compute_scores(self, dataframe: pd.DataFrame, classes: np.array):
        bin_dataframe = dataframe._binarize_categorical_values()

        self.clf = OneClassSVM(**self.settings)
        self.clf.fit(bin_dataframe.values)
        # Negate so that higher scores mean more anomalous.
        self.values = -self.clf.decision_function(bin_dataframe.values)
        return self
def evaluate_features(train, test, labels, dataset, model=''):
    params = svm_parameters_dict(dataset)
    clf = OneClassSVM(kernel='rbf', gamma=params[1], nu=params[0], verbose=True)
    clf.fit(train)
    decision_f = clf.decision_function(test)
    new_decision_f = filter_scores(decision_f, dataset, params[2])
    _auc = calc_auc(labels, new_decision_f)
    print("Area under ROC: ", _auc)
def runClassifier(self, _driverId, numComponents=0):
    X = list(self.featuresHash.values())
    self.ids = list(self.featuresHash.keys())
    if self.runDimRed:
        X = self.dimRed(X, numComponents)

    clf = OCSVM(nu=self.nu, gamma=self.gamma)
    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * self.outliers_fraction)
    self.label = y_pred > threshold
    self.label = list(map(int, self.label))
def SVM(person):
    # Treat the current subject as genuine and all others as impostors.
    realUser_data = data.loc[data.subject == person, "H.period":"H.Return"]
    if len(realUser_data) == 0:
        print("No data found for the given user")
        return 0

    real_train = np.array((realUser_data[:200]).values)
    # Genuine test set (records after the first 200).
    real_test = np.array((realUser_data[200:]).values)

    fakeUser_data = data.loc[data.subject != person, :]
    # Impostor set (250 records: 5 per impostor, 50 impostors in all).
    fake_test = np.array(
        (fakeUser_data.groupby("subject").head(5)
         .loc[:, "H.period":"H.Return"]).values)

    clf = OneClassSVM(kernel='rbf', gamma=26)
    clf.fit(real_train)

    # Negated decision scores: higher means more anomalous.
    realUser_scores = list(-clf.decision_function(real_test))
    fakeUser_scores = list(-clf.decision_function(fake_test))

    # True labels: 0 for genuine, 1 for impostor.
    labels = [0] * len(realUser_scores) + [1] * len(fakeUser_scores)
    Equal_err_rate = Calc_equal_err_rate(realUser_scores, fakeUser_scores, labels)
    print("Equal err rate:: ", Equal_err_rate)
    return Equal_err_rate
def SVM():
    eers = []
    false_negative = 0.0
    false_positive = 0.0
    for subject in subjects:
        imposter_data = data.loc[data.subject != subject, :]
        train = data.loc[data.subject == subject,
                         "H.period":"H.Return"][:200].values
        test_genuine = data.loc[data.subject == subject,
                                "H.period":"H.Return"][200:].values
        imposter = imposter_data.groupby("subject").head(5) \
            .loc[:, "H.period":"H.Return"].values

        clf = OneClassSVM(kernel='rbf', gamma=26)
        clf.fit(train)
        user_scores = list(-clf.decision_function(test_genuine))
        imposter_scores = list(-clf.decision_function(imposter))

        standard_deviation = np.std(user_scores)
        mean_standard = np.mean(user_scores)
        # Genuine scores outside one standard deviation count as false positives.
        for score in user_scores:
            if score > mean_standard + standard_deviation or \
                    score < mean_standard - standard_deviation:
                false_positive += 1
        # Imposter scores inside one standard deviation count as false negatives.
        for score in imposter_scores:
            if mean_standard - standard_deviation < score < mean_standard + standard_deviation:
                false_negative += 1
        eers.append(Calc_equal_err_rate(user_scores, imposter_scores))
    return np.mean(eers), false_positive / (51 * 200), false_negative / (51 * 250)
def ocsvm(X, y, percentage=None, params={}, sh_params={}):
    normalizer = Normalizer(norm="l1")
    X = normalizer.fit_transform(X)

    cf = OneClassSVM(**params)
    cf.fit(X)
    # Negate so that higher scores mean more anomalous.
    anomalyscores = -cf.decision_function(X)

    ap = average_precision_score(y, anomalyscores)
    auc = roc_auc_score(y, anomalyscores)
    return ap, auc
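# Hedged usage sketch for the ocsvm variant above (synthetic data; the
# parameter values passed via params are assumptions): y uses 1 for
# anomalies, matching average_precision_score / roc_auc_score.
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.svm import OneClassSVM
from sklearn.metrics import average_precision_score, roc_auc_score

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(190, 5), 6 + rng.randn(10, 5)])
y = np.array([0] * 190 + [1] * 10)
ap, auc = ocsvm(X, y, params={'nu': 0.1, 'gamma': 'scale'})
print("AP=%.3f AUC=%.3f" % (ap, auc))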
def embed_dat_matrix_two_dimensions(low_dimension_data_matrix,
                                    y=None,
                                    labels=None,
                                    density_colormap='Blues',
                                    instance_colormap='YlOrRd'):
    from sklearn.preprocessing import scale
    low_dimension_data_matrix = scale(low_dimension_data_matrix)

    # make mesh
    x_min, x_max = low_dimension_data_matrix[:, 0].min(), low_dimension_data_matrix[:, 0].max()
    y_min, y_max = low_dimension_data_matrix[:, 1].min(), low_dimension_data_matrix[:, 1].max()
    step_num = 50
    h = min((x_max - x_min) / step_num, (y_max - y_min) / step_num)  # step size in the mesh
    b = h * 10  # border size
    x_min, x_max = x_min - b, x_max + b
    y_min, y_max = y_min - b, y_max + b
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # induce a one-class model to estimate densities
    from sklearn.svm import OneClassSVM
    gamma = max(x_max - x_min, y_max - y_min)
    clf = OneClassSVM(gamma=gamma, nu=0.1)
    clf.fit(low_dimension_data_matrix)

    # Plot the decision boundary: assign a color to each point
    # in the mesh [x_min, x_max] x [y_min, y_max].
    if hasattr(clf, "decision_function"):
        score_matrix = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        score_matrix = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

    # Put the result into a color plot
    levels = np.linspace(min(score_matrix), max(score_matrix), 40)
    score_matrix = score_matrix.reshape(xx.shape)
    if y is None:
        y = 'white'
    plt.contourf(xx, yy, score_matrix, cmap=plt.get_cmap(density_colormap),
                 alpha=0.9, levels=levels)
    plt.scatter(low_dimension_data_matrix[:, 0], low_dimension_data_matrix[:, 1],
                alpha=.5, s=70, edgecolors='gray',
                c=y, cmap=plt.get_cmap(instance_colormap))
    # labels
    if labels is not None:
        for id in range(low_dimension_data_matrix.shape[0]):
            label = labels[id]
            x = low_dimension_data_matrix[id, 0]
            y = low_dimension_data_matrix[id, 1]
            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
def SVM_score(S):
    X = np.array(S)
    clf = OneClassSVM(kernel='linear')
    clf.fit(X)
    # Negate so that higher scores mean more anomalous.
    scores = -clf.decision_function(X)
    return scores
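# Hedged usage sketch (synthetic data): after the negation inside SVM_score,
# larger values mean more anomalous, so the planted outlier should rank first.
import numpy as np
from sklearn.svm import OneClassSVM

S = np.vstack([np.random.RandomState(0).randn(50, 2),
               [[9.0, 9.0]]])  # one planted outlier at index 50
scores = SVM_score(S)
print(np.argmax(scores))  # very likely 50, the planted outlier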
def remove_outliers_SVM(self):
    ## Remove outliers using a one-class SVM
    print("Running SVM to remove outliers...")
    svm = OneClassSVM(kernel='rbf', nu=0.1, degree=3, verbose=1)
    svm.fit(self.DataArray)
    decision = svm.decision_function(self.DataArray)
    _indices = []
    # If a value is below the decision hyperplane, eliminate it
    for i in range(len(decision)):
        if decision[i] < 0:
            _indices.append(i)
    print(self.DataArray.shape)
    self.DataArray = np.delete(self.DataArray, _indices, axis=0)
    self.TargetArray = np.delete(self.TargetArray, _indices, axis=0)
    print(self.DataArray.shape)
def predict_header_features(self, pkt_featurizer):
    group_id = pkt_featurizer.pkt_type
    features = pkt_featurizer.features
    arrival_time = pkt_featurizer.arrival_time
    try:
        vectorizer = DictVectorizer()
        vectorizer.fit(self.training_data[group_id])
        training_data_vectorized = vectorizer.transform(self.training_data[group_id])
        features_vectorized = vectorizer.transform(features)

        scaler = preprocessing.StandardScaler(with_mean=False)
        training_data_vectorized = scaler.fit_transform(training_data_vectorized)
        features_vectorized = scaler.transform(features_vectorized)

        classifier = OneClassSVM()
        classifier.fit(training_data_vectorized)
        result = classifier.predict(features_vectorized)
        distance = classifier.decision_function(features_vectorized)
    except KeyError:
        result = 0
        distance = 0
    return result, distance
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM

df = pd.read_csv('kddcup_for_elki_100000.csv', header=None, index_col=False)
labelix = df.shape[1] - 1
labels = df[labelix]
df = df.drop(labelix, axis=1)

svm = OneClassSVM(kernel='rbf', gamma=1.0 / df.shape[0], tol=0.001,
                  nu=0.5, shrinking=True, cache_size=80)
svm = svm.fit(df.values)

scores = svm.decision_function(df.values).flatten()
maxvalue = np.max(scores)
scores = maxvalue - scores  # invert so that higher means more anomalous

output = pd.DataFrame()
# perform reverse sort
sort_ix = np.argsort(scores)[::-1]
output['labels'] = labels[sort_ix]
output['outlier_scores'] = scores[sort_ix]
output.to_csv('outlier_scores.csv', header=None, index=None)
unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

# fit:
print('IsolationForest processing...')
iforest = IsolationForest()
iforest.fit(X_train)
s_X_iforest = iforest.decision_function(X_test)

print('LocalOutlierFactor processing...')
# novelty=True is required for decision_function on unseen data.
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
s_X_lof = lof.decision_function(X_test)

print('OneClassSVM processing...')
ocsvm = OneClassSVM()
ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]

s_unif_iforest = iforest.decision_function(unif)
s_unif_lof = lof.decision_function(unif)
s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]

plt.subplot(121)
auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support,
                                           s_unif_iforest, s_X_iforest,
                                           n_generated)
auc_lof, em_lof, amax_lof = em(t, t_max, volume_support,
                               s_unif_lof, s_X_lof, n_generated)
auc_ocsvm, em_ocsvm, amax_ocsvm = em(t, t_max, volume_support,
                                     s_unif_ocsvm, s_X_ocsvm, n_generated)
def select_candidates(X, h, objective_function, verbose=False,
                      cov_computation_method=empirical_covariance):
    """Finds the best pure subset of observations to compute MCD from it.

    The purpose of this function is to find the best set of h observations
    with respect to a minimization of their covariance matrix determinant.
    Equivalently, it removes n_samples - h observations to construct what
    we call a pure data set (i.e. not containing outliers). The list of the
    observations of the pure data set is referred to as the `support`.

    Starting from a support estimated with a one-class SVM, the pure data
    set is found by the c_step procedure introduced by Rousseeuw and
    Van Driessen in [1].

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data (sub)set in which we look for the h purest observations
    h : int, [(n + p + 1)/2] < h < n
        The number of samples the pure data set must contain.

    See
    ---
    `c_step` function

    Returns
    -------
    best_location : array-like, shape (n_features,)
        The location estimate computed from the best support found in the
        data set (`X`)
    best_covariance : array-like, shape (n_features, n_features)
        The covariance estimate computed from the best support
    best_support : array-like, shape (n_samples,)
        The best support found in the data set (`X`)

    Notes
    -----
    References:
    [1] A Fast Algorithm for the Minimum Covariance Determinant Estimator,
        1999, American Statistical Association and the American Society
        for Quality, TECHNOMETRICS
    """
    n_samples, n_features = X.shape

    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.svm import OneClassSVM
    pairwise_distances = np.ravel(euclidean_distances(X))
    delta = sp.stats.scoreatpercentile(pairwise_distances, 10)
    gamma = 0.01 / delta
    clf = OneClassSVM(kernel='rbf', gamma=gamma)
    clf.fit(X)
    # Select half of the samples according to their one-class SVM scores.
    in_support = np.argsort(-np.ravel(clf.decision_function(X)))[-(n_samples // 2):]
    support = np.zeros(n_samples, dtype=bool)
    support[in_support] = True

    location = X[support].mean(0)
    covariance = cov_computation_method(X[support])
    initial_estimates = (location, covariance)
    best_location, best_covariance, _, best_support = c_step(
        X, h, objective_function, initial_estimates, verbose=verbose,
        cov_computation_method=cov_computation_method)
    return best_location, best_covariance, best_support
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

# # training only on normal data:
# X_train = X_train[y_train == 0]
# y_train = y_train[y_train == 0]

print('OneClassSVM processing...')
model = OneClassSVM(cache_size=500)
tstart = time()
model.fit(X_train)
fit_time += time() - tstart

tstart = time()
scoring = -model.decision_function(X_test)  # the lower, the more normal
predict_time += time() - tstart
fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

if fit_time + predict_time > max_time:
    raise TimeoutError

f = interp1d(fpr_, tpr_)
tpr += f(x_axis)
tpr[0] = 0.

precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

# cluster: old versions of scipy -> interp1d needs a sorted x input
arg_sorted = recall_.argsort()
recall_ = recall_[arg_sorted]
def decision_function(self, data):
    # Flip the sign of the parent's scores so that higher means more anomalous.
    return -OneClassSVM.decision_function(self, data)
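# Hedged sketch (the class and data names are assumptions, not from the
# original source): a subclass using the override above inverts sklearn's
# convention, so sorting descending by score ranks the most anomalous
# samples first.
import numpy as np
from sklearn.svm import OneClassSVM

class InvertedOCSVM(OneClassSVM):
    def decision_function(self, data):
        return -OneClassSVM.decision_function(self, data)

X = np.random.RandomState(0).randn(100, 2)
clf = InvertedOCSVM(nu=0.1, gamma='scale').fit(X)
top5 = np.argsort(clf.decision_function(X))[::-1][:5]  # five most anomalous
print(top5)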
def main():
    usage = "refine2d using simmx information"
    parser = EMArgumentParser(usage=usage, version=EMANVERSION)
    parser.add_argument("--ptcls", type=str, help="particle file", default=None)
    parser.add_argument("--simmx", type=str, help="simmx", default=None)
    parser.add_argument("--npca", type=int, help="number of pca factors", default=10)
    parser.add_argument("--niter", type=int, help="number of iterations", default=5)
    parser.add_argument("--outlier", type=float, help="outlier fraction", default=0.1)
    parser.add_argument("--ncls", type=int, help="number of centers", default=128)
    parser.add_argument("--nref", type=int, help="number of references", default=32)
    (options, args) = parser.parse_args()
    logid = E2init(sys.argv)

    simmxfile = options.simmx
    for itr in range(options.niter):
        ### start from the simmx
        print("Pre-processing simmx")
        e = EMData(simmxfile)
        pts = e.numpy().T.copy()
        for i in range(len(pts)):
            pts[i] -= np.mean(pts[i])
            pts[i] /= np.std(pts[i])
        pts = pts.astype(float).copy()

        print("Doing PCA")
        (nptcl, ncls) = pts.shape
        pca = PCA(options.npca)
        pts_pca = pca.fit_transform(pts)
        bs = pts_pca
        bs /= np.std(bs)
        print(bs.shape, pts.shape)
        np.savetxt("test_pca_{:02d}".format(itr), pts_pca)

        print("Removing outliers")
        outliers_fraction = options.outlier
        svm = OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                          kernel="rbf", gamma=0.1)
        svm.fit(bs)
        y_pred = svm.decision_function(bs).ravel()
        # Keep the (1 - outliers_fraction) highest-scoring particles.
        nkeep = int(len(bs) * (1 - outliers_fraction))
        st = np.argsort(y_pred)[::-1]
        st = st[:nkeep]

        print("Clustering")
        ncnt = options.ncls
        centroids, _ = kmeans(bs[st], ncnt)
        l, _ = vq(bs[st], centroids)
        labels = np.zeros(len(bs)) - 1
        labels[st] = l

        print("Class averaging")
        e = EMData(1, len(labels))
        for i in range(len(labels)):
            e.set_value_at(0, i, labels[i])
        clsmxfile = "clsmx_{:02d}.hdf".format(itr)
        e.write_image(clsmxfile)

        clsout = "classes_{:02d}.hdf".format(itr)
        run("e2classaverage.py --input={} --classmx={} --output={} --force --center xform.center --iter=5 --align=rotate_translate_flip:maxshift=32 --averager=mean --keep=.6 --cmp=ccc --aligncmp=ccc --normproc=normalize --parallel=thread:12".format(options.ptcls, clsmxfile, clsout))

        simmxfile = "simmx_{:02d}.hdf".format(itr)
        run("e2simmx.py {} {} {} --align rotate_translate_flip --aligncmp ccc --cmp ccc --saveali --parallel thread:12".format(options.ptcls, clsout, simmxfile))

    E2end(logid)