def cria_modelo(df_preprocessed, df_market, df_ptf):
    """Train a One-Class SVM on the portfolio rows and label the market rows.

    The model is fitted on the rows of ``df_preprocessed`` whose index appears
    in ``df_ptf.id``. Every row of ``df_preprocessed`` is then scored and
    compared with the 10% quantile of the training scores.

    Returns a tuple ``(df_treino, df_sim, lista_ids)``: the market rows
    labelled 'Treino', the market rows labelled 'Sim', and a DataFrame built
    from the index of the 'Sim' rows.
    """
    # Training data: pre-processed rows that belong to the portfolio.
    df_train = df_preprocessed[df_preprocessed.index.isin(df_ptf.id)]

    clf = OneClassSVM(gamma='auto')
    clf.fit(df_train)

    # Threshold: 10% quantile of the scores obtained on the training rows.
    treshold = np.quantile(clf.score_samples(df_train), 0.1)

    # Score the whole market and classify against the threshold.
    pred = clf.score_samples(df_preprocessed) >= treshold

    # NOTE(review): labels are matched by position, so this presumably relies
    # on df_preprocessed and df_market having their rows in the same order.
    portfolio = df_market.index.isin(df_ptf.id)
    sim_nao = ['Sim' if flag == True else 'Não' for flag in pred]
    label = [
        'Treino' if is_train == True else sim_nao[pos]
        for pos, is_train in enumerate(portfolio)
    ]

    df_market_labeled = df_market.copy()
    df_market_labeled.insert(8, 'recomendado', label)
    df_market_labeled.insert(9, 'contador', 1)

    recomendado = df_market_labeled['recomendado']
    df_treino = df_market_labeled[recomendado == 'Treino']
    df_sim = df_market_labeled[recomendado == 'Sim']
    return df_treino, df_sim, pd.DataFrame(df_sim.index)
def evaluate_authentication(filename, num_users):
    """Print the mean and std of per-user One-Class SVM authentication AUCs.

    The CSV at ``filename`` is read with the last column holding the user id
    ('u001', 'u002', ...). For each of the ``num_users`` ids, a One-Class SVM
    is fitted on the first 66% of that user's rows. It is then scored on the
    user's remaining rows (positives) and on all other users' rows
    (negatives). The AUC for each user comes from ``compute_fpr_tpr``.

    Returns None; the summary is printed.
    """
    df = pd.read_csv(filename)
    userids = ['u%03d' % i for i in range(1, num_users + 1)]
    scaler = MinMaxScaler()
    auc_list = []

    for userid in userids:
        is_user = df.iloc[:, -1].isin([userid])

        # Feature rows of the current user (label column dropped).
        user_array = df.loc[is_user].drop(df.columns[-1], axis=1).values
        user_array = scaler.fit_transform(user_array)

        train_samples = int(user_array.shape[0] * 0.66)
        user_train = user_array[:train_samples, :]
        user_test = user_array[train_samples:, :]

        # Feature rows of every other user.
        other_users_array = df.loc[~is_user].drop(df.columns[-1], axis=1).values
        # NOTE(review): the scaler is re-fitted on the impostor data, so the
        # negatives are scaled independently of the training data. Kept as in
        # the original; confirm whether transform() was intended.
        other_users_array = scaler.fit_transform(other_users_array)

        # A single fit is enough; only the scores are needed downstream.
        clf = OneClassSVM(gamma='auto').fit(user_train)
        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(other_users_array)

        auc = compute_fpr_tpr(userid, positive_scores, negative_scores, plot=False)
        auc_list.append(auc)

    print('mean: %5.2f, std: %5.2f' % (np.mean(auc_list), np.std(auc_list)))
def do_one_class_svm(X):
    """Fit a One-Class SVM on ``X`` and evaluate it on the same data.

    Returns ``(pred, scores)``: the output of ``predict`` and of
    ``score_samples`` on ``X``.
    """
    model = OneClassSVM(gamma=0.00001, nu=0.01)
    model.fit(X)
    labels = model.predict(X)
    raw_scores = model.score_samples(X)
    return labels, raw_scores
class OCSVMDetector(IAnomaly):
    """Sliding-window anomaly detector backed by a One-Class SVM.

    Samples are dicts; only their "Resistance" entry is used. The model is
    re-fitted on the current window every time ``detect`` is called after the
    stabilization phase.
    """

    def __init__(self, slidingWindowSize = None):
        # NOTE: detect() does arithmetic on slidingWindowSize, so a real
        # integer must be supplied before detection starts.
        self.slidingWindowSize = slidingWindowSize
        self.receivedSamplesNumber = 0
        self.currentSamples = []
        self.clf = OneClassSVM(nu=0.1, kernel="rbf", gamma='auto')
        self.dictHeaders = ['detectionCode', 'anomalyLikelihood', 'anomalyScore']

    def appendNewData(self, sample):
        """Store the sample's "Resistance" value and count it."""
        self.currentSamples.append(float(sample["Resistance"]))
        self.receivedSamplesNumber = self.receivedSamplesNumber + 1

    def detect(self, new_data):
        """Return a dict with detection code, likelihood and score for one sample.

        While the window is still filling, all three values are -1.
        Afterwards the oldest sample is dropped, the model is re-fitted on the
        window, and the prediction and ``score_samples`` value of the newest
        sample are returned ('anomalyScore' stays -1).
        """
        if self.receivedSamplesNumber < self.slidingWindowSize - 1:
            # Append all of the stabilization samples.
            self.appendNewData(new_data)
            return dict(zip(self.dictHeaders, [-1, -1, -1]))

        # Remove one from current samples and add new data.
        self.currentSamples.pop(0)
        self.appendNewData(new_data)
        window = np.array(self.currentSamples).reshape(-1, 1)
        result = self.clf.fit_predict(window)[-1]
        likelihood = self.clf.score_samples(window)[-1]
        return dict(zip(self.dictHeaders, [result, likelihood, -1]))

    def detectFromList(self, data):
        """Run ``detect`` on every sample; return copies enriched with the result."""
        results = []
        # Parenthesised single-argument print works on Python 2 and Python 3.
        print("Detecting anomalies for {} samples of data".format(len(data)))
        for data_point in tqdm(data):
            detection = self.detect(data_point)
            result = copy.copy(data_point)
            result.update(detection)
            results.append(result)
        return results
def one_class_svm(self):
    """Score ``self.data`` with a One-Class SVM and flip the scores.

    The attributes ``VW``, ``V`` and ``W`` of ``self.data`` are tried in that
    order; the first one that can be fitted and scored without raising
    AttributeError is used. If all three fail, a message is printed and the
    last AttributeError propagates.

    Returns ``raw.max() - raw + 1``, so the highest raw score maps to 1.
    """
    candidates = ('VW', 'V', 'W')
    last = len(candidates) - 1
    for position, attr in enumerate(candidates):
        try:
            svm = OneClassSVM(gamma='auto').fit(getattr(self.data, attr))
            raw = svm.score_samples(getattr(self.data, attr))
        except AttributeError:
            if position == last:
                print('Where\'s the data?')
                raise
        else:
            break
    return raw.max() - raw + 1
class OCSVM(object):
    """Thin wrapper around sklearn's OneClassSVM with an anomaly-score API."""

    def __init__(self, kernel='rbf', d=2, gamma=3.0, nu=0.1):
        self.kernel = kernel
        self.d = d
        # The polynomial kernel always uses gamma = 1, whatever was passed.
        self.gamma = 1 if kernel == 'poly' else gamma
        # NOTE(review): nu is stored but not forwarded to OneClassSVM in
        # fit(); confirm whether that is intentional.
        self.nu = nu

    def fit(self, train_X):
        """Build the underlying OneClassSVM and fit it on ``train_X``."""
        # A gamma of 'auto' is mapped to sklearn's 'scale' setting.
        effective_gamma = 'scale' if self.gamma == 'auto' else self.gamma
        self.model = OneClassSVM(kernel=self.kernel,
                                 degree=self.d,
                                 gamma=effective_gamma,
                                 coef0=1)
        self.model.fit(train_X)

    def decision_function(self, X):
        """Return the negated ``score_samples`` of the fitted model."""
        # For the OC-SVM the boundary is at 0 and the positive side is the
        # training (normal) data, so the sign is flipped here.
        scores = self.model.score_samples(X)
        return (-1) * scores
def use_svm2(df_list, x_columns, **kwargs):
    """Fit a One-Class SVM on the first frame and score every frame.

    ``kwargs['kernel']`` selects the kernel. Returns a list with one
    ``score_samples`` array per entry of ``df_list``, computed on the
    ``x_columns`` columns.
    """
    model = OneClassSVM(kernel=kwargs['kernel'])
    model.fit(df_list[0][x_columns])
    return [
        model.score_samples(df_list[position][x_columns])
        for position in range(len(df_list))
    ]
def detect_outliers_SVM(df):
    '''
    Returns the outlier scores using a One-Class SVM
    (beware: prone to overfitting).

    Parameters:
    -----------
    df: pd.DataFrame,
        Data the model is fitted on and scored against.

    Returns:
    --------
    The array produced by ``OneClassSVM.score_samples(df)``.
    '''
    clf = OneClassSVM()
    # Only the fitted model is needed; fit_predict() would also compute
    # labels that were never used.
    clf.fit(df)
    scores = clf.score_samples(df)
    return scores
def load_datas(dataset, Processing_Unit, n=10):  # 0 < n <= 100
    """Write the average per-author One-Class SVM AUC to ``settings.aux_res``.

    For every distinct value of ``dataset.author``, a One-Class SVM is fitted
    on the first two thirds of that author's rows. It is scored on the
    author's remaining rows (positives) and on all other authors' rows
    (negatives). ``compute_fpr_tpr`` turns the two score arrays into one value
    per author, and the mean of those values is written to the result file.

    Parameters:
        dataset: DataFrame with an "author" column (and a "function" column
            when ``Processing_Unit == "FUNCTION"``).
        Processing_Unit: when equal to "FUNCTION", the "function" column is
            dropped from the features.
        n: unused by this function; kept for interface compatibility.

    Returns None.
    """
    data = dataset
    authors_list = unique(data.author)
    authors_list.sort()
    SIZE = len(authors_list)

    total_auc = 0
    for author in authors_list:
        users = data.loc[data['author'] == author]
        other_users_array = data.drop(users.index)

        X = users.drop("author", axis=1)
        other_users_array = other_users_array.drop("author", axis=1)
        if Processing_Unit == "FUNCTION":
            X = X.drop("function", axis=1)
            other_users_array = other_users_array.drop("function", axis=1)

        # First 2/3 of the author's rows train the model, the rest test it.
        Num_Of_Functions = users.shape[0]
        user_train = X.head(int(Num_Of_Functions * 2 / 3))
        user_test = X.tail(Num_Of_Functions - user_train.shape[0])

        # A single fit is sufficient.
        clf = OneClassSVM(gamma='scale').fit(user_train)
        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(other_users_array)

        total_auc += compute_fpr_tpr(author, positive_scores, negative_scores)

    avg = total_auc / SIZE

    # Mode 'w' already truncates; the context manager closes the file even
    # if the write fails.
    with open(settings.aux_res, 'w') as w:
        w.write("avg AUC : " + str('%.4f' % avg))
def use_model(model, df_list, x_columns, params):
    """Fit the chosen detector on the first frame and score every frame.

    Parameters:
        model: one of 'knn', 'svm', 'isolationForest', 'autoencoder',
            'lsanomaly'. The historical spelling 'ísolationForest' (accented
            first letter) is still accepted.
        df_list: sequence of DataFrames; ``df_list[0]`` is the training frame.
        x_columns: columns used as features.
        params: dict of hyper-parameters read by the selected branch.

    Returns:
        A list with one score sequence per frame, or an empty list when
        ``model`` is not recognised.
    """
    predicted = []
    if model == 'knn':
        neigh = NearestNeighbors(n_neighbors=params['n'], p=params['p'])
        neigh.fit(df_list[0][x_columns])
        for frame in df_list:
            # kneighbors returns (distances, indices); average the distances.
            distances = neigh.kneighbors(frame[x_columns])[0]
            predicted.append([np.mean(row) for row in distances])
    elif model == 'svm':
        svm = OneClassSVM(kernel=params['kernel'])
        svm.fit(df_list[0][x_columns])
        for frame in df_list:
            pred = svm.score_samples(frame[x_columns])
            # Flip the scores so that the frame's best-scored sample maps to 0.
            maximum = max(pred)
            predicted.append([(x * -1) + maximum for x in pred])
    elif model in ('isolationForest', 'ísolationForest'):
        # The original key started with an accented 'í', so the plain ASCII
        # spelling silently fell through and returned []. Accept both.
        clf = IsolationForest(n_estimators=params['n_estimators'],
                              random_state=0)
        clf.fit(df_list[0][x_columns])
        for frame in df_list:
            pred = clf.score_samples(frame[x_columns])
            predicted.append(list(map(abs, pred)))
    elif model == 'autoencoder':
        clf = AutoEncoder(hidden_neurons=params['hidden_neurons'],
                          verbose=0,
                          random_state=0)
        clf.fit(df_list[0][x_columns])
        for frame in df_list:
            predicted.append(clf.decision_function(frame[x_columns]))
    elif model == 'lsanomaly':
        anomalymodel = lsanomaly.LSAnomaly(sigma=params['sigma'],
                                           rho=params['rho'])
        anomalymodel.fit(df_list[0][x_columns].to_numpy())
        for frame in df_list:
            pred = anomalymodel.predict_proba(frame[x_columns].to_numpy())
            # Keep the second column of each probability row.
            predicted.append([a[1] for a in pred])
    return predicted
def main():
    """Fit and print a One-Class SVM per feature across the example sources.

    Every ``*.py`` file in the hard-coded example directory is read with
    ``read_src`` and turned into a feature mapping by ``get_features``. For
    each feature position, the values from all files form a one-column sample
    matrix on which a One-Class SVM is fitted; predictions and scores are
    printed.
    """
    versus = []
    for path in glob.glob('D:/Scripts/oneclass/ex/*.py'):
        versus.append(get_features(read_src(path)))

    vs_values = [list(vs.values()) for vs in versus]

    # zip(*...) transposes the per-file vectors into per-feature tuples.
    # The original zip(vs_values) wrapped each whole vector in a 1-tuple,
    # producing a 3-D input that OneClassSVM.fit rejects.
    # NOTE(review): assumes per-feature comparison across files was the
    # intent -- confirm.
    for vec in zip(*vs_values):
        column = [[value] for value in vec]
        clf = OneClassSVM(gamma='auto').fit(column)
        print('vecs: ', clf.predict(column))
        print('scores: ', clf.score_samples(column))
def svm_nd(x_train, x_test, y_test, plot_roc=False):
    """Novelty detection with One-Class SVMs over a kernel/gamma grid.

    For each kernel in ('linear', 'poly', 'rbf', 'sigmoid') and each gamma in
    ('scale', 'auto'), a model is fitted on ``x_train`` and its
    ``score_samples`` on ``x_test`` are passed to ``roc.area``.

    Returns a list of ``[(kernel, gamma), area]`` entries in grid order.
    """
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    gammas = ["scale", "auto"]
    auc_svm = []
    for kernel in tqdm(kernels):
        print(f'SVM---{kernel}---{"*" * 33}')
        for gamma in gammas:
            model = OneClassSVM(kernel=kernel, gamma=gamma, max_iter=10000)
            model.fit(x_train)
            area = roc.area(y_test=y_test,
                            y_scores=model.score_samples(x_test),
                            pos_label=1,
                            title='OC-SVM - ',
                            plot_roc=plot_roc)
            auc_svm.append([(kernel, gamma), area])
    return auc_svm
def correct_mine():
    """Fit a One-Class SVM on profiler outputs and print summary counts.

    Uses the module-level ``X_test``, ``y_test``, ``input_shape``,
    ``output_shape`` and ``cnn_profiler``. The first 10000 test samples are
    one-hot encoded over 10 classes and passed to
    ``cnn_profiler.get_correct_mid``. The model is fitted on the first 6000
    "correct" entries and scored on the remaining ones.
    """
    sample_limit = 10000
    inputs = X_test.astype("float32")[:sample_limit]
    class_ids = y_test.reshape(-1)[:sample_limit]
    one_hot = np.eye(10)[class_ids]

    correct_mid, wrong_mid = cnn_profiler.get_correct_mid(
        input_shape, output_shape, inputs, one_hot, anchor=-2)
    print(len(correct_mid), len(wrong_mid))

    clf = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5).fit(correct_mid[:6000])
    result = clf.score_samples(correct_mid[6000:])
    # NOTE(review): `result` holds raw scores, not -1/+1 labels, so the
    # `== -1` count below looks like it was meant for clf.predict(); confirm.
    print(len(result), len(result[result == -1]))
def fit(self, bags, y):
    """
    Fit the classifier on bags, using a One-Class SVM to label instances.

    A One-Class SVM is fitted on all instances of the negative bags. In every
    positive bag, the instance with the lowest One-Class SVM score is forced
    to label +1; the other instances get the negated One-Class SVM
    prediction. The resulting instance-level data set is passed to the parent
    class's ``fit``.

    @param bags : a sequence of n bags; each bag is an m-by-k array-like
                  object containing m instances with k features
    @param y : an array-like object of length n containing -1/+1 labels
    """
    self._bags = [np.asmatrix(bag) for bag in bags]
    y = np.asmatrix(y).reshape((-1, 1))

    # Select only the negative bags.
    list_X_neg = []
    for bag, cls in zip(self._bags, y):
        if cls == -1:
            list_X_neg += [bag]
    X_neg = np.vstack(list_X_neg)

    # The One-Class SVM is fitted with its default parameters on the
    # stacked instances of the negative bags.
    onClassSVM = OneClassSVM()
    onClassSVM.fit(X_neg)

    # Instance-level training set: start with all negative instances,
    # labelled -1.
    svm_X = [np.asmatrix(X_neg)]
    svm_y = [np.matrix(-np.ones((len(X_neg), 1)))]

    for bag, cls in zip(self._bags, y):
        if cls == 1:
            scores_bag = onClassSVM.score_samples(bag)
            # Negate the One-Class SVM prediction to obtain the labels.
            local_y = -1. * onClassSVM.predict(bag)
            # Force the lowest-scored instance of the bag to be positive.
            local_y[np.argmin(scores_bag)] = 1.
            local_y = np.reshape(local_y, (len(bag), 1))
            svm_X += [bag]
            svm_y += [local_y]

    svm_X = np.vstack(svm_X)
    svm_y = np.vstack(svm_y)

    # Delegate the instance-level fit to the parent class.
    super(MIbyOneClassSVM, self).fit(svm_X, svm_y)
""" Copyright (c) 2019 ground0state. All rights reserved. License: MIT License """ if __name__ == '__main__': import numpy as np from sklearn.svm import OneClassSVM normal_data = np.loadtxt("../input/normal_data.csv", delimiter=",") error_data = np.loadtxt("../input/error_data.csv", delimiter=",") class Args(): kernel = "rbf" degree = 3 gamma = "auto" model = OneClassSVM( kernel=Args().kernel, degree=Args().degree, gamma=Args().gamma, ).fit(normal_data) y_pred = -np.log(model.score_samples(error_data)) import matplotlib.pyplot as plt plt.plot(y_pred) plt.show()
print('\n******LOF*******\n') start = time.time() lof = LocalOutlierFactor() lof.fit(X) end = time.time() time_all[j, 1] = end - start lof_scores = lof.negative_outlier_factor_ print('\n******1-class SVM*******\n') start = time.time() osvm = OneClassSVM(kernel='rbf') osvm.fit(X) end = time.time() time_all[j, 2] = end - start osvm_scores = osvm.score_samples(X) print('\n******Our Algo*******\n') start = time.time() t1, _ = np.shape(X) # n_samples = int(max(t1/250,100)) # n_samples = int(t1/50) n_samples = 100 kwargs = { 'max_depth': 10, 'n_trees': 50, 'max_samples': n_samples, 'max_buckets': 3, 'epsilon': 0.1, 'sample_axis': 1, 'threshold': 0
# Settings taken from the parsed command-line arguments.
model_path = args.clustering_model_path
latents_path = args.latents_file
training = not args.no_training
test = args.test
ensemble = args.ensemble
seed = args.seed
input_shape = (32, 32)

# Load every latent file, stack along axis 0 and flatten each sample.
latents = np.concatenate([np.load(path) for path in latents_path], axis=0)
latents = latents.reshape(latents.shape[0], -1)
print(f'\033[32;1mlatents: {latents.shape}\033[0m')

np.random.seed(880301)

# Either fit and save a new model, or load a saved one.
if not training:
    print('\033[32;1mLoading Model\033[0m')
    model = utils.load_model(model_path)
else:
    model = OneClassSVM().fit(latents)
    utils.save_model(model_path, model)

# The scores are needed in every mode; only their destination differs.
pred = model.score_samples(latents)
if not test:
    print(f'\033[32;1mValidation score: {np.mean(pred)}\033[0m')
elif ensemble:
    np.save(test, pred)
else:
    utils.generate_csv(pred, test)
def evaluate_authentication(df, verbose=False):
    """Per-user One-Class SVM authentication evaluation on one DataFrame.

    The last column of ``df`` holds the user id. For each id returned by
    ``create_userids``, a model is fitted on the first 66% of that user's
    rows. It is scored on the user's remaining rows (positives) and on all
    other users' rows (negatives). The leading scores are replaced in place
    by block means over ``AGGREGATE_BLOCK_NUM`` consecutive scores before the
    AUC and EER are computed.

    Returns ``(auc_list, eer_list)`` with one entry per user.
    """

    def block_average_in_place(values):
        # Overwrite values[k] with the mean of values[k:k + block]; the
        # trailing block - 1 entries keep their raw value.
        for start in range(len(values) - AGGREGATE_BLOCK_NUM + 1):
            values[start] = np.average(
                values[start:start + AGGREGATE_BLOCK_NUM], axis=0)
        return values

    print(df.shape)
    userids = create_userids(df)
    auc_list, eer_list = [], []
    global_positive_scores, global_negative_scores = [], []

    for position in range(len(userids)):
        userid = userids[position]
        owned = df.iloc[:, -1].isin([userid])

        # Genuine rows (label column removed), split 66% / 34%.
        user_array = df.loc[owned].drop(df.columns[-1], axis=1).values
        num_samples = user_array.shape[0]
        train_samples = int(num_samples * 0.66)
        user_train = user_array[:train_samples, :]
        user_test = user_array[train_samples:, :]

        # Impostor rows: every other user's data.
        other_users_array = df.loc[~owned].drop(df.columns[-1], axis=1).values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)
        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(other_users_array)

        # Aggregation mutates the score arrays themselves, so the global
        # lists below receive the aggregated values as well.
        y_pred_positive = block_average_in_place(positive_scores)
        y_pred_negative = block_average_in_place(negative_scores)

        auc, eer = compute_AUC_EER(y_pred_positive, y_pred_negative)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose == True:
            print(str(userid) + ", " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean : %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))

    if verbose == True:
        global_auc, global_eer = compute_AUC_EER(global_positive_scores,
                                                 global_negative_scores)
        print("Global AUC: " + str(global_auc))
        print("Global EER: " + str(global_eer))
    return auc_list, eer_list
acc_score = accuracy_score(Y1_train, Y1_pred_train) print(f'Accuratezza sul train set: {acc_score}') prec_score = precision_score(Y1_train, Y1_pred_train) print('Precisione sul train set: %.3f' % prec_score) rec_score = recall_score(Y1_train, Y1_pred_train) print('Recall sul train set: %.3f' % rec_score) F1_score = f1_score(Y1_train, Y1_pred_train) print('F1 score sul train set: %.3f' % F1_score) #box plot df_train = clf.decision_function(X1_train_n) score_samples_train = clf.score_samples(X1_train_n) plt.scatter(df_train, np.arange(0, 26, 1), s=5) plt.axvline(x=0, color='red') plt.show() #TEST SET #matrice di confusione confmat = confusion_matrix(y_true=Y_TEST, y_pred=Y_pred_TEST) fig, ax = plt.subplots(figsize=(2.5, 2.5)) ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3) for i in range(confmat.shape[0]): for j in range(confmat.shape[1]):
def evaluate_authentication_train_test(df_train,
                                       df_test,
                                       data_type,
                                       num_blocks,
                                       representation_type,
                                       verbose=False,
                                       roc_data=False,
                                       roc_data_filename=TEMP_NAME):
    """Per-user One-Class SVM evaluation with separate train and test frames.

    The last column of both frames holds the user id. For every id found in
    ``df_train``, a model is fitted on that user's training rows. It is
    scored on the user's rows in ``df_test`` (positives) and on the other
    users' rows in ``df_test`` (negatives). The leading scores are replaced
    in place by means over ``num_blocks`` consecutive scores before the AUC
    and EER are computed.

    Depending on the module flags and arguments, the score distribution is
    plotted and the global ROC points are written to ``roc_data_filename``.

    Returns ``(auc_list, eer_list)`` with one entry per user.
    """

    def block_average_in_place(values):
        # Overwrite values[k] with the mean of values[k:k + num_blocks].
        for start in range(len(values) - num_blocks + 1):
            values[start] = np.average(values[start:start + num_blocks], axis=0)
        return values

    print("Training: " + str(df_train.shape))
    print("Testing: " + str(df_test.shape))
    userids = create_userids(df_train)
    auc_list, eer_list = [], []
    global_positive_scores, global_negative_scores = [], []

    for position in range(len(userids)):
        userid = userids[position]

        # Training rows of the user, label column removed.
        user_train_data = df_train.loc[df_train.iloc[:, -1].isin([userid])]
        user_train_data = user_train_data.drop(user_train_data.columns[-1], axis=1)

        # Genuine and impostor rows come from the test frame.
        genuine_mask = df_test.iloc[:, -1].isin([userid])
        user_test_data = df_test.loc[genuine_mask]
        user_test_data = user_test_data.drop(user_test_data.columns[-1], axis=1)
        other_users_data = df_test.loc[~genuine_mask]
        other_users_data = other_users_data.drop(other_users_data.columns[-1], axis=1)

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train_data)
        positive_scores = clf.score_samples(user_test_data)
        negative_scores = clf.score_samples(other_users_data)

        # Aggregation mutates the score arrays themselves.
        y_pred_positive = block_average_in_place(positive_scores)
        y_pred_negative = block_average_in_place(negative_scores)

        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)

        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(
                positive_scores, negative_scores)

        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose == True:
            print(str(userid) + ", " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print("\nNumber of blocks: ", num_blocks)
    print('AUC mean : %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    print("#positives: " + str(len(global_positive_scores)))
    print("#negatives: " + str(len(global_negative_scores)))

    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores,
                                                       global_negative_scores)

    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(
        representation_type.value)
    if SCORES == True:
        plot_scores(global_positive_scores,
                    global_negative_scores,
                    filename,
                    title='Scores distribution')

    if roc_data == True:
        roc_points = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
        roc_points.to_csv(roc_data_filename, index=False)

    print(data_type.value + " Global AUC: " + str(global_auc))
    print(data_type.value + " Global EER: " + str(global_eer))
    return auc_list, eer_list
def use_svm(df_x_train, df_x_test, c=1):
    """Score ``df_x_test`` with a sigmoid-kernel One-Class SVM.

    The model is fitted on ``df_x_train``. ``c`` is accepted but not used.
    Returns the ``score_samples`` array for ``df_x_test``.
    """
    model = OneClassSVM(kernel='sigmoid')
    model.fit(df_x_train)
    return model.score_samples(df_x_test)
acc_score = accuracy_score(Y1_train, Y1_pred_train) print(f'Accuratezza sul train set: {acc_score}') prec_score = precision_score(Y1_train, Y1_pred_train) print('Precisione sul train set: %.3f' % prec_score) rec_score = recall_score(Y1_train, Y1_pred_train) print('Recall sul train set: %.3f' % rec_score) F1_score = f1_score(Y1_train, Y1_pred_train) print('F1 score sul train set: %.3f' % F1_score) #box plot df_train = clf.decision_function(X1_train_n_reduced) score_samples_train = clf.score_samples(X1_train_n_reduced) plt.scatter(df_train, np.arange(0, 26, 1), s=5) plt.axvline(x=0, color='red') plt.show() #TEST SET #matrice di confusione confmat = confusion_matrix(y_true=Y_TEST, y_pred=Y_pred_TEST) fig, ax = plt.subplots(figsize=(2.5, 2.5)) ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3) for i in range(confmat.shape[0]): for j in range(confmat.shape[1]):
def evaluate_authentication_cross_day(df1, df2, verbose=False):
    """Cross-session One-Class SVM authentication evaluation.

    The last column of both frames holds the user id. For every id found in
    ``df1``, a model is fitted on that user's rows from ``df1``. It is scored
    on the user's rows from ``df2`` (positives) and on the other users' rows
    from ``df2`` (negatives). The leading scores are replaced in place by
    block means over ``AGGREGATE_BLOCK_NUM`` consecutive scores before the
    AUC and EER are computed.

    Returns ``(auc_list, eer_list)`` with one entry per user.
    """

    def block_average_in_place(values):
        # Overwrite values[k] with the mean of values[k:k + block].
        for start in range(len(values) - AGGREGATE_BLOCK_NUM + 1):
            values[start] = np.average(
                values[start:start + AGGREGATE_BLOCK_NUM], axis=0)
        return values

    print("Session 1 shape: " + str(df1.shape))
    print("Session 2 shape: " + str(df2.shape))
    userids = create_userids(df1)
    global_positive_scores, global_negative_scores = [], []
    auc_list, eer_list = [], []

    for position in range(len(userids)):
        userid = userids[position]

        # Session 1 rows of the user train the model.
        user_session1_data = df1.loc[df1.iloc[:, -1].isin([userid])]
        user_session1_array = user_session1_data.drop(
            user_session1_data.columns[-1], axis=1).values

        # Session 2 supplies both the positive and the negative test data.
        genuine_mask = df2.iloc[:, -1].isin([userid])
        user_session2_data = df2.loc[genuine_mask]
        user_session2_array = user_session2_data.drop(
            user_session2_data.columns[-1], axis=1).values
        other_users_session2_data = df2.loc[~genuine_mask]
        other_users_session2_array = other_users_session2_data.drop(
            other_users_session2_data.columns[-1], axis=1).values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_session1_array)
        positive_scores = clf.score_samples(user_session2_array)
        negative_scores = clf.score_samples(other_users_session2_array)

        # Aggregation mutates the score arrays themselves, so the global
        # lists below receive the aggregated values as well.
        y_pred_positive = block_average_in_place(positive_scores)
        y_pred_negative = block_average_in_place(negative_scores)

        auc, eer = compute_AUC_EER(y_pred_positive, y_pred_negative)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)

        if verbose == True:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean : %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))

    if verbose == True:
        global_auc, global_eer = compute_AUC_EER(global_positive_scores,
                                                 global_negative_scores)
        print("Global AUC: " + str(global_auc))
        print("Global EER: " + str(global_eer))
    return auc_list, eer_list
print(X_test.shape) #import pickle import joblib from sklearn.svm import OneClassSVM #from sklearn.model_selection import GridSearchCV, ParameterGrid eplison = 0.001 gamma = 0.0001 nu = 0.001 one_svm_rbf = OneClassSVM(nu=nu, kernel='rbf', gamma=gamma, tol=eplison) one_svm_rbf.fit(X_train) what_kernel = 'rbf' print('testing data------------------------------------------------') Y_result_rbf = one_svm_rbf.predict(X_test) Y_scroe_rbf = one_svm_rbf.score_samples(X_test) print('test data size :{}'.format(X_test.shape[0])) print('test data anomaly : {}'.format(np.sum(Y_result_rbf == -1))) print('rbf:{}'.format(np.sum(Y_result_rbf == -1) / len(Y_result_rbf) * 100)) print('traning data------------------------------------------------') Y_result_rbf_t = one_svm_rbf.predict(X_train) Y_scroe_rbf_t = one_svm_rbf.score_samples(X_train) print('train data size :{}'.format(X_train.shape[0])) print('train data anomaly : {}'.format(np.sum(Y_result_rbf_t == -1))) print('rbf:{}'.format( np.sum(Y_result_rbf_t == -1) / len(Y_result_rbf_t) * 100)) print('all data------------------------------------------------') eplison = 0.001
def main():
    """Run the One-Class SVM baseline and print its AUROC.

    Parses the command-line options, copies them into the shared ``config``
    module, attaches a file handler to the configured logger, loads the data
    set, fits a One-Class SVM on the flattened training samples and prints
    the AUROC that separates ``test_in`` from ``test_out`` by their
    ``score_samples`` values.
    """
    # np.random.seed(777)
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='../data')
    parser.add_argument('--dataset_name', type=str, default='swat')
    parser.add_argument('--normal_class_index_list', nargs='+',
                        default=[0])  # get a list of normal class indexes
    parser.add_argument('--cluster_num', type=int, default=5)
    parser.add_argument('--n_hidden_features', type=int, default=10)
    parser.add_argument('--cluster_type', type=str, default='gmm')
    parser.add_argument('--dec_pretrain_lr', type=float, default=0.01)
    parser.add_argument('--dec_train_epochs', type=int, default=100)
    parser.add_argument('--dec_train_lr', type=float, default=0.01)
    parser.add_argument('--save_cluster_model', type=str2bool, default=False)
    parser.add_argument('--load_cluster_model', type=str2bool, default=False)
    parser.add_argument('--classifier', type=str, default='linear')
    parser.add_argument('--classifier_epochs', type=int, default=200)
    parser.add_argument('--classifier_lr', type=float, default=0.01)
    parser.add_argument('--save_classifier_model', type=str2bool, default=False)
    parser.add_argument('--load_classifier_model', type=str2bool, default=False)
    parser.add_argument('--temperature', type=float, default=1000)
    parser.add_argument('--perturbation', type=float, default=0.001)
    parser.add_argument('--plot_clustering', type=str2bool, default=False)
    args = parser.parse_args()

    data_path = args.data_path
    dataset_name = args.dataset_name

    # if image data, set rgb flag
    if (dataset_name in config.rgb_datasets):
        config.is_rgb = True
        config.cvae_channel = 3

    # Mirror the parsed options into the shared config module.
    normal_class_index_list = args.normal_class_index_list
    normal_class_index_list = [int(i) for i in normal_class_index_list]
    config.normal_class_index_list = normal_class_index_list
    cluster_num = args.cluster_num
    config.cluster_num = cluster_num
    n_hidden_features = args.n_hidden_features
    config.n_hidden_features = n_hidden_features
    cluster_type = args.cluster_type
    config.cluster_type = cluster_type
    config.save_cluster_model = args.save_cluster_model
    config.load_cluster_model = args.load_cluster_model
    classifier = args.classifier
    config.classifier = classifier
    config.classifier_epochs = args.classifier_epochs
    config.classifier_lr = args.classifier_lr
    config.save_classifier_model = args.save_classifier_model
    config.load_classifier_model = args.load_classifier_model
    temperature = args.temperature
    config.temperature = temperature
    perturbation = args.perturbation
    config.perturbation = perturbation
    config.plot_clustering = args.plot_clustering

    # logger: make sure both log directories exist, then log to a file whose
    # name combines the start time, data set, cluster type and classifier.
    log = config.logger
    log_path = config.log_path
    if os.path.exists(log_path) == False:
        os.makedirs(log_path)
    sub_log_path = config.sub_log_path
    if os.path.exists(sub_log_path) == False:
        os.makedirs(sub_log_path)
    fileHandler = logging.FileHandler(\
        os.path.join(sub_log_path, config.current_time + '-' +\
        dataset_name + '-' +\
        cluster_type + '-' +\
        classifier + '.txt'))
    fileHandler.setFormatter(config.formatter)
    config.logger.addHandler(fileHandler)

    # Record the full argument set and the start time.
    log.info("-" * 99)
    log.info("-" * 10 + str(args) + "-" * 10)
    log.info("-" * 99)
    log.info('START %s:%s:%s\n' %
             (datetime.datetime.now().hour, datetime.datetime.now().minute,
              datetime.datetime.now().second))
    log.info('%s:%s:%s\n' %
             (datetime.datetime.now().hour, datetime.datetime.now().minute,
              datetime.datetime.now().second))

    # Echo the main settings to both stdout and the log.
    print("dataset name : " + dataset_name)
    log.info("dataset name : " + dataset_name)
    print("classifier : " + classifier)
    log.info("classifier : " + classifier)
    print("normal_class_index_list : {}".format(normal_class_index_list))
    log.info("normal_class_index_list : {}".format(normal_class_index_list))
    print("n_hidden_features : {}".format(n_hidden_features))
    log.info("n_hidden_features : {}".format(n_hidden_features))
    print("temperature : {}".format(temperature))
    log.info("temperature : {}".format(temperature))
    print("perturbation : {}".format(perturbation))
    log.info("perturbation : {}".format(perturbation))

    # loading dataset
    dataset = load_dataset(dataset_name=dataset_name, data_path=data_path)
    print("")
    print("dataset loading successful!")
    log.info("dataset loading successful")
    train_x = dataset["train_x"]
    train_y = dataset["train_y"]
    test_in = dataset["test_in"]
    test_out = dataset["test_out"]
    print(dataset_name)
    print(normal_class_index_list)

    cls = OneClassSVM(gamma='auto')

    # Flatten every training sample into a 1-D numpy vector.
    train_x_list = []
    for x in train_x:
        x = x.view(-1).numpy()
        train_x_list.append(x)
    print("fitting to one_class_svm")
    cls.fit(train_x_list)

    # Score the in-distribution test samples.
    test_in_pred = []
    for t_i in test_in:
        t_i = t_i.view(-1).numpy()
        test_in_pred.append(t_i)
    print("predicting test_in")
    # test_in_pred = cls.predict(test_in_pred)
    test_in_pred = cls.score_samples(test_in_pred)

    # Score the out-of-distribution test samples.
    test_out_pred = []
    for t_o in test_out:
        t_o = t_o.view(-1).numpy()
        test_out_pred.append(t_o)
    print("predicting test_out")
    # test_out_pred = cls.predict(test_out_pred)
    test_out_pred = cls.score_samples(test_out_pred)

    # Label 0 marks test_in and label 1 marks test_out; with pos_label=0 the
    # in-distribution samples are the positive class for the ROC curve.
    labels = [0 for i in range(len(test_in_pred))
              ] + [1 for i in range(len(test_out_pred))]
    fpr, tpr, thresholds = roc_curve(labels,
                                     test_in_pred.tolist() +
                                     test_out_pred.tolist(),
                                     pos_label=0)
    auroc = auc(fpr, tpr)
    print(auroc)
def eval_embed(self, trainset, testset):
    """Evaluate performance on test set.

    Features are extracted from both sets with ``self.extract``; then every
    key of ``self.eval_metrics`` is filled with the value returned by
    ``util_metric.roc`` for the scores that key selects. The key prefix
    ('logit', 'dscore', 'embed', 'pool') selects the default prediction and
    the feature source. A substring ('auc', 'locsvm', 'kocsvm', 'kde', 'gde')
    selects the scoring method.
    """
    _, _, embeds_tr, pools_tr, _ = self.extract(trainset)
    probs, dscores, embeds, pools, labels = self.extract(testset)
    # Similarities between test and train features, derived from
    # self.squared_difference and scaled by -0.5.
    sim_embed = -0.5 * self.squared_difference(embeds, embeds_tr, True)
    sim_pool = -0.5 * self.squared_difference(pools, pools_tr, True)
    # Distance score: one minus the top-1 similarity per test sample.
    dist_embed = tf.reduce_mean(1.0 - tf.nn.top_k(sim_embed, k=1)[0], axis=1)
    dist_pool = tf.reduce_mean(1.0 - tf.nn.top_k(sim_pool, k=1)[0], axis=1)
    for key in self.eval_metrics:
        # Step 1: pick the default prediction (and features) from the prefix.
        if key.startswith('logit'):
            pred = 1.0 - probs[:, 0]
        elif key.startswith('dscore'):
            pred = 1.0 - dscores
        elif key.startswith('embed'):
            pred = dist_embed
            feats_tr = embeds_tr.numpy()
            feats = embeds.numpy()
            sim = sim_embed
        elif key.startswith('pool'):
            pred = dist_pool
            feats_tr = pools_tr.numpy()
            feats = pools.numpy()
            sim = sim_pool
        # Step 2: compute the metric from the method named in the key.
        if 'auc' in key:
            self.eval_metrics[key] = util_metric.roc(pr=pred, gt=labels)
        elif 'locsvm' in key and key.startswith(('embed', 'pool')):
            # Linear kernel OC-SVM.
            clf = OneClassSVM(kernel='linear').fit(feats_tr)
            # Scores are negated before being passed to the ROC computation.
            scores = -clf.score_samples(feats)
            self.eval_metrics[key] = util_metric.roc(pr=scores, gt=labels)
        elif 'kocsvm' in key and key.startswith(('embed', 'pool')):
            # RBF kernel OC-SVM.
            feats_tr = tf.nn.l2_normalize(feats_tr, axis=1)
            feats = tf.nn.l2_normalize(feats, axis=1)
            # 10 times larger value of gamma.
            gamma = 10. / (tf.math.reduce_variance(feats_tr) * feats_tr.shape[1])
            clf = OneClassSVM(kernel='rbf', gamma=gamma).fit(feats_tr)
            scores = -clf.score_samples(feats)
            self.eval_metrics[key] = util_metric.roc(pr=scores, gt=labels)
        elif 'kde' in key and key.startswith(('embed', 'pool')):
            # RBF kernel density estimation.
            feats_tr = tf.nn.l2_normalize(feats_tr, axis=1)
            gamma = 10. / (tf.math.reduce_variance(feats_tr) * feats_tr.shape[1])
            scores = None
            # The similarity matrix is processed in chunks of 100 rows and
            # the per-chunk scores are concatenated.
            batch_size_for_kde = 100
            num_iter = int(np.ceil(sim.shape[0] / batch_size_for_kde))
            for i in range(num_iter):
                sim_batch = sim[i * batch_size_for_kde:(i + 1) * batch_size_for_kde]
                scores_batch = -tf.divide(
                    tf.reduce_logsumexp(2 * gamma * sim_batch, axis=1), gamma)
                scores = scores_batch if scores is None else tf.concat(
                    (scores, scores_batch), axis=0)
            self.eval_metrics[key] = util_metric.roc(pr=scores, gt=labels)
        elif 'gde' in key and key.startswith(('embed', 'pool')):
            # Gaussian density estimation with full covariance.
            feats_tr = tf.nn.l2_normalize(feats_tr, axis=1)
            feats = tf.nn.l2_normalize(feats, axis=1)
            km = GMM(n_components=1, init_params='kmeans', covariance_type='full')
            km.fit(feats_tr)
            scores = -km.score_samples(feats)
            self.eval_metrics[key] = util_metric.roc(pr=scores, gt=labels)
componentResults.append((0, 0)) train_ds = cu.lumpRecords(n_fold_ds) svm1c = OneClassSVM() train_a = train_ds[train_ds["active"] == True] #ann.fit(train_ds.iloc[:, 0:numcols], train_ds.iloc[:, numcols]) svm1c.fit(train_a.iloc[:, 0:numcols], None) # G_a = GaussianMixture(n_components=best_components, covariance_type="full").fit(train_ds.iloc[:, 0:numcols], # train_ds.iloc[:, numcols]) results = pd.DataFrame() results["score"] = [ max(svm1c.score_samples(x[0].iloc[:, 0:numcols])) for x in test_ds ] #results["a_score"] = [G_a.score(x[0].iloc[:, 0:numcols]) for x in test_ds] results["truth"] = [x[2] for x in test_ds] #np.array(test_ds)[:, 2] molName = molfiles[molNdx][ 1] #[molfiles[molNdx].rfind("/", 0, -1)+1:-1] auc = eval.plotSimROC( results["truth"], [results["score"]], molName + "[1C-SVM, " + str(portion * 100) + "%]", molName + "_1CSVM_sim_" + str(portion * 100) + ".pdf") auc_rank = eval.plotRankROC( results["truth"], [results["score"]], molName + "[1C-SVM, " + str(portion * 100) + "%]", molName + "_1CSVM_rank_" + str(portion * 100) + ".pdf") mean_ef = eval.getMeanEFs(np.array(results["truth"]), np.array([results["score"]]),
def train(loader, epoch, model_list, method='ocsvm'):
    """Fit one anomaly detector per class and derive a score threshold for each.

    According to the original author's note, a score above the threshold
    means the sample is considered normal.

    Args:
        loader: Forwarded to ``get_features`` to obtain features and labels.
        epoch: Forwarded to ``fit`` and ``score_samples`` when ``method`` is
            ``'ocnn'``.
        model_list: ``(models, optimizers)`` pair, both indexed by class
            label. The models are only read for methods that are not built
            from scratch here (e.g. ``'ocnn'``, which is trained over several
            rounds and needs the model from the previous round).
        method: Detector to use. ``'ocsvm'``, ``'isofore'``, ``'gmm'``,
            ``'svdd'`` and ``'lof'`` build a new detector per class;
            ``'sp'`` and ``'cnn'`` fit nothing here; any other value
            (including ``'ocnn'``) reuses ``model_list[0][label]``.

    Returns:
        ``((fitted_models, optimizers), threshold_list)``. ``fitted_models``
        holds the detectors fitted in this call (empty for ``'sp'`` and
        ``'cnn'``) and ``optimizers`` is the list taken from ``model_list``.
        ``threshold_list`` is a list with one threshold per class, except
        for ``'sp'`` (the result of ``get_c_v``) and ``'cnn'`` (``''``).
    """
    datas, labels = get_features(loader)
    previous_models, optimizers = model_list

    fitted_models = []
    # NOTE(review): collected but never returned -- the optimizer handed back
    # by ``fit`` for 'ocnn' is dropped and the input ``optimizers`` list is
    # returned instead. Confirm whether that is intended.
    fitted_optimizers = []
    threshold_list = []

    for label in range(args.class_num):
        class_rows = np.where(labels == label)[0]
        class_data = datas[class_rows]  # training data for this label
        optimizer = optimizers[label]

        # These two methods fit no per-class detector in this function.
        if method == 'sp':
            from cnn import get_c_v
            threshold_list = get_c_v(p_s=datas, labels=labels)
            continue
        if method == 'cnn':
            threshold_list = ''
            continue

        # Build (or reuse) the detector, train it and score its training data.
        if method == 'ocnn':
            clf, optimizer = fit(previous_models[label], class_data,
                                 optimizer, epoch)
            train_scores = score_samples(clf, class_data, epoch)
        elif method == 'lof':
            # NOTE(review): ``.size`` is the total element count of the array,
            # not the number of rows -- confirm this is the intended basis.
            clf = LocalOutlierFactor(novelty=True,
                                     n_neighbors=int(class_data.size * 0.1))
            clf.fit(class_data)
            train_scores = clf.decision_function(class_data)
        else:
            if method == 'ocsvm':
                clf = OneClassSVM()
            elif method == 'isofore':
                clf = IsolationForest()
            elif method == 'gmm':
                clf = BayesianGaussianMixture()
            elif method == 'svdd':
                clf = SVDD(parameters)
            else:
                clf = previous_models[label]
            clf.fit(class_data)
            train_scores = clf.score_samples(class_data)

        # Threshold: the plain mean for 'gmm', otherwise the mean lowered by a
        # configurable multiple of the standard deviation.
        if method == 'gmm':
            threshold = np.mean(train_scores)
        else:
            threshold = (np.mean(train_scores)
                         - args.threshold_std_times * np.std(train_scores))

        fitted_optimizers.append(optimizer)
        fitted_models.append(clf)
        threshold_list.append(threshold)

    return (fitted_models, optimizers), threshold_list
def sklearn_oneclass(featureset, target, classnum):
    """Run ten one-class SVM train/test rounds for one class and print metrics.

    Each round makes a stratified 80/20 split, passes it through
    ``IAtool.minepro``, fits ``OneClassSVM(nu=0.02)`` on the training rows
    whose label equals ``classnum`` and evaluates on the whole test split.

    Args:
        featureset: Features, split with ``train_test_split``.
        target: Labels for ``featureset``; also used for stratification.
        classnum: Label of the class to model (the original author's note
            says class numbering starts at 1).

    Returns:
        None. Per-round and mean accuracy, FAR and FRR are printed.
    """
    accuracies, fars, frrs = [], [], []

    for round_no in range(10):
        split = train_test_split(featureset,
                                 target,
                                 test_size=0.2,
                                 random_state=round_no * 30,
                                 stratify=target)
        train_data, test_data, train_target, test_target = split
        train_data, test_data, _ = IAtool.minepro(train_data, test_data,
                                                  train_target, 30)
        # Optional LDA (IAtool.ldapro) and PCA (IAtool.pcapro) stages were
        # left disabled by the original author.
        print("进入第", round_no, "轮分类的oneclass阶段")

        # Positional indexing is kept on purpose: the container types are not
        # visible here, so iterating directly might not be equivalent.
        positives = [
            train_data[k] for k in range(len(train_target))
            if train_target[k] == (classnum)
        ]
        expected = [
            1 if test_target[k] == (classnum) else -1
            for k in range(len(test_target))
        ]

        # Alternatives the original author tried and left disabled:
        #   EllipticEnvelope(random_state=0)
        #   LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
        #   IsolationForest(random_state=0, max_features=<n features>,
        #                   bootstrap=True)
        clf = OneClassSVM(nu=0.02).fit(positives)

        predicted = clf.predict(test_data)
        sample_scores = clf.score_samples(test_data)
        distances = clf.decision_function(test_data)
        print('原结果:', expected)
        print('预测结果:', predicted)
        print('预测分数:', sample_scores)
        print('模型距离:', distances)

        # Presumably ``one_accuracy_score`` thresholds the distances at the
        # third argument -- confirm against its definition.
        # Original author's note on values per detector (meaning not evident
        # from this function): onesvm 1, EllipticEnvelope -80,
        # IsolationForest -0.64, LocalOutlierFactor -1.
        tp, tn, fp, fn = one_accuracy_score(expected, distances, 0)
        print(tp, tn, fp, fn)

        accuracy = (tp + tn) / (tp + tn + fp + fn)
        far = fp / (fp + tn)
        frr = fn / (fn + tp)
        print("accuracy:", accuracy, "far:", far, "frr:", frr)

        accuracies.append(accuracy)
        fars.append(far)
        frrs.append(frr)

    print("meanacc:", np.mean(accuracies), "meanfar:", np.mean(fars),
          "meanfrr:", np.mean(frrs))
window_size, subset='test') data_test_anomaly = FirmaData_select_subjects(data_folder_dir, window_size, par_anodata[0], par_anodata[1], par_anodata[2], test_subject_list, subset='train') data_train = data_train.datamat data_test_normal = data_test_normal.datamat data_test_anomaly = data_test_anomaly.data data_test_anomaly = data_test_anomaly model.fit(data_train) score_train = model.score_samples(data_train) score_test_normal = model.score_samples(data_test_normal) score_test_anomaly = model.score_samples(data_test_anomaly) predict_train = model.predict(data_train) predict_train = 1 * predict_train > 0 predict_test_normal = 1 * model.predict(data_test_normal) > 0 predict_test_anomaly = 1 * model.predict(data_test_anomaly) > 0 # print(np.mean(score_train),np.mean(score_test_normal),np.mean(score_test_anomaly)) N_test_normal = data_test_normal.shape[0] # N_test_anomaly=data_test_anomaly.shape[0] N_test_anomaly = len(data_test_anomaly) N_train = data_train.shape[0] TP = np.sum(predict_test_normal)