lCorr = 60 * 24 * 7 // MPS  # use one week of data to compute the Pearson correlation
R = 800
nNearPart = 2
mNear = np.identity(nR)
mNear = np.concatenate((mNear, (dists > 0) & (dists <= R)))
sMNear = np.repeat(mNear.sum(axis=1), nS * t_delta)
mNearTile = np.tile(mNear, (1, nS * t_delta))
score_ind = np.zeros((nT, nR * nS))
score_r = np.zeros((nT, nR)) + 100
score_int = np.zeros((nT, nR)) + 100
anomalies = np.zeros((nT, nR))
dVector = (t_delta - 1 + nNearPart) * nS
model_r = OneClassSVM(nu=0.1)
model_int = OneClassSVM(nu=0.1)
train_r = np.zeros((0, nS))
train_int = np.zeros((0, dVector))
tsTrain = 60 * 24 * 7 // MPS
nTrain = tsTrain * nR
detect_st = (datetime(2014, 11, 27) - stDT).days * 24 * 60 // MPS  # detect anomalies on 2014-11-27
ed = (datetime(2014, 11, 28) - stDT).days * 24 * 60 // MPS
st = max(detect_st - tsTrain, lCorr)
trained = False
p1 = np.einsum('ij,ik->kj', data[(st - lCorr):st, :], data[(st - lCorr):st, :])
for ts in range(st, ed):
    print('\r' + str(ts), end='')
X_train, X_test, y_train_pre, y_test_pre = train_test_split(X, y, test_size=0.3)
y_train = []
y_test = []
# keep only the target class (aa) as positives; OneClassSVM ignores y during fit
for i in y_train_pre:
    if i == aa:
        y_train.append(1)
    # else:
    #     y_train.append(-1)
for i in y_test_pre:
    if i == aa:
        y_test.append(1)
    else:
        y_test.append(-1)
y_train = np.array(y_train)
y_test = np.array(y_test)
OCSVM = OneClassSVM(gamma=1, kernel='rbf')
OCSVM.fit(X_train, y_train)
ans = y_test - OCSVM.predict(X_test)
# a difference of 0 means the prediction matched the true label
i = 0
for a in ans:
    if a == 0:
        i = i + 1
print(aa, ": ", i / len(ans))
def main():
    X = read_data()
    Y = read_labels()
    isf = IsolationForest(**ISF_HYPER_PARAMS)
    lof = LocalOutlierFactor(**LOF_HYPER_PARAMS)
    svm = OneClassSVM(**SVM_HYPER_PARAMS)
    cov = EllipticEnvelope(**COV_HYPER_PARAMS)
    kmn = KMeans(**KMN_HYPER_PARAMS)
    preds_isf = []
    preds_lof = []
    preds_svm = []
    preds_cov = []
    preds_kmn = []
    preds = []
    for user in range(0, num_of_users):
        X_all = X[user]
        X_labeled = X[user][0:num_of_genuine_segments]
        X_unlabeled = X[user][num_of_genuine_segments:]
        # vectorize the segments: raw counts, then tf weighting (no idf)
        count_vect = CountVectorizer()
        tfidf_transformer = TfidfTransformer(use_idf=False)
        X_all_counts = count_vect.fit_transform(X_all)
        X_labeled_counts = count_vect.transform(X_labeled)
        X_unlabeled_counts = count_vect.transform(X_unlabeled)
        X_all_tfidf = tfidf_transformer.fit_transform(X_all_counts)
        X_labeled_tfidf = tfidf_transformer.transform(X_labeled_counts)
        X_unlabeled_tfidf = tfidf_transformer.transform(X_unlabeled_counts)
        isf.fit(X_all_tfidf)
        lof.fit(X_all_tfidf)
        svm.fit(X_all_tfidf)
        cov.fit(X_all_tfidf.toarray())
        kmn.fit(X_all_tfidf)
        pred_isf = isf.predict(X_unlabeled_tfidf)
        pred_lof = lof.predict(X_unlabeled_tfidf)
        pred_svm = svm.predict(X_unlabeled_tfidf)
        pred_cov = cov.predict(X_unlabeled_tfidf.toarray())
        pred_kmn = predict_by_euclidian_distance(X_unlabeled_tfidf, kmn)
        # map the sklearn convention (-1 = outlier) to 1 = anomaly, 0 = normal
        pred_isf = [1 if p == -1 else 0 for p in pred_isf]
        pred_lof = [1 if p == -1 else 0 for p in pred_lof]
        pred_svm = [1 if p == -1 else 0 for p in pred_svm]
        pred_cov = [1 if p == -1 else 0 for p in pred_cov]
        preds_lof.append(pred_lof)
        preds_isf.append(pred_isf)
        preds_svm.append(pred_svm)
        preds_cov.append(pred_cov)
        preds_kmn.append(pred_kmn)
        # majority vote over the five detectors
        pred_sum = np.array(pred_lof) + np.array(pred_isf) + np.array(
            pred_svm) + np.array(pred_kmn) + np.array(pred_cov)
        majority = [1 if i > 2 else 0 for i in pred_sum]
        preds.append(majority)
    print("LOF:")
    evaluate_model(preds_lof, Y)
    print("ISF:")
    evaluate_model(preds_isf, Y)
    print("SVM:")
    evaluate_model(preds_svm, Y)
    print("COV:")
    evaluate_model(preds_cov, Y)
    print("KMN:")
    evaluate_model(preds_kmn, Y)
    print("TOTAL:")
    evaluate_model(preds, Y)
    WriteOutput(preds)
slc = np.r_[0:128]  # the first 128 feature columns
trainX[slc] = trainX[slc].astype(np.float64)
testX[slc] = testX[slc].astype(np.float64)
trainy = trainy.to_frame()
testy = testy.to_frame()

from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
import pandas as pd

# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on the majority class only (ravel the one-column frame into a boolean mask)
trainX = trainX[(trainy == 1).values.ravel()]
model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(testX)
# mark inliers 1, outliers -1 to match the OneClassSVM convention
testy[testy == 1] = 1
testy[testy == 0] = -1
# calculate score
evaluate_results(testy, yhat)
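# A possible shape for the evaluate_results helper used above (hypothetical;
# the original implementation is not shown). With labels in the OneClassSVM
# convention (inlier=+1, outlier=-1), F1 on the outlier class is one
# reasonable score, matching the f1_score import in the snippet.
def evaluate_results(y_true, y_pred):
    from sklearn.metrics import f1_score
    print('F1 on outlier class: %.3f' % f1_score(y_true, y_pred, pos_label=-1))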
plt.axvline(1993, label='Steroid Era Start - 1993', color='green')
plt.axvline(2004, label='Steroid Era End - 2004', color='green')
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:1.3f}'))
ax.tick_params(axis='both', which='major', labelsize=12)
plt.legend(loc='upper left')
plt.show()

# One-Class SVM (support vector method) for anomaly detection
dfx = dfplot[['OPS', 'yearID']]
data = dfplot[['OPS']]
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train the One-Class SVM
model = OneClassSVM(nu=0.15, kernel="rbf", gamma=0.01)
model.fit(data)
dfx['anomaly3'] = pd.Series(model.predict(data))

# visualization of One-Class SVM anomaly detection
fig, ax = plt.subplots(figsize=(15, 7))
a = dfx.loc[dfx['anomaly3'] == -1, ['yearID', 'OPS']]  # anomalies
ax.set_title('OPS Trend\nOne Class SVM Anomaly Detection\n', weight='bold', size=14)
ax.set_xlabel("Year", labelpad=10, size=14)
ax.set_ylabel("OPS", labelpad=10, size=14)
ax.plot(dfx['yearID'], dfx['OPS'], marker='.', linestyle='none',
def __init__(self, kernel="rbf"):
    self._model = OneClassSVM(gamma='scale', kernel=kernel)
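# Illustrative companions to the constructor above (assumed wrapper API, not
# shown in the original source): delegate fit/predict to the wrapped estimator.
def fit(self, X):
    self._model.fit(X)
    return self

def predict(self, X):
    return self._model.predict(X)  # +1 inlier, -1 outlier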
def generate_figures_and_xls(outdir, cols_starts, region2data, ext, xls,
                             group2pos, feature_names, samples):
    """Generate figures and tables"""
    all_freqs = []
    # concatenate all pos and samples into one dataframe
    dframes = []
    for ri, (ref, pos) in enumerate(region2data.keys()):
        mer, calls = region2data[(ref, pos)]
        for c, s in zip(calls, samples):
            df = pd.DataFrame(c, columns=feature_names)
            df["Strain"] = s
            df["chr_pos"] = "%s:%s" % (ref, pos)
            dframes.append(df)
    # read all tsv files
    df = pd.concat(dframes).dropna().reset_index()
    chr_pos, strains = df["chr_pos"].unique(), df["Strain"].unique()
    # compare individual methods
    for clf, method in (
            (iso_new.iForest(ntrees=100, random_state=0), "GMM+eIF"),
            (GaussianMixture(random_state=0, n_components=2), "GMM"),
            (AgglomerativeClustering(n_clusters=2), "AggClust"),
            (KMeans(n_clusters=2), "KMeans"),
            (OneClassSVM(), "OCSVM"),
            (IsolationForest(random_state=0), "IF"),
            (iso_new.iForest(ntrees=100, random_state=0), "eIF"),
            (KNeighborsClassifier(), "KNN"),
            (RandomForestClassifier(), "RF"),
    ):
        fname = method
        print(fname)
        outfn = os.path.join(outdir, "%s.%s" % (fname, ext))
        results = []
        for i, cols_start in enumerate(cols_starts, 1):
            # narrow down the features to only signal intensity & trace
            cols = list(filter(lambda n: n.startswith(cols_start), feature_names))
            # compare all samples to 0%
            s0 = samples[0]
            for s in samples[3:]:
                with np.errstate(under='ignore'):
                    if "+" in method:
                        clf2_name = method.split("+")[-1]
                        results += get_mod_freq_two_step(
                            df, cols, chr_pos, [s0, s], "_".join(cols_start),
                            OFFSET=0.5, clf2_name=clf2_name, clf2=clf)
                    elif method in ("KNN", "RF"):
                        results += get_mod_freq_clf_train_test(
                            df, cols, chr_pos, [s0, s], samples[1:3], clf,
                            "_".join(cols_start))
                    else:
                        results += get_mod_freq_clf(df, cols, chr_pos, [s0, s],
                                                    clf, "_".join(cols_start))
        # and store mod_freq predicted by various methods
        freqs = pd.DataFrame(results, columns=[
            "chr_pos", "features", "mod_freq wt", "mod_freq strain", "strain"])
        freqs["diff"] = freqs.max(axis=1) - freqs.min(axis=1)
        for name, pos in group2pos.items():
            freqs.loc[freqs["chr_pos"].isin(pos), "group"] = name
        # freqs.to_csv(outfn, sep="\t")
        freqs.to_excel(xls, fname, index=False)
        # plot differences between methods
        for group, pos in group2pos.items():
            freqs.loc[freqs["chr_pos"].isin(pos), "modification"] = group
        # g = sns.catplot(x="strain", y="diff", hue="features",
        #                 col="modification", data=freqs, kind="box")
        g = sns.catplot(x="strain", y="diff", hue="features",
                        col="modification", data=freqs, kind="point", ci=None)
        fig = g.fig
        fig.suptitle(method)
        for ax in fig.axes:
            ax.set_xlabel("Expected mod_freq")
            ax.set_ylabel("Observed mod_freq [absolute difference between wt & mt]")
            ax.set_ylim(0, 1)
        fig.savefig(outfn)
        plt.close()  # clear axis
        freqs["name"] = fname
        all_freqs.append(freqs)
    return all_freqs
def main(argv):
    config = read_parser(argv, Inputs, InputsOpt_Defaults)

    if config['mode'] == 'test':
        print('test')

    elif config['mode'] == 'learn_svm':
        # +++ Load data
        if config['path'] is None:
            root = Tk()
            root.withdraw()
            root.update()
            filepath = filedialog.askopenfilename()
            root.destroy()
            filename = os.path.basename(filepath)
        else:
            filepath = config['path']
            filename = os.path.basename(filepath)

        # +++ Construct features matrix and label vector
        myDF = pd.read_csv(filepath)
        mydict = myDF.to_dict(orient='list')
        n = len(mydict['Label'])
        y = np.zeros(n)
        y = [int(y[i]) if mydict['Label'][i] == 'Concrete' else 1
             for i in range(n)]
        X = []
        Features = ['Area_under_curve', 'Crest-Factor', 'Energy', 'Kurtosis',
                    'Peak_amplitude', 'RMS', 'Ring_down', 'Signal_strength',
                    'Skewnes', 'StDev', 'Variance']
        k = 0
        for key in mydict.keys():
            if key in Features:
                X.append(mydict[key])
                k += 1
        X = np.array(X)
        X = np.transpose(X)

        # +++ Train/Test Split
        if config['stratify'] == True:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=config['test_size'],
                random_state=config['rs'], stratify=y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=config['test_size'], random_state=config['rs'])

        # +++ Scaler
        if config['scaler'] == True:
            print('With standard scaler')
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        # +++ PCA
        if config['pca'] is not None:
            pca = PCA(n_components=config['pca'])
            pca.fit(X_train)
            print('PCA results: ', pca.explained_variance_ratio_)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        # +++ Hyperparameters for grid search
        Penalizations = [0.1, 1.0, 10.]
        Kernels = ['linear', 'rbf', 'poly']
        results = {'penal': [], 'kernel': [], 'accu_cv': [], 'accu_te': [],
                   'recall_cv': [], 'recall_te': [], 'preci_cv': [],
                   'preci_te': [], 'f1_cv': [], 'f1_te': []}
        count = 0
        for kernel_ in Kernels:
            for penal_ in Penalizations:
                print('+++++++Case = ', count)
                clf = SVC(kernel=kernel_, C=penal_, gamma='auto',
                          verbose=False, max_iter=100000,
                          random_state=config['rs'])
                scores = cross_validate(clf, X_train, y_train, cv=config['cv'],
                                        scoring=('accuracy', 'recall',
                                                 'precision', 'f1'))
                clf.fit(X_train, y_train)
                Pred_y_test = clf.predict(X_test)
                score_test_accu = accuracy_score(y_test, Pred_y_test)
                score_test_recall = recall_score(y_test, Pred_y_test)
                score_test_preci = precision_score(y_test, Pred_y_test)
                score_test_f1 = f1_score(y_test, Pred_y_test)
                results['penal'].append(penal_)
                results['kernel'].append(kernel_)
                results['accu_cv'].append(scores['test_accuracy'].mean())
                results['recall_cv'].append(scores['test_recall'].mean())
                results['preci_cv'].append(scores['test_precision'].mean())
                results['f1_cv'].append(scores['test_f1'].mean())
                results['accu_te'].append(score_test_accu)
                results['recall_te'].append(score_test_recall)
                results['preci_te'].append(score_test_preci)
                results['f1_te'].append(score_test_f1)
                count += 1

        # +++ Save results
        config['features'] = Features
        config['filename'] = filename
        name = datetime.now().strftime("%Y%m%d_%H%M%S")
        print(name)
        save_pickle('config_' + name + '.pkl', config)
        DataFr = pd.DataFrame(data=results, index=None)
        with pd.ExcelWriter('results_' + name + '.xlsx') as writer:
            DataFr.to_excel(writer, sheet_name='SVM_Learn')
        print('Result OK')

    elif config['mode'] == 'learn_oneclass':
        # +++ Load data
        if config['path'] is None:
            root = Tk()
            root.withdraw()
            root.update()
            filepath = filedialog.askopenfilename()
            root.destroy()
            filename = os.path.basename(filepath)
        else:
            filepath = config['path']
            filename = os.path.basename(filepath)

        # +++ Construct features matrix and label vector
        # OneClassSVM convention: inliers ('Concrete') are 1, outliers are -1
        myDF = pd.read_csv(filepath)
        mydict = myDF.to_dict(orient='list')
        n = len(mydict['Label'])
        y = np.ones(n)
        y = [int(y[i]) if mydict['Label'][i] == 'Concrete' else -1
             for i in range(n)]
        X = []
        Features = ['Area_under_curve', 'Crest-Factor', 'Energy', 'Kurtosis',
                    'Peak_amplitude', 'RMS', 'Ring_down', 'Signal_strength',
                    'Skewnes', 'StDev', 'Variance']
        k = 0
        for key in mydict.keys():
            if key in Features:
                X.append(mydict[key])
                k += 1
        X = np.array(X)
        X = np.transpose(X)

        # +++ Train/Test Split
        if config['stratify'] == True:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=config['test_size'],
                random_state=config['rs'], stratify=y)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=config['test_size'], random_state=config['rs'])

        # +++ Scaler
        if config['scaler'] == True:
            print('With standard scaler')
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        # +++ PCA
        if config['pca'] is not None:
            pca = PCA(n_components=config['pca'])
            pca.fit(X_train)
            print('PCA results: ', pca.explained_variance_ratio_)
            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        # +++ Hyperparameters for grid search
        Nus = [0.1, 0.5, 0.9]
        Kernels = ['linear', 'rbf', 'poly']
        results = {'nu': [], 'kernel': [], 'accu_cv': [], 'accu_te': [],
                   'recall_cv': [], 'recall_te': [], 'preci_cv': [],
                   'preci_te': [], 'f1_cv': [], 'f1_te': [], 'Baccu_cv': [],
                   'Baccu_te': [], 'Bpreci_cv': [], 'Bpreci_te': []}
        count = 0
        for kernel_ in Kernels:
            for nu_ in Nus:
                print('+++++++Case = ', count)
                clf = OneClassSVM(kernel=kernel_, nu=nu_, gamma='auto',
                                  verbose=False, max_iter=100000)
                scores = cross_validate(
                    clf, X_train, y_train, cv=config['cv'],
                    scoring=('accuracy', 'recall', 'precision', 'f1',
                             'balanced_accuracy', 'average_precision'))
                clf.fit(X_train, y_train)
                Pred_y_test = clf.predict(X_test)
                score_test_accu = accuracy_score(y_test, Pred_y_test)
                score_test_recall = recall_score(y_test, Pred_y_test)
                score_test_preci = precision_score(y_test, Pred_y_test)
                score_test_f1 = f1_score(y_test, Pred_y_test)
                score_test_Baccu = balanced_accuracy_score(y_test, Pred_y_test)
                score_test_Bpreci = average_precision_score(y_test, Pred_y_test)
                results['nu'].append(nu_)
                results['kernel'].append(kernel_)
                results['accu_cv'].append(scores['test_accuracy'].mean())
                results['recall_cv'].append(scores['test_recall'].mean())
                results['preci_cv'].append(scores['test_precision'].mean())
                results['f1_cv'].append(scores['test_f1'].mean())
                results['Baccu_cv'].append(scores['test_balanced_accuracy'].mean())
                results['Bpreci_cv'].append(scores['test_average_precision'].mean())
                results['accu_te'].append(score_test_accu)
                results['recall_te'].append(score_test_recall)
                results['preci_te'].append(score_test_preci)
                results['f1_te'].append(score_test_f1)
                results['Baccu_te'].append(score_test_Baccu)
                results['Bpreci_te'].append(score_test_Bpreci)
                count += 1

        # +++ Save results
        config['features'] = Features
        config['filename'] = filename
        name = datetime.now().strftime("%Y%m%d_%H%M%S")
        print(name)
        save_pickle('config_' + name + '.pkl', config)
        DataFr = pd.DataFrame(data=results, index=None)
        with pd.ExcelWriter('results_' + name + '.xlsx') as writer:
            DataFr.to_excel(writer, sheet_name='SVM_Learn')
        print('Result OK')

    else:
        print('error mode')
    return
score = 'neg_mean_absolute_error'
gscv = GridSearchCV(pipe, param_grid, cv=cv, scoring=score)
gscv.fit(X_train, y_train)
print_gscv_score(gscv)
y_pred = gscv.predict(X_train)
print('train data: ', end="")
print_score(y_train, y_pred)
# visualize
fig = yyplot(y_train, y_pred)

#%%
# Novelty detection by One-Class SVM with the optimized hyperparameters
clf = OneClassSVM(nu=0.003,
                  kernel=gscv.best_params_['model__kernel'],
                  gamma=gscv.best_params_['model__gamma'])
clf.fit(X_train)
y_pred = gscv.predict(X_test)      # predicted y
reliability = clf.predict(X_test)  # outliers = -1
data = []
output = 'test2.csv'
for i in range(len(X_test)):
    satom1 = periodic_table.get_el_sp(int(X_test[i][0]))
    satom2 = periodic_table.get_el_sp(int(X_test[i][1]))
    natom1 = int(X_test[i][2])
    natom2 = int(X_test[i][3])
    str_mat = str(satom1) + str(natom1) + str(satom2) + str(natom2)
    formula = Composition(str_mat).reduced_formula
def evaluate_authentication(df, data_type, representation_type, verbose=False,
                            roc_data=False, roc_data_filename=TEMP_NAME):
    print(df.shape)
    userids = create_userids(df)
    NUM_USERS = len(userids)
    auc_list = list()
    eer_list = list()
    global_positive_scores = list()
    global_negative_scores = list()
    for i in range(0, NUM_USERS):
        userid = userids[i]
        # Select data for training
        user_train_data = df.loc[df.iloc[:, -1].isin([userid])]
        user_train_data = user_train_data.drop(user_train_data.columns[-1], axis=1)
        user_array = user_train_data.values
        num_samples = user_array.shape[0]
        train_samples = int(num_samples * 0.66) + 1
        test_samples = num_samples - train_samples
        if verbose == True:
            print(str(userid) + ". #train_samples: " + str(train_samples) +
                  "\t#test_samples: " + str(test_samples))
        user_train = user_array[0:train_samples, :]
        user_test = user_array[train_samples:num_samples, :]
        other_users_data = df.loc[~df.iloc[:, -1].isin([userid])]
        other_users_data = other_users_data.drop(other_users_data.columns[-1], axis=1)
        other_users_array = other_users_data.values
        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)
        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(other_users_array)
        # Aggregating positive scores (note: this is an in-place sliding
        # average, so positive_scores is modified through the alias)
        y_pred_positive = positive_scores
        for j in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        # Aggregating negative scores
        y_pred_negative = negative_scores
        for j in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose == True:
            print(str(userid) + ", " + str(auc) + ", " + str(eer) + "\n")
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC mean : %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    print("#positives: " + str(len(global_positive_scores)))
    print("#negatives: " + str(len(global_negative_scores)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores,
                                                       global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES == True:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data == True:
        roc_dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(roc_dict)
        df.to_csv(roc_data_filename, index=False)
        words = roc_data_filename.split('/')
        auc_eer_data_filename = words[0] + '/auc_eer_' + words[1]
        auc_eer_dict = {'AUC': auc_list, 'EER': eer_list}
        df = pd.DataFrame(auc_eer_dict)
        df.to_csv(auc_eer_data_filename, index=False)
    print("Global AUC: " + str(global_auc))
    print("Global EER: " + str(global_eer))
    return auc_list, eer_list
def model_ocsvm(train_x, test_x):
    model = OneClassSVM(gamma='auto', kernel='linear')
    model.fit(train_x)
    pred = model.predict(test_x)
    return model, pred
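# Illustrative usage of the helper above (train_x/test_x assumed to be
# numeric arrays of shape [n_samples, n_features]):
# model, pred = model_ocsvm(train_x, test_x)
# pred follows the sklearn convention: +1 for inliers, -1 for outliers,
# so (pred == -1).mean() gives the fraction of test points flagged anomalous.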
def evaluate_authentication_cross_day(df1, df2, data_type, representation_type,
                                      verbose=False, roc_data=False,
                                      roc_data_filename=TEMP_NAME):
    print("Session 1 shape: " + str(df1.shape))
    print("Session 2 shape: " + str(df2.shape))
    userids = create_userids(df1)
    NUM_USERS = len(userids)
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for i in range(0, NUM_USERS):
        userid = userids[i]
        user_session1_data = df1.loc[df1.iloc[:, -1].isin([userid])]
        user_session2_data = df2.loc[df2.iloc[:, -1].isin([userid])]
        # training data: session 1 of the user
        user_session1_data = user_session1_data.drop(user_session1_data.columns[-1], axis=1)
        user_session1_array = user_session1_data.values
        # positive test data: session 2 of the same user
        user_session2_data = user_session2_data.drop(user_session2_data.columns[-1], axis=1)
        user_session2_array = user_session2_data.values
        # negative test data: session 2 of all other users
        other_users_session2_data = df2.loc[~df2.iloc[:, -1].isin([userid])]
        other_users_session2_data = other_users_session2_data.drop(
            other_users_session2_data.columns[-1], axis=1)
        other_users_session2_array = other_users_session2_data.values
        clf = OneClassSVM(gamma='scale')
        clf.fit(user_session1_array)
        positive_scores = clf.score_samples(user_session2_array)
        negative_scores = clf.score_samples(other_users_session2_array)
        # Aggregating positive scores (in-place sliding average)
        y_pred_positive = positive_scores
        for j in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        # Aggregating negative scores
        y_pred_negative = negative_scores
        for j in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)
        # auc, eer = compute_AUC_EER(positive_scores, negative_scores)
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose == True:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC mean : %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores,
                                                       global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES == True:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data == True:
        roc_dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(roc_dict)
        df.to_csv(roc_data_filename, index=False)
    print("Global AUC: " + str(global_auc))
    print("Global EER: " + str(global_eer))
    return auc_list, eer_list
def evaluate_authentication_skilledforgeries(df_genuine, df_forgery, data_type,
                                             representation_type, verbose=False,
                                             roc_data=False,
                                             roc_data_filename=TEMP_NAME):
    print("Genuine shape: " + str(df_genuine.shape))
    print("Forgery shape: " + str(df_forgery.shape))
    userids = create_userids(df_genuine)
    NUM_USERS = len(userids)
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for i in range(0, NUM_USERS):
        userid = userids[i]
        user_genuine_data = df_genuine.loc[df_genuine.iloc[:, -1].isin([userid])]
        user_forgery_data = df_forgery.loc[df_forgery.iloc[:, -1].isin([userid])]
        user_genuine_data = user_genuine_data.drop(user_genuine_data.columns[-1], axis=1)
        user_genuine_array = user_genuine_data.values
        num_samples = user_genuine_array.shape[0]
        train_samples = int(num_samples * 0.66)
        test_samples = num_samples - train_samples
        # MCYT
        # train_samples = 15
        # test_samples = 10
        user_genuine_train = user_genuine_array[0:train_samples, :]
        user_genuine_test = user_genuine_array[train_samples:num_samples, :]
        user_forgery_data = user_forgery_data.drop(user_forgery_data.columns[-1], axis=1)
        user_forgery_array = user_forgery_data.values
        clf = OneClassSVM(gamma='scale')
        clf.fit(user_genuine_train)
        positive_scores = clf.score_samples(user_genuine_test)
        negative_scores = clf.score_samples(user_forgery_array)
        auc, eer, _, _ = compute_AUC_EER(positive_scores, negative_scores)
        if SCORE_NORMALIZATION == True:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose == True:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)
    print('AUC mean : %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores,
                                                       global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES == True:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data == True:
        roc_dict = {'FPR': fpr, 'TPR': tpr}
        df = pd.DataFrame(roc_dict)
        df.to_csv(roc_data_filename, index=False)
    print("Global AUC: " + str(global_auc))
    print("Global EER: " + str(global_eer))
def outlierDetection(Xtrain, Xtest):
    outlierDetector = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.001)
    outlierDetector.fit(Xtrain)
    return outlierDetector.predict(Xtest)
    'n_jobs': 20,
    'trainfrac': 0.5,
    'sim_reps': 2,
    'score_thresh': 0.5
})

X, Y, reg_ind, anom_ind = dl.loaddata()
iforest = IsolationForest(n_estimators=params['n_estimators'],
                          max_samples=params['max_samples'],
                          max_features=params['max_features'],
                          n_jobs=params['n_jobs'],
                          behaviour='new',
                          contamination=0.001)
ocsvm = OneClassSVM(gamma='scale', nu=0.05)


def traintest(model, trainX, trainY, testX, testY):
    all_metrics = dict({'f1': [], 'precision': [], 'recall': [], 'mcc': []})
    model.fit(trainX)
    # use the 5th percentile of training scores as the anomaly threshold:
    # test points scoring at or below it are labeled anomalous
    train_scores = model.score_samples(trainX)
    thresh = np.percentile(train_scores, 5)
    scores = model.score_samples(testX)
    predY = np.where(scores <= thresh, ANOM_LAB, REG_LAB)
    prec, recall, f1, _ = precision_recall_fscore_support(
        testY, predY, average="binary", pos_label=ANOM_LAB)
    all_metrics['f1'] = f1
    all_metrics['precision'] = prec
# explicit imports (some of these may also be provided via `from SVM import *`)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import OneClassSVM

from data_parcing import data_parcing
from SVM import *

dp = data_parcing('dataset')
temp, hum, gas = dp.test_data_convert_format()
temp = np.array(temp)
data_temp_x = temp[:, 1].reshape(-1, 1)
data_temp_y = temp[:, 2]
# the two assignments above are overwritten by the dedicated split helper
data_temp_x, data_temp_y = data_split(temp)
train_temp_x, test_temp_x, train_temp_y, test_temp_y = train_test_split(
    data_temp_x, data_temp_y, test_size=0.3, random_state=123, shuffle=True)
model_temp = OneClassSVM(gamma='auto', kernel="linear")
model_temp.fit(train_temp_x)
pred_temp_y = model_temp.predict(test_temp_x)
# map OneClassSVM outliers (-1) to label 0 and count them
cnt = 0
print("len:", len(pred_temp_y))
for i in range(len(pred_temp_y)):
    if pred_temp_y[i] == -1:
        pred_temp_y[i] = 0
        cnt += 1
print("cnt:", cnt)
test_temp_y = np.array(list(map(int, test_temp_y)))
acc = accuracy_score(test_temp_y, pred_temp_y)
# test_temp_y = np.array(test_temp_y)
from os.path import dirname, join

from matplotlib.pyplot import plot, show, title
from numpy import array, genfromtxt
from sklearn.svm import OneClassSVM

if __name__ == "__main__":
    DATA_SET_1 = array(
        genfromtxt(join(dirname(__file__), "women.csv"), delimiter=","))
    NU_VAL = 1 / 25
    GAMMA_VAL = 1 / 3500000000
    SVM_MODEL = OneClassSVM(nu=NU_VAL, gamma=GAMMA_VAL)
    SVM_MODEL.fit(DATA_SET_1)
    DATA_SET_1_PRED = SVM_MODEL.predict(DATA_SET_1)
    # split points by predicted label: +1 inliers, -1 outliers
    NORMAL = DATA_SET_1[DATA_SET_1_PRED == 1]
    ABNORMAL = DATA_SET_1[DATA_SET_1_PRED == -1]
    plot(NORMAL[:, 0], NORMAL[:, 1], "bx")
    plot(ABNORMAL[:, 0], ABNORMAL[:, 1], "ro")
    title("gamma = " + str(SVM_MODEL.gamma) + ", nu = " + str(SVM_MODEL.nu))
    show()
LightGBM = LGBMClassifier(n_estimators=115, num_leaves=65, max_depth=15,
                          min_child_samples=40, learning_rate=0.1,
                          boosting_type='gbdt', objective='binary',
                          random_state=42, n_jobs=-1, silent=True)
Naive_Bayes = GaussianNB(var_smoothing=1e0)
One_Class_SVM = OneClassSVM(kernel='rbf', degree=3, gamma='scale', coef0=0.0,
                            tol=0.001, nu=0.05, shrinking=True, cache_size=200,
                            verbose=False, max_iter=-1)
Isolation_Forest = IsolationForest(n_estimators=100, max_samples='auto',
                                   contamination='auto', max_features=1.0,
                                   bootstrap=True, n_jobs=-1, random_state=42)
Auto_Enc_LogReg = Log_Reg
Auto_Enc_LightGBM = LightGBM
score_met = 'average_precision'
def training(self):
    self.clf = OneClassSVM(kernel='rbf', gamma=26)
    self.clf.fit(self.train)
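# Note (illustrative, not from the original source): gamma=26 makes the RBF
# kernel highly localized, so the learned boundary hugs the training points
# and unseen samples tend to be flagged as outliers. A quick sanity check is
# the inlier fraction on held-out data:
# preds = self.clf.predict(validation_data)
# print('inlier fraction:', (preds == 1).mean())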
plt.figure(figsize=(12, 10))  # set the figure size to 12 by 10
p = sns.heatmap(data.corr(), annot=True, cmap='RdYlGn')
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

####### standard scale ########
sc = StandardScaler()
X = sc.fit_transform(X)

######### Outlier ##########
outliers_fraction = 0.01
outlier_model = OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.01)
outlier_model.fit(X)
out = outlier_model.predict(X)
df = pd.DataFrame({'out_prediction': out})
df = df[df['out_prediction'] == 1]
b = set(df.index.values.tolist())
# keep only the rows predicted as inliers (+1)
a = []
for i in range(0, len(X)):
    if i in b:
        a.append(X[i])
X = np.array(a)
c = []
def base_experiment(config, ntrials=1, seed=123456789):
    """
    Run a single experiment, locally.

    @param config: The configuration parameters to use for the SP.

    @param ntrials: The number of times to repeat the experiment.

    @param seed: The random seed to use.

    @return: A tuple containing the percentage errors for the SP's training
    and testing results and the SVM's training and testing results,
    respectively.
    """
    # Base parameters
    ntrain, ntest = 800, 200
    clf_th = 0.5

    # Seed numpy
    np.random.seed(seed)

    # Get the data
    (tr_x, tr_y), (te_x, te_y) = load_mnist()
    tr_x_0 = np.random.permutation(tr_x[tr_y == 0])
    x_tr = tr_x_0[:ntrain]
    x_te = tr_x_0[ntrain:ntrain + ntest]
    outliers = [np.random.permutation(tr_x[tr_y == i])[:ntest]
                for i in xrange(1, 10)]

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te, o_y_te, c_y_te = [], [], []
    for outlier in outliers:
        u_y_te.append(metrics.compute_uniqueness(outlier))
        o_y_te.append(metrics.compute_overlap(outlier))
        c_y_te.append(1 - metrics.compute_distance(outlier))

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = [np.zeros(ntrials) for _ in xrange(9)]
    svm_x_results = np.zeros(ntrials)
    svm_y_results = [np.zeros(ntrials) for _ in xrange(9)]

    # Iterate across the trials
    for nt in xrange(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = [sp.predict(outlier) for outlier in outliers]

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te, o_sp_y_te, c_sp_y_te = [], [], []
        for y in sp_y_te:
            u_sp_y_te.append(metrics.compute_uniqueness(y))
            o_sp_y_te.append(metrics.compute_overlap(y))
            c_sp_y_te.append(1 - metrics.compute_distance(y))

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        for i, (a, b, c, d, e, f) in enumerate(
                zip(u_y_te, o_y_te, c_y_te, u_sp_y_te, o_sp_y_te, c_sp_y_te), 1):
            sp._log_stats('Input Novelty Class {0} Uniqueness'.format(i), a)
            sp._log_stats('Input Novelty Class {0} Overlap'.format(i), b)
            sp._log_stats('Input Novelty Class {0} Correlation'.format(i), c)
            sp._log_stats('SP Novelty Class {0} Uniqueness'.format(i), d)
            sp._log_stats('SP Novelty Class {0} Overlap'.format(i), e)
            sp._log_stats('SP Novelty Class {0} Correlation'.format(i), f)

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp, o_sp, c_sp = np.zeros(9), np.zeros(9), np.zeros(9)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                u_sp[j] += metrics.compute_uniqueness(yt)
                o_sp[j] += metrics.compute_overlap(yt)
                c_sp[j] += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        for i in xrange(9):
            u_sp[i] /= ntest
            o_sp[i] /= ntest
            c_sp[i] /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
        for i, j in enumerate(xrange(1, 10)):
            sp._log_stats('Base Train to Novelty {0} Uniqueness'.format(j), u_sp[i])
            sp._log_stats('Base Train to Novelty {0} Overlap'.format(j), o_sp[i])
            sp._log_stats('Base Train to Novelty {0} Correlation'.format(j), c_sp[i])

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = np.array([
            len(np.where(clf.predict(outlier) == -1)[0]) / float(ntest) * 100
            for outlier in outliers])

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = np.zeros(9)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            xo = metrics.compute_overlap(xt)
            if xo >= clf_th:
                clf_x_te += 1
            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                yo = metrics.compute_overlap(yt)
                if yo < clf_th:
                    clf_y_te[j] += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[nt] = 100 - clf_x_te
        sp_y_results[nt] = 100 - clf_y_te
        svm_x_results[nt] = 100 - svm_x_te
        svm_y_results[nt] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        for i, j in enumerate(xrange(1, 10)):
            sp._log_stats('SP % Correct Novelty Class {0}'.format(j), clf_y_te[i])
            sp._log_stats('SVM % Correct Novelty Class {0}'.format(j), svm_y_te[i])
        sp._log_stats('SP % Mean Correct Novelty Class', np.mean(clf_y_te))
        sp._log_stats('SVM % Mean Correct Novelty Class', np.mean(svm_y_te))
        sp._log_stats('SP % Adjusted Score', (np.mean(clf_y_te) * clf_x_te) / 100)
        sp._log_stats('SVM % Adjusted Score', (np.mean(svm_y_te) * svm_x_te) / 100)

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
def test_one_class_svm(self):
    model = OneClassSVM()
    dump_one_class_classification(model, folder=self.folder)
def run(data_class, out_class=[], printer=Printer()):
    has_out = np.any(out_class)
    start = time.time()
    print('Target data shape: (%d,%d)' % (data_class.shape[0], data_class.shape[1]))
    X = np.delete(data_class, -1, axis=1)
    y = data_class[:, -1]
    if has_out:
        print('Has out class: Yes')
        print('Out data shape: (%d,%d)' % (out_class.shape[0], out_class.shape[1]))
        X_out = np.delete(out_class, -1, axis=1)
        y_out = out_class[:, -1]
    print(data_class.shape, X.shape, y.shape)
    clf = OneClassSVM(gamma='scale', nu=0.01)
    kf = KFold(n_splits=5)
    kf.get_n_splits(X)
    param_dist = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'nu': stats.uniform(.0, .05),
        'shrinking': [True, False]
    }
    n_inter = 20
    clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                             n_iter=n_inter, cv=5, scoring="accuracy")
    f1_scores = []
    precision_scores = []
    recall_scores = []
    accuracy_scores = []
    run_time_start = time.time()
    print(kf)
    for train_index, test_index in kf.split(X):
        round_time_start = time.time()
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        if has_out:
            X_test = np.concatenate((X_test, X_out))
            y_test = np.concatenate((y_test, y_out))
        clf = clf.fit(X_train, y_train)
        y_pred_test = clf.predict(X_test)
        round_time_end = time.time()
        n_error_test = (y_pred_test != y_test).sum()
        f1_test_score = f1_score(y_test, y_pred_test, pos_label=-1)
        precision_test_score = precision_score(y_test, y_pred_test)
        recall_test_score = recall_score(y_test, y_pred_test)
        accuracy_test_score = accuracy_score(y_test, y_pred_test)
        printer.print_write("\n=============ITERATION SCORES=============\n")
        printer.print_write(tabulate([
            ['Metric', 'Value'],
            ['Test error:', '{:d}'.format(n_error_test)],
            ['Test F1 Score:', '%.3f' % f1_test_score],
            ['Test Precision Score:', '%.3f' % precision_test_score],
            ['Test Recall Score:', '%.3f' % recall_test_score],
            ['Test Accuracy Score:', '%.3f' % accuracy_test_score],
            ['Iteration time:', '%.2f seconds' % (round_time_end - round_time_start)],
        ], headers="firstrow"))
        f1_scores.append(f1_test_score)
        precision_scores.append(precision_test_score)
        recall_scores.append(recall_test_score)
        accuracy_scores.append(accuracy_test_score)
    run_time_end = time.time()
    f1_scores = np.array(f1_scores)
    precision_scores = np.array(precision_scores)
    recall_scores = np.array(recall_scores)
    accuracy_scores = np.array(accuracy_scores)  # was missing; .sum()/.size below need an array
    printer.print_write("\n=============FINAL SCORES=============\n")
    printer.print_write("F1 Score Final: %f" % (f1_scores.sum() / f1_scores.size))
    printer.print_write("Precision Score Final: %f" %
                        (precision_scores.sum() / precision_scores.size))
    printer.print_write("Recall Score Final: %f" %
                        (recall_scores.sum() / recall_scores.size))
    printer.print_write("Accuracy Score Final: %f" %
                        (accuracy_scores.sum() / accuracy_scores.size))
    printer.print_write("Final time: %.2f seconds" % (run_time_end - run_time_start))
    end = time.time()
    printer.print_write("\n=============TIME=============\n")
    printer.print_write("It took: %.2f seconds" % (end - start))
df = pd.read_csv('data/datalab_persona_cont.csv')
# non-smokers are treated as outliers, smokers as the normal class
X_outliers = df[df['FKSmoker'] == 0]
X_outliers = X_outliers.drop(['FKSmoker'], axis=1)
X = df[df['FKSmoker'] == 1]
X = X.drop(['FKSmoker'], axis=1)
X_train = X.sample(frac=0.9)
X_test = X.drop(X_train.index)

# fit the model
clf = OneClassSVM(gamma=0.3, kernel='rbf')
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

outliers = pd.Series(y_pred_test, name='verdict')
print(len(outliers[outliers == 1]))   # smoker test rows predicted as inliers
print(len(y_pred_test))
outliers = pd.Series(y_pred_outliers, name='verdict')
print(len(outliers[outliers == -1]))  # non-smokers correctly flagged as outliers
print(len(y_pred_outliers))
def test_explain_one_class_svm_unsupported():
    X = np.array([[0, 0], [0, 1], [5, 3], [93, 94], [90, 91]])
    clf = OneClassSVM().fit(X)
    expl = explain_weights(clf)
    assert 'supported' in expl.error
print("Detailed classification report:") print() print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, gscv.predict(X_test) print(classification_report(y_true, y_pred)) print(confusion_matrix(y_test, y_pred)) print() #%% # Novelty detection by One Class SVM with optimized hyperparameter clf = OneClassSVM(nu=0.10, kernel=gscv.best_params_['kernel'], gamma=gscv.best_params_['gamma']) clf.fit(X_train) y_pred = gscv.predict(X_test) # prediction reliability = clf.predict(X_test) # outliers = -1 results = np.c_[y_pred, y_test, reliability] #print('y_predicted, y_true, outliers = -1') #print(y_tot) #print() #%% df = pd.DataFrame(results, columns=list('ABC')) df_in_ = df[df.C == 1] df_out = df[df.C == -1] print('Inlier sample, number of good/bad predictions: {} {}'.format(
from sklearn import preprocessing
from sklearn.svm import OneClassSVM

from src.filereader.FileReader import FileReader

FILE_PATH = "../../resource/PostureEntry.csv"
N_VALIDATIONS = 4

samples, labels, label_names = FileReader.read(FILE_PATH)
samples = preprocessing.scale(samples)
labels = labels.ravel()
# even-numbered labels are the "normal" postures used for training
trainsamples = samples[(labels == 0) | (labels == 2) | (labels == 4), :]
testsamples = samples[(labels == 1) | (labels == 3) | (labels == 5), :]
labels[(labels == 0) | (labels == 2) | (labels == 4)] = 0
labels[(labels != 0)] = 1

clf = OneClassSVM()
clf.fit(trainsamples)
y_train = clf.predict(trainsamples)
y_test = clf.predict(testsamples)
error_train = y_train[y_train == -1].size / y_train.size
error_test = y_test[y_test == -1].size / y_test.size
print("Train error " + str(error_train))
print("Test error " + str(error_test))
data_train = pickle.load(concating_data_train)
concating_data_train.close()
concating_data_test = open(
    "/home/alperen/Desktop/Thesis_Application/data_test", 'rb')
data_test = pickle.load(concating_data_test)
concating_data_test.close()
pickle_test_label = open(
    "/home/alperen/Desktop/Thesis_Application/test_label", 'rb')
test_label = pickle.load(pickle_test_label)
pickle_test_label.close()

# train and predict phase
o_svm = OneClassSVM()  # kernel='rbf', gamma=0.001, nu=0.01
o_svm.fit(data_train)
anomaly_detect = o_svm.predict(data_test)
pickle_predicted_data = open(
    "/home/alperen/Desktop/Thesis_Application/predicted_data", 'wb')
pickle.dump(anomaly_detect, pickle_predicted_data)
pickle_predicted_data.close()
# pickle_predicted_data = open("/home/alperen/Desktop/Thesis_Application/predicted_data", 'rb')
# anomaly_detect = pickle.load(pickle_predicted_data)
# pickle_predicted_data.close()

# count predicted inliers (+1) and anomalies (-1)
unique, counts = np.unique(anomaly_detect, return_counts=True)
print(np.asarray((unique, counts)).T)
def oneclass_svm(dataset, kernel, nu):
    svm = OneClassSVM(kernel=kernel, nu=nu).fit(dataset)
    return svm
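# Illustrative usage (not from the original source): nu upper-bounds the
# fraction of training errors and lower-bounds the fraction of support
# vectors, so it acts as a rough estimate of the contamination rate.
# svm = oneclass_svm(X_train, kernel='rbf', nu=0.05)
# scores = svm.decision_function(X_train)  # negative values indicate outliers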
"true").replace(False, "false") store_csv(pandas.concat([decisionFunction, outlier], axis=1), name + ".csv") if "Housing" in datasets: build_iforest_housing(IsolationForest(random_state=13), "IsolationForestHousing") def build_ocsvm_housing(svm, name): mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())]) pipeline = Pipeline([("mapper", mapper), ("scaler", MaxAbsScaler()), ("estimator", svm)]) pipeline.fit(housing_X) pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values) store_pkl(pipeline, name + ".pkl") decisionFunction = DataFrame(pipeline.decision_function(housing_X), columns=["decisionFunction"]) outlier = DataFrame(pipeline.predict(housing_X) <= 0, columns=["outlier" ]).replace(True, "true").replace(False, "false") store_csv(pandas.concat([decisionFunction, outlier], axis=1), name + ".csv") if "Housing" in datasets: build_ocsvm_housing(OneClassSVM(nu=0.10, random_state=13), "OneClassSVMHousing")