class svm_model():
    def train(self, X, ker):
        # note: older scikit-learn releases also accepted random_state here;
        # recent versions removed it because the solver is deterministic
        self.model = OneClassSVM(kernel=ker, shrinking=True)
        self.model.fit(X)

    def predict(self, X):
        return self.model.predict(X)
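A minimal usage sketch for the wrapper above, assuming only NumPy and scikit-learn; the training data is synthetic and purely illustrative.

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 5))    # synthetic inlier data

model = svm_model()
model.train(X, 'rbf')            # fit a one-class SVM with an RBF kernel
print(model.predict(X[:10]))     # +1 for predicted inliers, -1 for outliers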
def main():
    n = 1000
    data = []
    for i in range(n):
        data.append(np.array([np.random.randint(0, 5000)
                              for i in range(np.random.randint(20, 150))]))
    # rows have different lengths, so an explicit object dtype is required
    data = np.array(data, dtype=object)

    # making all the data into 5 dimensions
    # howto : boxplot
    x = []
    y = []
    for i in data:
        sorted_i = sorted(i)
        x.append([max(sorted_i), np.percentile(sorted_i, 75),
                  np.median(sorted_i), np.percentile(sorted_i, 25),
                  min(sorted_i)])
        y.append(0)
    x = np.array(x)

    '''
    # making all the data into 5 dimensions
    # howto : distance
    start = time.time()
    data_i = 0
    cnt = 1
    x = np.zeros((n, n))
    for i in data:
        data_j = data_i
        for j in data[cnt:]:
            dist = dtw(i, j, dist=lambda i, j: norm(i - j, ord=1))[0]
            x[data_i][data_j+1], x[data_j+1][data_i] = dist, dist
            data_j += 1
        cnt += 1
        data_i += 1
    end = time.time()
    print(end - start)
    '''

    # build model with x
    model = OneClassSVM()
    model.fit(x)

    # create test dataset
    test = []
    for i in range(10):
        test.append(np.array([np.random.randint(0, 10000)
                              for i in range(np.random.randint(20000, 30000))]))
    test = np.array(test, dtype=object)

    # transform test dataset
    x = []
    y = []
    for i in test:
        sorted_i = sorted(i)
        x.append([max(sorted_i), np.percentile(sorted_i, 75),
                  np.median(sorted_i), np.percentile(sorted_i, 25),
                  min(sorted_i)])
        y.append(0)
    x = np.array(x)

    # predict test dataset
    pred = model.predict(x)
class Cluster(object):
    def __init__(self, name):
        self.name = name
        self.raw_dataset = []
        self.dataset = []
        self.dataset_red = []

    def get_featurevec(self, data):
        '''Takes in data in the form of an array of EmoPackets, and outputs
           a list of feature vectors.'''
        # CHECKED, all good :)
        num_bins = int(len(data) / int(dsp.SAMPLE_RATE * dsp.STAGGER)
                       - int(dsp.BIN_SIZE / dsp.STAGGER) + 1)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        starts = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        points = []
        for i in range(num_bins):
            points.append(dsp.get_features(data[i*starts:i*starts+size]))
        return points

    def add_data(self, raw):
        '''Allows the addition of new data. Will retrain upon addition.
           Expects a list of EmoPackets.'''
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        '''Does feature extraction for all of the datasets.'''
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.dataset)
        # RandomizedPCA is deprecated; recent scikit-learn spells this
        # PCA(n_components=NDIM, svd_solver='randomized')
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)

    def train(self):
        '''Trains the classifier.'''
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        '''Says whether or not the bin is novel.
           Expects an array of EmoPackets.'''
        X = self.pca.transform(np.array(self.get_featurevec(pt)[0]))
        ans = self.svm.predict(X)
        # dataset_red is an ndarray after transform, so stack instead of append
        self.dataset_red = np.vstack((self.dataset_red, X))
        self.train()
        return ans

    def save(self):
        '''Saves this classifier to a data directory.'''
        this_dir, this_filename = os.path.split(__file__)
        DATA_PATH = os.path.join(this_dir, "data", self.name + '.pkl')
        dumpfile = open(DATA_PATH, "wb")
        pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
        dumpfile.close()
def select_best_support_vectors(data, nu=0.01, all_gammas=2 ** np.arange(-10, 10, 1)):
    all_errors = []
    for gamma in all_gammas:
        clf = OneClassSVM(nu=nu, gamma=gamma)
        clf.fit(data)
        prediction = clf.predict(data)
        out_of_class_count = np.sum(prediction == -1)
        support_vectors_count = len(clf.support_vectors_)
        error = (float(out_of_class_count) / len(data) - nu) ** 2
        error += (float(support_vectors_count) / len(data) - nu) ** 2
        all_errors.append(error)
    index = np.argmin(all_errors)
    return all_gammas[index], all_errors
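A quick sketch of how the selection heuristic above might be driven; the Gaussian blob data and the nu value are illustrative only.

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(42)
data = rng.normal(loc=0.0, scale=1.0, size=(200, 2))  # synthetic inliers

best_gamma, errors = select_best_support_vectors(data, nu=0.05)
print("best gamma:", best_gamma)

# refit with the selected gamma and inspect the flagged fraction
clf = OneClassSVM(nu=0.05, gamma=best_gamma).fit(data)
print("outlier fraction:", np.mean(clf.predict(data) == -1))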
class NoveltySeparator(BaseEstimator):

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # let's treat users spending something in the rest of the month as outliers
        inliers = y - X[:, 0]
        inliers = np.where(inliers < 0.1, True, False)
        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])

        # predicted
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        self.est_outliers.fit(X[outliers], y[outliers])

    def predict(self, X):
        y = np.zeros(X.shape[0])

        labels = self.detector.predict(X).reshape(X.shape[0])
        inliers = labels == 1
        outliers = labels == -1

        y[inliers] = self.est_inliers.predict(X[inliers])
        y[outliers] = self.est_outliers.predict(X[outliers])

        return y
def slice_probability_space_selection(data, nu=0.05,
                                      all_gammas=2 ** np.linspace(-10, 10, 50),
                                      rho=0.05,
                                      outlier_distribution=np.random.rand,
                                      folds_count=7):
    kf_iterator = KFold(len(data), n_folds=folds_count)
    all_errors = []
    for gamma in all_gammas:
        error = 0.0
        for train, test in kf_iterator:
            train_data = data[train, :]
            test_data = data[test, :]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            inlier_metric_part = np.mean(prediction == -1)
            inlier_metric_part = inlier_metric_part / (1 + rho) / len(data)
            outliers = outlier_distribution(*data.shape) - 0.5
            outliers *= 8 * np.std(data)
            outlier_metric_part = np.mean(clf.predict(outliers) == 1) * rho / (1 + rho) / len(outliers)
            error += inlier_metric_part + outlier_metric_part
        all_errors.append(error / folds_count)
    index = np.argmin(all_errors)
    #best_index = pd.Series(all_errors).pct_change().argmax() - 1
    return int(index), all_errors
def outlier_detect(data_frame):
    # pandas to numpy - digestible by scikit
    columns = ['blm_tag_count', 'protest_count', 'justice_count',
               'riot_count', 'breathe_count']
    features = data_frame[list(columns)].values
    clf = OneClassSVM(nu=0.008, gamma=0.05)
    clf.fit(features)
    y_pred = clf.predict(features)
    mask = y_pred == -1  # boolean mask over the rows flagged as outliers
    oak_array = np.asarray(data_frame.hourly)
    protest_predict = oak_array[mask]
    protest_hours = list(protest_predict)
    return protest_hours
def svm(data, fraction=0.05, kernel='poly', degree=3, gamma=0, coeff=0):
    svm = OneClassSVM(kernel=kernel, degree=degree, gamma=gamma,
                      nu=fraction, coef0=coeff)
    svm.fit(data)
    score = svm.predict(data)
    numeration = np.array([[i] for i in range(1, len(data) + 1)])
    # keep only the 1-based row numbers the SVM scored as outliers (-1)
    anomalies = numeration[score == -1]
    return anomalies
def select_best_outlier_fraction_cross_val(data, nu=0.05,
                                           all_gammas=2 ** np.linspace(-10, 10, 50),
                                           folds_count=7):
    # 50 log-spaced gamma candidates between 2**-10 and 2**10
    all_errors = []
    kf_iterator = KFold(len(data), n_folds=folds_count)
    for gamma in all_gammas:
        error = 0
        for train, test in kf_iterator:
            train_data = data[train, :]
            test_data = data[test, :]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            outlier_fraction = np.mean(prediction == -1)
            error += (nu - outlier_fraction) ** 2 + \
                     (float(clf.support_vectors_.shape[0]) / len(data) - nu) ** 2
        all_errors.append(error / folds_count)
    best_index = np.argmin(all_errors)
    return int(best_index), all_errors
class OneClassSVMDetector(BaseOutlier):
    @staticmethod
    def get_attributes():
        return {
            "nu": 0.1,
            "kernel": ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
            "gamma": 0.1,
        }

    def __init__(self, nu=0.1, kernel='rbf', gamma=0.1):
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma

    def fit(self, data=None):
        self.data = data
        self.check_finite(data)
        if self._is_using_pandas(data):
            self.data.interpolate(inplace=True)
        self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel, gamma=self.gamma)
        self.clf.fit(data.reshape(-1, 1))
        return self

    def predict(self, X_test):
        y_pred_train = self.clf.predict(X_test.reshape(-1, 1))
        outlier_idx = np.where(y_pred_train == -1)
        inlier_idx = np.where(y_pred_train == 1)
        d = {
            'timestamp': self.data.index[outlier_idx],
            'anoms': self.data.iloc[outlier_idx]
        }
        anoms = pd.DataFrame(d)
        self.anomaly_idx = anoms.index
        self.anom_val = anoms['anoms']
        return anoms

    def fit_predict(self, data=None):
        self.fit(data)
        return self.predict(data)

    def plot(self):
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(1, 1)
        ax.plot(self.data, 'b')
        ax.plot(self.anomaly_idx, self.anom_val, 'ro')
        ax.set_title('Detected Anomalies')
        ax.set_ylabel('Count')
        f.tight_layout()
        return f
def cross_validate():
    #for tinkering with the model

    #read data
    all_df = pd.read_csv('./data/train.csv', index_col='ID')

    #split data
    zeros_df = all_df[all_df.TARGET == 0]
    ones_df = all_df[all_df.TARGET == 1]
    num_ones = ones_df.shape[0]
    msk = np.random.permutation(len(zeros_df)) < num_ones
    zeros_train_df = zeros_df[~msk]
    zeros_test_df = zeros_df[msk]
    ones_test_df = ones_df
    train_df = zeros_train_df
    test_df = pd.concat([zeros_test_df, ones_test_df])

    train_X = np.array(train_df.drop('TARGET', axis=1))
    train_Y = np.array(train_df.TARGET)
    test_X = np.array(test_df.drop('TARGET', axis=1))
    test_Y = np.array(test_df.TARGET)  #true target values

    #init svm
    print('training svm')
    my_svm = OneClassSVM(verbose=True)
    my_svm.fit(train_X)

    #predict
    print('predicting')
    predictions = my_svm.predict(test_X)
    conf_matrix = confusion_matrix(test_Y, predictions)
    print('confusion matrix:')
    print(pd.DataFrame(conf_matrix, columns=[0, 1]))
    print('accuracy:')
    print(sum(test_Y.reshape(predictions.shape) == predictions) / len(test_Y))
def predict_header_features(self, pkt_featurizer):
    group_id = pkt_featurizer.pkt_type
    features = pkt_featurizer.features
    arrival_time = pkt_featurizer.arrival_time
    try:
        vectorizer = DictVectorizer()
        vectorizer.fit(self.training_data[group_id])
        training_data_vectorized = vectorizer.transform(self.training_data[group_id])
        features_vectorized = vectorizer.transform(features)
        scaler = preprocessing.StandardScaler(with_mean=False)
        training_data_vectorized = scaler.fit_transform(training_data_vectorized)
        features_vectorized = scaler.transform(features_vectorized)
        classifier = OneClassSVM()
        classifier.fit(training_data_vectorized)
        result = classifier.predict(features_vectorized)
        distance = classifier.decision_function(features_vectorized)
    except KeyError:
        result = 0
        distance = 0
    return result, distance
class TwoStage(object):
    def __init__(self, *args, **kwargs):
        super(TwoStage, self).__init__(*args, **kwargs)
        self._oneCls = OneClassSVM(nu=NU, gamma=GAMMA)
        self._clf = RandomForestClassifier(n_estimators=30)
        self._scaler = StandardScaler()

    def fit(self, data, labels):
        sdata = self._scaler.fit_transform(data)
        self._oneCls.fit(sdata)
        self._clf.fit(sdata, labels)
        return self

    def predict(self, data):
        sdata = self._scaler.transform(data)
        is_known_cls = self._oneCls.predict(sdata)
        cls = self._clf.predict(sdata)
        cls[is_known_cls == -1] = "zother"
        classes = list(self._clf.classes_) + ["zother"]
        return cls, classes
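A usage sketch for the two-stage classifier above. NU and GAMMA are module-level constants in the original, so illustrative values are supplied here, and the iris data stands in for real features.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

NU, GAMMA = 0.1, 0.1  # hypothetical values for the module constants

X, y = load_iris(return_X_y=True)
labels = np.array([f"class_{v}" for v in y])  # string labels so "zother" fits

model = TwoStage().fit(X, labels)
pred, classes = model.predict(X)
print(classes)                    # known classes plus the "zother" bucket
print(np.mean(pred == "zother"))  # fraction routed to the unknown bucket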
def predict_pkt_length_features(self, pkt_featurizer):
    group_id = pkt_featurizer.pkt_type
    try:
        dbscan = DBSCAN()
        pkt_lengths = np.array(list(self.pkt_lengths[group_id]) +
                               [pkt_featurizer.len_bytes]).reshape(-1, 1)
        labels = dbscan.fit_predict(pkt_lengths)
        dbscan_prediction = labels[-1] == -1
        if self.plot:
            self.plot_1d_dbscan(pkt_lengths, labels, range(len(pkt_lengths)),
                                self.pkt_lengths_fig_dbscan, "", "Pkt Length",
                                "Pkt Length DBSCAN Clustering - Anomalous Pkts in Black")
        one_class_svm = OneClassSVM()
        scaler = preprocessing.StandardScaler()
        pkt_lengths_scaled = scaler.fit_transform(np.array(self.pkt_lengths[group_id]).reshape(-1, 1))
        features_scaled = scaler.transform(np.array(pkt_featurizer.len_bytes).reshape(1, -1))
        one_class_svm.fit(pkt_lengths_scaled)
        svm_prediction = one_class_svm.predict(features_scaled)
        if self.plot and len(pkt_lengths_scaled) > 2:
            self.plot_1d_svm(self.pkt_lengths[group_id], one_class_svm,
                             range(len(self.pkt_lengths[group_id])), scaler,
                             self.pkt_lengths_fig_svm, "Pkt", "Pkt Length",
                             "Pkt Length One Class SVM Classification")
    except (KeyError, IndexError) as e:
        print(e)
        dbscan_prediction = 0
    return dbscan_prediction
if __name__ == '__main__':

    ############### OUTLIER DETECTION ###############
    if (outlier_detection):
        # humans_data = load_data_from_csv("D:/Kaggle/HumanVRobot/train_humans_ef_38f.csv", train = True)
        humans_data = load_data_from_csv("D:/Kaggle/HumanVRobot/train_humans_ef_21f_selrlr.csv", train=True)

        # Discard category information because it is a sparse matrix and only consider top 28 features
        bidder_ids, features = extract_features_for_anomaly_det(humans_data)

        # clf = OneClassSVM(nu = 0.0025, gamma = 0.0001)
        clf = OneClassSVM(nu=0.0005, gamma=0.0033)
        clf.fit(features)
        # clf.decision_function(features)
        pred = np.array(clf.predict(features))

        num_outliers = 0
        outlier_idx = []
        anomaly_bidders = []
        if (manual_handcode == False):
            for i, p in enumerate(pred):
                if (p == -1):
                    num_outliers += 1
                    outlier_idx.append([i])
                    anomaly_bidders.append(bidder_ids[i])
                    # print (" i = ", i, features[i, :])
        else:
            print("WARNING: Handcoding anomaly indices!")
            outlier_idx = [1079, 1807, 184, 564, 1228, 1497]  # These look bot-ish by manual inspection
            for idx in outlier_idx:
                anomaly_bidders.append(bidder_ids[idx])
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X1_train)
X1_train_n = scaler.transform(X1_train)
X1_test_n = scaler.transform(X1_test)
X0_outliers_n = scaler.transform(X0)

clf = OneClassSVM(gamma='auto', nu=0.1)
clf.fit(X1_train_n)
Y1_pred_train = clf.predict(X1_train_n)
Y1_pred_test = clf.predict(X1_test_n)
Y0_pred_outliers = clf.predict(X0_outliers_n)

# EVALUATION

# TRAIN SET
# confusion matrix
confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
from sklearn.svm import OneClassSVM

if __name__ == '__main__':
    dataset_pos = data.load_pos_eviction()
    dataset_neg = data.load_neg_eviction()
    dataset_all = data.load_eviction()

    # nu: The proportion of outliers we expect in our data.
    model_pos = OneClassSVM(kernel='linear', nu=0.9)
    model_pos.fit(dataset_pos.X_train)
    model_neg = OneClassSVM(kernel='linear', nu=0.1)
    model_neg.fit(dataset_neg.X_train)

    predictions_pos = model_pos.predict(dataset_all.X_train)
    predictions_neg = model_neg.predict(dataset_all.X_train)

    # +1 is inlier, -1 is outlier. We want those who are evicted to be +1
    # and those who are not evicted to be 0.
    # Outliers, those evicted, to be 1.
    predictions_neg = (predictions_neg == -1).astype(int)
    # Inliers, those evicted, to be 1.
    predictions_pos = (predictions_pos == 1).astype(int)

    # Print results and mean squared error.
    utils.evaluate(dataset_all.y_train, predictions_pos,
                   model_pos.__class__.__name__)
    utils.evaluate(dataset_all.y_train, predictions_neg,
                   model_neg.__class__.__name__)
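The nu comment above is worth making concrete: nu upper-bounds the fraction of training points treated as outliers, so raising it flags more of the data. A minimal sketch on synthetic Gaussian data, with illustrative nu values:

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 3))  # synthetic training data

for nu in (0.05, 0.1, 0.5, 0.9):
    clf = OneClassSVM(kernel='linear', nu=nu).fit(X)
    flagged = np.mean(clf.predict(X) == -1)
    print(f"nu={nu:.2f} -> fraction flagged as outliers: {flagged:.2f}")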
# Implement k fold cross validation
kf = KFold(n_splits=kFold, shuffle=True)
for trainIndex, testIndex in kf.split(xNormalData):
    # Training data (normal) for every k
    xTrain = xNormalData[trainIndex]
    # Test data (normal and all anomaly data) for every k
    xTest = np.concatenate((xNormalData[testIndex], xAnomalyData), axis=0)
    yTest = np.concatenate(
        (np.zeros(np.size(xNormalData[testIndex], axis=0)) + 1,
         -1 * np.ones(np.size(xAnomalyData, axis=0))), axis=0)

    # Create Support Vector Machines model
    svm = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    svm.fit(xTrain)

    # Make predictions
    predictions = svm.predict(xTest)

    # Calculate metrics for every k
    accuracy = metrics.accuracy_score(yTest, predictions)
    recall = metrics.recall_score(yTest, predictions)
    precision = metrics.precision_score(yTest, predictions)
    f1Score = metrics.f1_score(yTest, predictions)

    # Partial calculations of overall metrics
    accuracies += accuracy
    recalls += recall
    precisions += precision
    f1Scores += f1Score
    kIndex += 1

    # Print metrics for every k
    print(str(kIndex) + " Fold Iteration:")
    print("Accuracy: " + str(accuracy * 100) + "%")
    print("Recall: " + str(recall * 100) + "%")
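The loop above assumes its accumulators and hyperparameters are initialized beforehand; a sketch of that surrounding scaffolding, with hypothetical values and synthetic data, might look like this:

import numpy as np
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.svm import OneClassSVM

# hypothetical hyperparameters and data shapes
kFold, nu, kernel, gamma = 5, 0.1, 'rbf', 'scale'
rng = np.random.RandomState(1)
xNormalData = rng.normal(size=(200, 4))            # inlier samples
xAnomalyData = rng.normal(loc=6.0, size=(20, 4))   # shifted anomalies

# accumulators consumed inside the loop
accuracies = recalls = precisions = f1Scores = 0.0
kIndex = 0

# ... the k-fold loop from above runs here ...

# averaged metrics after the loop
print("Mean accuracy: " + str(accuracies / kFold * 100) + "%")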
del train_data['exercise']
del train_data['minute']
del train_data['second']

# print(train_data)
## print(test_data)

# remove from testing data: bp_systolic, bp_diastolic, drink_coffee, eating, sleeping, exercise
# del test_data['bp_systolic']
# del test_data['bp_diastolic']
del test_data['drink_coffee']
del test_data['eating']
del test_data['sleeping']
del test_data['exercise']
del test_data['minute']
del test_data['second']
print(test_data)

clf = OneClassSVM()
output_training = clf.fit(train_data)
y_pred = clf.predict(test_data)
# print(y_pred)

i = 0
for idx, data in enumerate(y_pred):
    if data > 0:
        print(idx, data)
        print(train_data.iloc[[idx]])
        i += 1
print(i)
gscv.fit(X_train, y_train)
print_gscv_score(gscv)

y_pred = gscv.predict(X_train)
print('train data: ', end="")
print_score_rgr(y_train, y_pred)

# visualize
fig = yyplot(y_train, y_pred)

#%%
# Novelty detection by One Class SVM with optimized hyperparameter
clf = OneClassSVM(nu=0.003,
                  kernel=gscv.best_params_['model__kernel'],
                  gamma=gscv.best_params_['model__gamma'])
clf.fit(X_train)
reliability1 = clf.predict(X_test)  # outliers = -1

# Novelty detection by One Class SVM with optimized hyperparameter
optgamma = optimize_gamma(X_train, range_g)
clf = OneClassSVM(nu=0.003,
                  kernel=gscv.best_params_['model__kernel'],
                  gamma=optgamma)
clf.fit(X_train)
reliability2 = clf.predict(X_test)  # outliers = -1
print("gamma1, 2 = ", gscv.best_params_['model__gamma'], optgamma)

y_pred = gscv.predict(X_test)  # predicted y

data = []
for i in range(len(X_test)):
def remove_outliers(features, max_fraction=0.1, min_fraction=0.25, verbose=False):
    """
    Remove outliers from feature set. Since this is an unsupervised approach, we
    iterate over many nu/gamma settings for the one-class SVM. For each setting,
    a certain fraction of the subjects will be classified as outliers. For some
    settings, this fraction will be very large, e.g., 90%, which is not realistic.
    For this reason, you can set a maximum fraction, e.g., 10%. Only those
    parameter combinations that result in 10% or less outliers are considered
    for further analysis. Within those combinations we simply count how often a
    given subject is classified as an outlier. We then use a minimum fraction to
    determine when a subject is truly an outlier.
    :param features:
    :param max_fraction: Upper bound on number of outliers allowed
    :param min_fraction: Lower bound on number of times a subject is classified as outlier
    :param verbose: Verbosity.
    :return: Filtered feature set
    """
    X, y = util.get_xy(
        features, target_column='diagnosis',
        exclude_columns=['age', 'gender', 'diagnosis'])

    subjects = {}
    nr_ok_fractions = 0

    for nu in np.linspace(0.01, 1.0, num=20):
        for gamma in [2**x for x in range(-15, 4, 2)]:

            # Train classifier
            classifier = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
            classifier.fit(X)
            y_pred = classifier.predict(X)

            # Calculate fraction of outliers
            count = 0.0
            for i in range(len(y_pred)):
                if y_pred[i] == -1:
                    count += 1.0
            fraction = count / len(y_pred)

            # If fraction is less than threshold, run through the list again to
            # find which subjects are considered outliers. Each outlying subject
            # is added to the table and its value incremented by one.
            if fraction < max_fraction:
                nr_ok_fractions += 1
                for i in range(len(y_pred)):
                    if y_pred[i] == -1:
                        subject = features.index[i]
                        if subject not in subjects.keys():
                            subjects[subject] = 0
                        subjects[subject] += 1

    # Collect subjects identified as an outlier often enough
    outliers = []
    for subject in subjects.keys():
        fraction = subjects[subject] / float(nr_ok_fractions)
        if fraction >= min_fraction:
            outliers.append(subject)

    # Remove outlying subjects
    if verbose:
        print('Removing {} outliers...'.format(len(outliers)))
    features.drop(outliers, axis=0, inplace=True)

    return features
standard_x = standard_scaler.transform(x)
minmax_x = minmax_scaler.transform(x)
pca_x = pca_scaler.transform(x)

# ## SVM - choose the number of clusters

# In[31]:

from sklearn.svm import OneClassSVM

svm_clf = OneClassSVM(gamma='auto', nu=0.25).fit(standard_x)
y_pred = svm_clf.predict(standard_x)

# In[32]:

# check result using PCA
from mpl_toolkits.mplot3d import Axes3D

pca = PCA(n_components=2)
pca.fit(standard_x)
x_pca = pca.transform(standard_x)

pca_cluster_center = PCA(n_components=2)

# 2D plot
# Imports
from sklearn.svm import OneClassSVM  # import OneClassSVM
import numpy as np                   # import numpy
import matplotlib.pyplot as plt      # import Matplotlib
from mpl_toolkits.mplot3d import Axes3D  # import the 3D plotting toolkit

# Prepare the data
raw_data = np.loadtxt('outlier.txt', delimiter=' ')  # read the data
train_set = raw_data[:900, :]   # training set
test_set = raw_data[900:, :]    # test set

# Outlier detection
# (random_state is only accepted by older scikit-learn releases)
model_onecalsssvm = OneClassSVM(nu=0.1, kernel="rbf", random_state=0)  # create the anomaly-detection model
model_onecalsssvm.fit(train_set)                         # train the model
pre_test_outliers = model_onecalsssvm.predict(test_set)  # detect anomalies

# Summarize the anomaly results
toal_test_data = np.hstack(
    (test_set, pre_test_outliers.reshape(test_set.shape[0], 1)))  # merge the test set with the predictions
normal_test_data = toal_test_data[toal_test_data[:, -1] == 1]     # records predicted as normal
outlier_test_data = toal_test_data[toal_test_data[:, -1] == -1]   # records predicted as anomalous
n_test_outliers = outlier_test_data.shape[0]  # number of anomalies
total_count_test = toal_test_data.shape[0]    # number of test samples
print('outliers: {0}/{1}'.format(n_test_outliers, total_count_test))  # print the anomaly count
print('{:*^60}'.format(' all result data (limit 5) '))  # print a banner
print(toal_test_data[:5])  # print the first 5 rows of the merged data

# Display the detection results
def run(self, x, knownFeatures):
    trainSet = x[:, knownFeatures].T
    print(trainSet.shape)

    clf = OneClassSVM()
    clf.fit(trainSet)
    self.selected_features = clf.predict(x.T)
def eval(cfg, model, train_dataset, test_dataset, criterion, publisher="test"):
    model.eval()

    # get global features using a training dataset
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size,
                              num_workers=cfg.nworkers, pin_memory=True)
    train_loader = tqdm(train_loader, ncols=100, desc="get train GF")
    train_global_features = []
    with torch.no_grad():
        for lidx, (inputs, targets) in enumerate(train_loader):
            inputs = inputs.to(cfg.device, non_blocking=True)
            # inputs.shape: batch_size, num_channels, num_points
            inputs = torch.transpose(inputs, 1, 2)[:, :3]
            # model encoder processing
            outputs, _, _ = model.encoder(inputs)
            # add a global feature to a list
            train_global_features.append(PytorchTools.t2n(outputs))
    train_global_features = np.concatenate(
        train_global_features, axis=0)  # shape (num_train_data, 1024)

    # get global features using a validation dataset
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size,
                             num_workers=cfg.nworkers, pin_memory=True)
    test_loader = tqdm(test_loader, ncols=100, desc="get eval GF")
    test_global_features = []
    eval_labels = []
    loss_list = []
    with torch.no_grad():
        for lidx, (inputs, targets) in enumerate(test_loader):
            inputs = inputs.to(cfg.device, non_blocking=True)
            # inputs.shape: batch_size, num_channels, num_points
            inputs = torch.transpose(inputs, 1, 2)[:, :3]
            # model encoder processing
            outputs, _, _ = model.encoder(inputs)
            # get reconstructions for loss of true data
            reconstructions = model.decoder(outputs)
            # compute loss
            inputs = torch.transpose(inputs, 1, 2)
            dist1, dist2 = criterion["chamfer_distance"](inputs, reconstructions)
            dist1 = np.mean(PytorchTools.t2n(dist1), axis=1)
            dist2 = np.mean(PytorchTools.t2n(dist2), axis=1)
            dist_loss = dist1 + dist2
            # add dist_losses to a list
            loss_list.append(dist_loss)
            # add a global feature to a list
            test_global_features.append(PytorchTools.t2n(outputs))
            # get eval labels
            eval_labels.append(targets)
    test_global_features = np.concatenate(
        test_global_features, axis=0)  # shape (num_eval_data, 1024)
    eval_labels = np.squeeze(np.concatenate(eval_labels, axis=0),
                             axis=-1)  # shape (num_data)
    loss_list = np.concatenate(loss_list, axis=0)

    # use one class classification
    classifier = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
    classifier.fit(train_global_features)
    pred_labels = classifier.predict(test_global_features)

    # get training data label
    _, true_label = train_dataset[0]
    # convert eval labels other than true labels to -1
    eval_labels[eval_labels != true_label] = -1
    # convert true labels to 1
    eval_labels[eval_labels == true_label] = 1

    # get loss of true data, selecting rows with a boolean mask
    dist_loss = np.mean(loss_list[eval_labels == 1]).item()

    # get accuracy
    acc = np.mean(pred_labels == eval_labels).item() * 100
    return acc, dist_loss
def remove_outliers(data):
    clf = OneClassSVM(nu=0.2, kernel="rbf", gamma=0.00001)
    clf.fit(data)
    predictions = clf.predict(data)  # predict once and reuse the result
    logging.info("%s outliers removed from %s elements"
                 % ((predictions == -1).sum(), len(data)))
    return data[predictions == 1]
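A quick illustration of the helper above on synthetic data with a few planted outliers; the data and the logging setup are illustrative only.

import logging
import numpy as np
from sklearn.svm import OneClassSVM

logging.basicConfig(level=logging.INFO)

rng = np.random.RandomState(0)
inliers = rng.normal(size=(95, 2))
planted = rng.normal(loc=8.0, size=(5, 2))   # obvious outliers
data = np.vstack([inliers, planted])

cleaned = remove_outliers(data)
print(data.shape, "->", cleaned.shape)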
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import OneClassSVM
from sklearn.svm import NuSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

df = pd.read_csv('iris.csv', sep=',')
features = list(df.columns[:4])
X = df.drop('variety', axis=1)
y = df['variety']

classifier = OneClassSVM(gamma=1.1, kernel='linear')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10)
classifier.fit(X_train, y_train)  # OneClassSVM is unsupervised; y is ignored
y_pred = classifier.predict(X_test)

text_file = open("generated/result8.txt", "w")
print("accuracy_score= " + str(accuracy_score(y_test, y_pred)), flush=True)
text_file.write("accuracy_score= " + str(accuracy_score(y_test, y_pred)))
text_file.close()
class OSVM(TransformerMixin):
    """
    One-class SVM used for outlier and novelty detection.
    Wrapper for sklearn implementation of Scholkopf2000.
    """

    def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 tol=0.001, nu=0.5, shrinking=True, cache_size=200,
                 verbose=False, max_iter=-1, random_state=None):
        """
        Inits OSVM.
        @param kernel       Kernel type. String ['linear', 'poly', 'rbf', 'sigmoid'].
        @param degree       Polynomial kernel degree. Integer.
        @param gamma        Kernel coefficient.
        @param coef0        Independent term in kernel function. Scalar.
        @param tol          Tolerance for stopping criterion. Scalar float.
        @param nu           Training-error upper bound and SV lower bound. Scalar [0,1].
        @param shrinking    Whether to use the shrinking heuristic. Boolean.
        @param cache_size   Specify the size of the kernel cache (in MB).
        @param verbose      Enable verbose output. Boolean.
        @param max_iter     Hard limit on iterations within solver. -1 for no limit.
        @param random_state Random seed (ignored; recent scikit-learn removed it
                            from OneClassSVM).
        """
        # Setting parameters for classifier; keyword arguments are safer than
        # relying on sklearn's positional order
        self.__mdl = OneClassSVM(kernel=kernel, degree=degree, gamma=gamma,
                                 coef0=coef0, tol=tol, nu=nu,
                                 shrinking=shrinking, cache_size=cache_size,
                                 verbose=verbose, max_iter=max_iter)

    def fit(self, X, y=None, w=None):
        """
        Detects the soft boundary of the set of samples X.
        @param X Input matrix [n_samples, n_features].
        @param y Labels vector [n_samples] (ignored; kept for API compatibility).
        @param w Per-sample weights [n_samples].
        @return self
        """
        # Fit classifier
        self.__mdl = self.__mdl.fit(X, y=y, sample_weight=w)
        # Return self for sklearn API
        return self

    def predict(self, X):
        """
        Estimates input data class (normal, novelty or outlier).
        @param X Input matrix [n_samples, n_features].
        @return Data labels (+1 or -1).
        """
        return self.__mdl.predict(X)

    def transform(self, X):
        """
        Returns the data class given the detection model.
        @param X Input matrix [n_samples, n_features].
        @return Data with detection labels appended.
        """
        # Computing labels
        Xlbs = self.predict(X)
        # predict returns a 1-D array, so reshape it before stacking onto X
        Xlbs = np.hstack((X, Xlbs.reshape(-1, 1)))
        return Xlbs

    def save(self, path):
        """
        Saves current model.
        @param path File path.
        """
        # pickle requires binary mode
        with open(path, 'wb') as fp:
            pickle.dump(self.__dict__, fp, 2)

    def load(self, path):
        """
        Loads a saved model.
        @param path File path.
        """
        # pickle requires binary mode
        with open(path, 'rb') as fp:
            tmp_dict = pickle.load(fp)
        self.__dict__.update(tmp_dict)
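A short usage sketch for the wrapper, assuming the class above plus its imports; the data and the save path are illustrative.

import pickle
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 2))

osvm = OSVM(nu=0.1).fit(X)
labeled = osvm.transform(X)   # original features plus a +/-1 label column
print(labeled.shape)          # (50, 3)

osvm.save('/tmp/osvm.pkl')    # illustrative path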
def oneClass(self):
    model = OneClassSVM()
    model.fit(self.arr)
    return model.predict(self.arr)  # return the labels instead of discarding them
ax.set_zlabel('Similarity of Neighboring Districts')
ax.set_zlim([0., 1.])
ax.set_xlim([0., 500.])
ax.set_ylim([0., 1.])
fig.show()

angles = np.linspace(0, 360, 41)[:-1]  # Take 40 angles between 0 and 360
rotanimate(ax, angles, 'movie.gif', delay=20, width=6., height=5.)

# do outlier search using one-class SVM
data[0, :] = preprocessing.scale(data[0, :])
model = OneClassSVM(gamma=.001, nu=.1)
fit = model.fit(data)
preds = model.predict(data)
inlier = np.where(preds == 1.)[0]
outlier = np.where(preds == -1.)[0]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data[inlier, 0], data[inlier, 1], data[inlier, 2], c='b')
ax.scatter(data[outlier, 0], data[outlier, 1], data[outlier, 2], c='k')
ax.set_xlabel('$P^2/A$')
ax.set_ylabel('Margin')
ax.set_zlabel('Similarity of Neighboring Districts')
ax.set_ylim([0., 1])
ax.set_zlim([0., 1.])
slicer = featurizer.FirstSlicer(2)
X = slicer.transform(X0)
Xf0 = np.concatenate(X)
Xf = Xf0[::50]

hexbin(Xf0[:, 0], Xf0[:, 1], bins='log')

svm = OneClassSVM(nu=0.15)
svm.fit(Xf)
y = svm.predict(Xf)

plot(Xf[y == 1][:, 0], Xf[y == 1][:, 1], 'kx')
plot(Xf[y == -1][:, 0], Xf[y == -1][:, 1], 'wx')

clusterer = cluster.GMM(n_components=3)
yi = map(lambda x: svm.predict(x), X)

from msmbuilder.cluster import MultiSequenceClusterMixin, BaseEstimator
from sklearn.svm import OneClassSVM

class OneClassSVMTrimmer(MultiSequenceClusterMixin, OneClassSVM, BaseEstimator):
    def partial_transform(self, traj):
def classifier(data):
    from sklearn.covariance import EllipticEnvelope
    from sklearn.svm import OneClassSVM
    from sklearn.datasets import load_boston
    from sklearn import preprocessing

    # Get data
    # Define "classifiers" to be used
    legend1 = {}
    legend2 = {}
    evaluation = [[val["coverage"], val["num_exons"], val["distance_to_next"]]
                  for val in data]
    X = [[val["coverage"], val["num_exons"], val["distance_to_next"]]
         for val in data]
    X = preprocessing.scale(X)
    evaluation = preprocessing.scale(evaluation)

    # Learn a frontier for outlier detection with several classifiers
    sample = random.sample(list(X), 20000)  # random.sample needs a sequence, not an ndarray
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(list(evaluation), 2000)
    print("fitting data", file=sys.stderr)
    clf.fit(sample)
    print("predicting data", file=sys.stderr)
    Y = clf.predict(test)
    print("plotting data", file=sys.stderr)

    fig, axes = subplots()
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][2], test[i][1], c=color)
    #ylim([50,2000])
    # num exons
    ylabel("distance")
    #xlim([3,10])
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")

    fig, axes = subplots()
    """
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][1], test[i][0], c=color)
    #xlim([0,10])
    # num exons
    xlabel("number of exons")
    #ylim([3,15])
    ylabel("coverage")
    savefig("ExonsvsCoverage.pdf")
    """

    full_test = clf.predict(evaluation)
    novel, regular = [], []
    for i in range(len(full_test)):
        result = full_test[i]
        if result == -1:
            print(data[i]["id"])
            novel.append(data[i]["num_exons"])
        else:
            regular.append(data[i]["num_exons"])

    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    print("novel, regular", file=sys.stderr)
    print(len(novel), len(regular), file=sys.stderr)
    print(mean(multi_exon_novel), mean(multi_exon_regular),
          len(multi_exon_novel), len(multi_exon_regular), file=sys.stderr)
def main():
    args = parse_arguments()
    random.seed(args.seed)

    X, y = load_data(args)
    if args.scale:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)

    y_value = np.unique(y)
    f_index = np.where(y == y_value[0])[0]
    s_index = np.where(y == y_value[1])[0]
    target_X, target_y = X[f_index], np.ones(len(f_index))
    outlier_X, outlier_y = X[s_index], -np.ones(len(s_index))

    target_X_train, target_X_test, target_y_train, target_y_test = \
        train_test_split(target_X, target_y, shuffle=True,
                         random_state=args.seed, test_size=1/3)

    self_adaptive_shifting = SelfAdaptiveShifting(target_X_train)
    self_adaptive_shifting.edge_pattern_detection(args.threshold)
    pseudo_outlier_X = self_adaptive_shifting.generate_pseudo_outliers()
    pseudo_target_X = self_adaptive_shifting.generate_pseudo_targets()
    pseudo_outlier_y = -np.ones(len(pseudo_outlier_X))
    pseudo_target_y = np.ones(len(pseudo_target_X))

    gamma_candidates = [1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e+1, 1e+2, 1e+3,
                        1/np.size(target_X, -1)]
    nu_candidates = [0.005, 0.01, 0.05, 0.1, 0.5]

    best_err = 1.0
    best_gamma, best_nu = 1/np.size(target_X, -1), 0.5
    for gamma in tqdm(gamma_candidates):
        for nu in tqdm(nu_candidates):
            model = OneClassSVM(gamma=gamma, nu=nu).fit(target_X_train)
            err_o = 1 - np.mean(model.predict(pseudo_outlier_X) == pseudo_outlier_y)
            err_t = 1 - np.mean(model.predict(pseudo_target_X) == pseudo_target_y)
            err = float((err_o + err_t) / 2)
            if err < best_err:
                best_err = err
                best_gamma = gamma
                best_nu = nu

    best_model = OneClassSVM(kernel=args.kernel, gamma=best_gamma,
                             nu=best_nu).fit(target_X_train)
    target_pred = best_model.predict(target_X_test)
    outlier_pred = best_model.predict(outlier_X)
    y_pred = np.concatenate((target_pred, outlier_pred))
    y_true = np.concatenate((target_y_test, outlier_y))
    f1 = f1_score(y_true, y_pred, average="binary")
    mcc = matthews_corrcoef(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    print("\n[%s] (gamma: %.4f, nu: %.4f, err: %.4f) \nf1-score: %.4f, mcc: %.4f, acc: %.4f"
          % (args.data, best_gamma, best_nu, best_err, f1, mcc, acc))

    model = OneClassSVM(kernel=args.kernel).fit(target_X_train)
    target_pred = model.predict(target_X_test)
    outlier_pred = model.predict(outlier_X)
    y_pred = np.concatenate((target_pred, outlier_pred))
    y_true = np.concatenate((target_y_test, outlier_y))
    f1 = f1_score(y_true, y_pred, average="binary")
    mcc = matthews_corrcoef(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    print("\n[%s] (default setting) \nf1-score: %.4f, mcc: %.4f, acc: %.4f"
          % (args.data, f1, mcc, acc))

    if args.visualize:
        self_adaptive_shifting.visualize()
url = "C:/Users/Βασίλης/IdeaProjects/MyThesisApp/Data sets/Total_Vehicle_Sales.csv" dataset = pd.read_csv(url) outliers_fraction = 0.05 data = dataset[['Value']] scaler = StandardScaler() np_scaled = scaler.fit_transform(data) data = pd.DataFrame(np_scaled) # train oneclassSVM model = OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma=0.01) model.fit(data) dataset['anomaly'] = pd.Series(model.predict(data)) print(dataset) a = dataset.loc[dataset['anomaly'] == -1, ['Date', 'Value']] #anomaly fig, ax = plt.subplots(figsize=(15, 10)) ax.plot(dataset['Date'], dataset['Value'], color='blue') ax.scatter(a['Date'], a['Value'], color='red', label='Anomaly Detection OneClassSVM') plt.show() # original = [] # anomalies = []
class OutlierRemover():
    """
    strategy:
        z_score
        inter_quartile_range
        isolation_forest
        elliptic_envelope
        local_outlier_factor
        one_class_svm

    params:
        IsolationForest:
            n_estimators
        EllipticEnvelope:
            contamination
        LocalOutlierFactor:
            n_neighbors
        OneClassSVM:
            kernel
            degree
            gamma
    """

    def __init__(self, strategy, **params):
        self.all_strategies = [
            'z_score', 'inter_quartile_range', 'isolation_forest',
            'elliptic_envelope', 'local_outlier_factor', 'one_class_svm'
        ]
        if strategy not in self.all_strategies:
            raise Exception(
                'Invalid Strategy... strategy can be one of the following:\n',
                *self.all_strategies)
        self.strategy = strategy
        self.params = params

        if strategy == 'isolation_forest':
            self.outlier_remover = IsolationForest(
                n_estimators=params.get('n_estimators', 100),
                bootstrap=True, random_state=19)
        if strategy == 'elliptic_envelope':
            self.outlier_remover = EllipticEnvelope(
                contamination=params.get('contamination', 0.1),
                random_state=19)
        if strategy == 'local_outlier_factor':
            # novelty=True makes predict() available after fit()
            self.outlier_remover = LocalOutlierFactor(
                n_neighbors=params.get('n_neighbors', 20), novelty=True)
        if strategy == 'one_class_svm':
            self.outlier_remover = OneClassSVM(
                kernel=params.get('kernel', 'rbf'),
                degree=params.get('degree', 3),
                gamma=params.get('gamma', 'scale'))

    def fit(self, X, y):
        if self.strategy not in ['z_score', 'inter_quartile_range']:
            return self.outlier_remover.fit(X)
        return self

    def transform(self, X, y):
        if self.strategy not in ['z_score', 'inter_quartile_range']:
            y_hat = self.outlier_remover.predict(X)
            mask = y_hat != -1
            X, y = X.iloc[mask, :], y.iloc[mask]
            return X, y
        if self.strategy == 'z_score':
            z = pd.DataFrame(np.abs(stats.zscore(X)),
                             columns=X.columns, index=X.index)
            idx = X[z <= 3].dropna().index
            return X.loc[idx], y.loc[idx]
        if self.strategy == 'inter_quartile_range':
            Q1 = X.quantile(0.25)
            Q3 = X.quantile(0.75)
            IQR = Q3 - Q1
            idx = X[(X >= (Q1 - 1.5 * IQR)) & (X <= (Q3 + 1.5 * IQR))].dropna().index
            return X.loc[idx], y.loc[idx]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X, y)
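A brief usage sketch of OutlierRemover with the one_class_svm strategy, on a small illustrative DataFrame; the imports mirror what the class above relies on.

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(7)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
y = pd.Series(rng.randint(0, 2, size=100))

remover = OutlierRemover('one_class_svm', gamma='scale')
X_clean, y_clean = remover.fit_transform(X, y)
print(len(X), "->", len(X_clean))  # rows the one-class SVM kept as inliers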
def decision_tree_classify(
    data_dir: str,
    train_attack_name: str = "FGSM",
    train_transform_name: str = "noop",
    test_attack_name: str = "FGSM",
    test_transform_name: str = "noop",
):
    train_dir = os.path.join(data_dir, "train")
    test_dir = os.path.join(data_dir, "test")
    train_original_dir = os.path.join(train_dir, f"original_{train_transform_name}")
    train_adversarial_dir = os.path.join(
        train_dir, f"{train_attack_name}_{train_transform_name}")
    test_original_dir = os.path.join(test_dir, f"original_{test_transform_name}")
    test_adversarial_dir = os.path.join(
        test_dir, f"{test_attack_name}_{test_transform_name}")

    train_model_by_key_fn = partial(
        train_model_by_key,
        train_original_dir,
        test_original_dir,
        test_adversarial_dir,
    )

    keys = [
        "channel_relation",
        "channel_birelation",
        "spatial_relation",
        "weight_relation",
        "hieght_width_relation",
        "channel_weight",
    ]

    clf_list = []
    train_pred_list = []
    for key in keys:
        clf = train_model_by_key_fn(key)
        clf_list.append(clf)

        train_original_data, train_original_label, \
            train_original_model_pred = load_data_and_label(
                train_original_dir,
                training_images_per_class,
                key=key,
            )
        if len(train_pred_list) == 0:
            train_pred_list.append(np.expand_dims(train_original_model_pred, 1))
        train_pred_list.append(
            np.expand_dims(clf.predict(train_original_data), 1))

    # One class classification based on predictions of each classifier
    pred = np.concatenate(train_pred_list, axis=1)
    one_class_clf = OneClassSVM(
        # max_depth = 10,
        # min_samples_leaf = 20,
    )
    one_class_clf = one_class_clf.fit(pred)

    test_original_preds = []
    test_adversarial_preds = []
    for clf, key in zip(clf_list, keys):
        test_original_data, test_original_label, _ = load_data_and_label(
            test_original_dir,
            test_images_per_class,
            key=key,
        )
        test_adversarial_data, test_adversarial_label, test_adversarial_pred = load_data_and_label(
            test_adversarial_dir,
            test_images_per_class,
            key=key,
        )
        if len(test_original_preds) == 0:
            test_original_preds.append(np.expand_dims(test_original_label, 1))
        original_pred = clf.predict(test_original_data)
        original_pred = np.expand_dims(original_pred, 1)
        test_original_preds.append(original_pred)

        if len(test_adversarial_preds) == 0:
            test_adversarial_preds.append(
                np.expand_dims(test_adversarial_pred, 1))
        adversarial_pred = clf.predict(test_adversarial_data)
        adversarial_pred = np.expand_dims(adversarial_pred, 1)
        test_adversarial_preds.append(adversarial_pred)

    original_input = np.concatenate(test_original_preds, axis=1)
    adversarial_input = np.concatenate(test_adversarial_preds, axis=1)
    original_pred = one_class_clf.predict(original_input)
    adversarial_pred = one_class_clf.predict(adversarial_input)

    acc = (original_pred == 1).sum() / len(original_pred)
    fpr = 1 - acc
    tpr = (adversarial_pred == -1).sum() / len(adversarial_pred)
    roc_tpr = [0, tpr, 1]
    roc_fpr = [0, fpr, 1]
    auc = metrics.auc(roc_fpr, roc_tpr)
    print(f"One class pred tpr: {tpr:.3f}, fpr: {fpr:.3f}, auc: {auc:.3f}")
def base_experiment(config, pct_noise=0.15, noverlap_bits=0, ntrials=10,
                    verbose=False, seed=123456789):
    """Run a single experiment, locally.

    @param config: The configuration parameters.
    @param pct_noise: The percentage of noise to add to the dataset.
    @param noverlap_bits: The number of bits the base class should overlap
        with the novelty class.
    @param ntrials: The number of times to repeat the experiment.
    @param verbose: If True print the results.
    @param seed: The random seed to use.
    """
    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5

    # Build the directory, if needed
    base_dir = config['log_dir']
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in range(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        bctn = 1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.
        ocn = 1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.
        overlap = (np.dot(x_ds.base_class.astype('i'),
                          outlier_base.astype('i')))
        print(f"\nBase class' test noise: {bctn:2.2f}")
        print(f"Outlier's class noise: {ocn:2.2f}")
        print(f'Overlap between two classes: {overlap}')

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i, seed2 in enumerate(generate_seeds(ntrials, seed)):
        # Create the SP
        config['seed'] = seed2
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print('\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te')
            print(fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te,
                               u_sp_x_tr, u_sp_x_te, u_sp_y_te))
            print(fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te,
                               o_sp_x_tr, o_sp_x_te, o_sp_y_te))

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness', u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)

        # Print the results
        if verbose:
            print('\nDescription\tx_tr->x_te\tx_tr->y_te')
            print(f'Uniqueness:\t{u_sp_base_to_x_te:2.4f}\t{u_sp_base_to_y_te:2.4f}')
            print(f'Overlap:\t{o_sp_base_to_x_te:2.4f}\t{o_sp_base_to_y_te:2.4f}')

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * 100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print(f'\nSP Base Class Detection     : {clf_x_te:2.2f}%')
            print(f'SP Novelty Class Detection  : {clf_y_te:2.2f}%')
            print(f'SVM Base Class Detection    : {svm_x_te:2.2f}%')
            print(f'SVM Novelty Class Detection : {svm_y_te:2.2f}%')

    # Save the results
    with open(os.path.join(base_dir, 'results.pkl'), 'wb') as f:
        pickle.dump((sp_x_results, sp_y_results, svm_x_results,
                     svm_y_results), f, pickle.HIGHEST_PROTOCOL)
# Data Training
SVMModel = OneClassSVM()
# Deleted log1p, because there are too many labels
# we cannot cover when checking precision, recall, f_score
# yLabelsLog = np.log(yLabels+3)
SVMModel.fit(dataTrain)

# Test the trained one-class SVM
preds = SVMModel.predict(X=dataTest)
# testLog = np.log(testYLabels+3)
# testLog = testLog.values.ravel()

# evaluation values (or matrix)
aScore = accuracy_score(testYLabels, preds.round())
cMatrix = confusion_matrix(testYLabels, preds.round())

# ignore '0' value for displaying
cMatrixDP = np.delete(cMatrix, 2, 0)
cMatrixDP = np.delete(cMatrixDP, 2, 1)
precisionList = precision(cMatrix)
recallList = recall(cMatrix)
precisionList = np.delete(precisionList, 2)
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1', ntrials=10,
                    verbose=True, seed=123456789):
    """Run a single experiment, locally.

    @param pct_noise: The percentage of noise to add to the dataset.
    @param noverlap_bits: The number of bits the base class should overlap
        with the novelty class.
    @param exp_name: The name of the experiment.
    @param ntrials: The number of times to repeat the experiment.
    @param verbose: If True print the results.
    @param seed: The random seed to use.

    @return: A tuple containing the percentage errors for the SP's training
        and testing results and the SVM's training and testing results,
        respectively.
    """
    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5
    log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
                           'novelty_experiments', exp_name)

    # Configure the SP
    config = {
        'ninputs': 100,
        'trim': 1e-4,
        'disable_boost': True,
        'seed': seed,
        'pct_active': None,
        'random_permanence': True,
        'pwindow': 0.5,
        'global_inhibition': True,
        'ncolumns': 200,
        'nactive': 50,
        'nsynapses': 75,
        'seg_th': 15,
        'syn_th': 0.5,
        'pinc': 0.001,
        'pdec': 0.001,
        'nepochs': 10,
        'log_dir': log_dir
    }

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in range(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        # copied from novelty_detection_slurm.py
        bctn = 1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.
        ocn = 1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.
        overlap = (np.dot(x_ds.base_class.astype('i'),
                          outlier_base.astype('i')))
        print(f"\nBase class' test noise: {bctn:2.2f}")
        print(f"Outlier's class noise: {ocn:2.2f}")
        print(f'Overlap between two classes: {overlap}')

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)
    c_y_te = 1 - metrics.compute_distance(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i in range(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2
        config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)
        c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('Input Novelty Class Test Correlation', c_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
        sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print('\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te')
            print(fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te,
                               u_sp_x_tr, u_sp_x_te, u_sp_y_te))
            print(fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te,
                               o_sp_x_tr, o_sp_x_te, o_sp_y_te))
            print(fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te,
                               c_sp_x_tr, c_sp_x_te, c_sp_y_te))

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        c_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
            c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest
        c_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness', u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Correlation', c_sp_base_to_y_te)

        # Print the results
        if verbose:
            print('\nDescription\tx_tr->x_te\tx_tr->y_te')
            print(f'Uniqueness:\t{u_sp_base_to_x_te:2.4f}\t{u_sp_base_to_y_te:2.4f}')
            print(f'Overlap:\t{o_sp_base_to_x_te:2.4f}\t{o_sp_base_to_y_te:2.4f}')
            print(f'Correlation:\t{c_sp_base_to_x_te:2.4f}\t{c_sp_base_to_y_te:2.4f}')

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * 100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print(f'\nSP Base Class Detection     : {clf_x_te:2.2f}%')
            print(f'SP Novelty Class Detection  : {clf_y_te:2.2f}%')
            print(f'SVM Base Class Detection    : {svm_x_te:2.2f}%')
            print(f'SVM Novelty Class Detection : {svm_y_te:2.2f}%')

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
Users_26JS_pca = pca.fit_transform(Users_26JS)
Users_26JS_pca_nor = Normalizer().fit_transform(Users_26JS_pca)
print('PCA and normalization done...\n')

X_train_lst = []
X_test_lst = []
for index_2 in X_train:
    X_train_lst.append(Users_26JS_pca_nor[index_2])
for index_3 in Users_not_All_Jobs_index:
    X_test_lst.append(Users_26JS_pca_nor[index_3])
X_train_array = np.array(X_train_lst)
X_test_array = np.array(X_test_lst)

print('OCSVM training started...\n')
clf = OneClassSVM(kernel='rbf', tol=0.01, nu=0.5, gamma='auto')
clf.fit(X_train_array)
pred = clf.predict(X_test_array)

print('Printing classification results...\n')
# Line numbers below include the header row:
# ACM2278: line 2841;
# CMP2946: line 2331;
# PLJ1771: line 1283;
# CDE1846: line 656;
# MBG3183: line 1495;
# print('ACM2278 is ', clf.predict(Users_26JS_pca_nor[2839]), '\t', clf.decision_function(Users_26JS_pca_nor[2839]), '\n')
# print('CMP2946 is ', clf.predict(Users_26JS_pca_nor[2329]), '\n', clf.decision_function(Users_26JS_pca_nor[2329]), '\n')
# print('PLJ1771 is ', clf.predict(Users_26JS_pca_nor[1281]), '\t', clf.decision_function(Users_26JS_pca_nor[1281]), '\n')
# print('CDE1846 is ', clf.predict(Users_26JS_pca_nor[654]), '\n', clf.decision_function(Users_26JS_pca_nor[654]), '\n')
# print('MBG3183 is ', clf.predict(Users_26JS_pca_nor[1493]), '\n', clf.decision_function(Users_26JS_pca_nor[1493]), '\n')
scaler = preprocessing.StandardScaler().fit(tr_data)
tr_data = scaler.transform(tr_data)
cv_data = scaler.transform(cv_data)
bot_data = scaler.transform(bot_data)

# Hard coded for testing. will change
gt_data = [+1] * 23

# Outlier detection code using multi variate gaussian
# mu, sigma = estimateGaussian(tr_data)
# p = multivariateGaussian(tr_data, mu, sigma)
# p_cv = multivariateGaussian(cv_data, mu, sigma)
# fscore, ep = selectThresholdByCV(p_cv, gt_data)
# mu, sigma = estimateGaussian(bot_data)
# p_bot = multivariateGaussian(bot_data, mu, sigma)
# outliers = p_bot < ep
# print (outliers)

# <codecell>

# Novelty detection using one class SVM
outlierDetector = OneClassSVM()
outlierDetector.fit(cv_data, gt_data)  # OneClassSVM ignores y; only cv_data is used
bot_preds = outlierDetector.predict(bot_data)
print(bot_preds)
def base_experiment(config, pct_noise=0.15, noverlap_bits=0, ntrials=10,
                    verbose=False, seed=123456789):
    """
    Run a single experiment, locally.

    @param config: The configuration parameters.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5

    # Build the directory, if needed
    base_dir = config['log_dir']
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in xrange(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
            * x_ds.base_class.astype('i')).sum() / 40.)
        print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
            outlier_base.astype('i')).sum() / 40.)
        print 'Overlap between two classes: {0}'.format(np.dot(
            x_ds.base_class.astype('i'), outlier_base.astype('i')))

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i, seed2 in enumerate(generate_seeds(ntrials, seed)):
        # Create the SP
        config['seed'] = seed2
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
            print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
                u_sp_x_te, u_sp_y_te)
            print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                o_sp_x_te, o_sp_y_te)

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness', u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)

        # Print the results
        if verbose:
            print '\nDescription\tx_tr->x_te\tx_tr->y_te'
            print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
                u_sp_base_to_y_te)
            print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
                o_sp_base_to_y_te)

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * 100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
            print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
            print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
            print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)

    # Save the results
    with open(os.path.join(base_dir, 'results.pkl'), 'wb') as f:
        cPickle.dump((sp_x_results, sp_y_results, svm_x_results,
            svm_y_results), f, cPickle.HIGHEST_PROTOCOL)
X_test = X_test.toarray()

n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set - n_samples: %d, n_features: %d" % (test_samples, test_features)
print

# Fit the model
# when nu=0.01, gamma=0.0034607 is the smallest to generate >0 result
clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.05)
clf.fit(X)

# Predict on X_test
y_pred = clf.predict(X_test)

# Count the number of selected items given different gamma and nu.
# This change is interesting and could be studied systematically using
# grid search (see the sketch below).
count = 0
for i, pred in enumerate(y_pred):
    if pred != -1:
        count += 1
print count

csvWriter = csv.writer(open("detected.csv", "wb"))
for i, pred in enumerate(y_pred):
    if pred != -1:
        # The original snippet is truncated here; writing out the index of
        # each detected item is an assumption about its intent.
        csvWriter.writerow([i])
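# A minimal sketch of the parameter sweep suggested above: try several nu and
# gamma values and record how many test items each setting selects. The ranges
# are assumptions; OneClassSVM is unsupervised, so this scans manually rather
# than using GridSearchCV.
import numpy as np
from sklearn.svm import OneClassSVM

def count_selected(X_train, X_test, nus, gammas):
    counts = {}
    for nu in nus:
        for gamma in gammas:
            model = OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
            model.fit(X_train)
            counts[(nu, gamma)] = int((model.predict(X_test) != -1).sum())
    return counts

# Example usage:
# counts = count_selected(X, X_test, nus=[0.005, 0.01, 0.05],
#                         gammas=np.logspace(-4, -1, 4))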
import numpy as np
from rop_dataextract import *
from sklearn.svm import OneClassSVM
import sys

MAX_EVENT_COUNTERS = 4
TIME_DELTA = 10000
CLUSTER_POINTS = 32
TRAIN_POINTS = 100000
TEST_POINTS = -1

svm = OneClassSVM()
train_set, test_set = getSetNames(sys.argv)

print "aggregating data..."
obs = aggrTimeseries(train_set, TRAIN_POINTS, CLUSTER_POINTS,
                     MAX_EVENT_COUNTERS, TIME_DELTA)
print len(obs)

print "fitting model..."
svm.fit(obs)

print "aggregating test..."
test = aggrTimeseries(test_set, TEST_POINTS, CLUSTER_POINTS,
                      MAX_EVENT_COUNTERS, TIME_DELTA)

print "testing..."
prediction = svm.predict(test)
print sum(prediction)
print len(prediction)
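# Since predict() returns +1/-1, sum(prediction) only gives the net balance.
# A small follow-up (an addition, not part of the original) that reports the
# fraction flagged anomalous is often easier to read:
anomaly_fraction = float((np.asarray(prediction) == -1).sum()) / len(prediction)
print "fraction anomalous: %.4f" % anomaly_fraction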
def compute_scores(normal_users, queue, Ks=[]):
    '''
    Calculates the novelty scores (noise and strangeness) for the 4 algorithms.
    Receives the list of normal users, the queue (all users) and the list of
    curiosity factors Ks.
    Updates the global variables GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s
    with the results.
    '''
    global GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s

    # Novelty scores for each algorithm; the *_n lists hold noise scores,
    # the *_s lists hold strangeness scores
    GMM_n = []
    one_n = []
    lsa_n = []
    K_n = []
    GMM_s = []
    one_s = []
    lsa_s = []
    K_s = []

    # K_GMM_n, K_KMeans_n are the noise curiosity factors for each algorithm;
    # K_GMM_s, K_KMeans_s are the strangeness curiosity factors.
    # Ks is a list containing the 4 above-mentioned parameters.
    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks

    '''
    For One_class_SVM and LSA, when asked to predict the new entry, a label is
    directly returned:
        LSA: 'anomaly' or '0' (normal)
        One_class_SVM: -1 (anomaly) or 1 (normal)
    GMM and K-means predict a fitting score. The novelty score is obtained by
    calculating the zscore of the entry compared with the scores of all other
    entries, calling the function get_score_last_item.
    If the zscore returned is >= 1, the new entry is anomalous.
    '''

    '''
    Noise scores are computed with the queue as the base of knowledge,
    fitting all the entries but the last to the algorithm
    '''
    B = GMM(covariance_type='full', n_components=1)
    B.fit(queue[0:-1])
    x = [B.score([i]).mean() for i in queue]
    GMM_n.append(get_score_last_item(x, K_GMM_n))

    K = KMeans(n_clusters=1)
    K.fit(queue[0:-1])
    x = [K.score([i]) for i in queue]
    K_n.append(get_score_last_item(x, K_KMeans_n))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(queue[0:-1])
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_n.append(1)
    if x == 1:
        one_n.append(0)

    X = np.array(queue[0:-1])
    anomalymodel = lsanomaly.LSAnomaly()
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]]))
    if x == ['anomaly']:
        lsa_n.append(1)
    if x == [0]:
        lsa_n.append(0)

    '''
    Strangeness scores are computed with the normal users as the base of
    knowledge, fitting normal users to the algorithm
    '''
    # List passed to get_score_last_item to calculate the zscore of the last
    # item, the new entry
    normal_and_new = normal_users + [queue[-1]]

    B = GMM(covariance_type='full', n_components=1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    GMM_s.append(get_score_last_item(x, K_GMM_s))

    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    K_s.append(get_score_last_item(x, K_KMeans_s))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(normal_users)
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_s.append(1)
    if x == 1:
        one_s.append(0)

    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]]))
    if x == ['anomaly']:
        lsa_s.append(1)
    if x == [0]:
        lsa_s.append(0)

    return GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s
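# get_score_last_item is not defined in this excerpt. A minimal sketch of what
# it might look like, based on the docstring above (zscore of the last entry's
# score against all the others, thresholded by the curiosity factor K); the
# exact use of K is an assumption:
import numpy as np

def get_score_last_item(scores, K):
    """Return 1 (novel) if the last score deviates from the rest by at least
    K standard deviations, else 0."""
    rest = np.asarray(scores[:-1], dtype=float)
    z = abs(scores[-1] - rest.mean()) / (rest.std() + 1e-12)
    return 1 if z >= K else 0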
def base_experiment(config, ntrials=1, seed=123456789):
    """
    Run a single experiment, locally.

    @param config: The configuration parameters to use for the SP.

    @param ntrials: The number of times to repeat the experiment.

    @param seed: The random seed to use.

    @return: A tuple containing the percentage errors for the SP's training
    and testing results and the SVM's training and testing results,
    respectively.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    clf_th = 0.5

    # Seed numpy
    np.random.seed(seed)

    # Get the data
    (tr_x, tr_y), (te_x, te_y) = load_mnist()
    tr_x_0 = np.random.permutation(tr_x[tr_y == 0])
    x_tr = tr_x_0[:ntrain]
    x_te = tr_x_0[ntrain:ntrain + ntest]
    outliers = [np.random.permutation(tr_x[tr_y == i])[:ntest]
                for i in xrange(1, 10)]

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te, o_y_te, c_y_te = [], [], []
    for outlier in outliers:
        u_y_te.append(metrics.compute_uniqueness(outlier))
        o_y_te.append(metrics.compute_overlap(outlier))
        c_y_te.append(1 - metrics.compute_distance(outlier))

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = [np.zeros(ntrials) for _ in xrange(9)]
    svm_x_results = np.zeros(ntrials)
    svm_y_results = [np.zeros(ntrials) for _ in xrange(9)]

    # Iterate across the trials:
    for nt in xrange(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = [sp.predict(outlier) for outlier in outliers]

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te, o_sp_y_te, c_sp_y_te = [], [], []
        for y in sp_y_te:
            u_sp_y_te.append(metrics.compute_uniqueness(y))
            o_sp_y_te.append(metrics.compute_overlap(y))
            c_sp_y_te.append(1 - metrics.compute_distance(y))

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        for i, (a, b, c, d, e, f) in enumerate(zip(u_y_te, o_y_te, c_y_te,
                u_sp_y_te, o_sp_y_te, c_sp_y_te), 1):
            sp._log_stats('Input Novelty Class {0} Uniqueness'.format(i), a)
            sp._log_stats('Input Novelty Class {0} Overlap'.format(i), b)
            sp._log_stats('Input Novelty Class {0} Correlation'.format(i), c)
            sp._log_stats('SP Novelty Class {0} Uniqueness'.format(i), d)
            sp._log_stats('SP Novelty Class {0} Overlap'.format(i), e)
            sp._log_stats('SP Novelty Class {0} Correlation'.format(i), f)
        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp, o_sp, c_sp = np.zeros(9), np.zeros(9), np.zeros(9)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                u_sp[j] += metrics.compute_uniqueness(yt)
                o_sp[j] += metrics.compute_overlap(yt)
                c_sp[j] += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        for i in xrange(9):
            u_sp[i] /= ntest
            o_sp[i] /= ntest
            c_sp[i] /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
        for i, j in enumerate(xrange(1, 10)):
            sp._log_stats('Base Train to Novelty {0} Uniqueness'.format(j),
                u_sp[i])
            sp._log_stats('Base Train to Novelty {0} Overlap'.format(j),
                o_sp[i])
            sp._log_stats('Base Train to Novelty {0} Correlation'.format(j),
                c_sp[i])

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = np.array([len(np.where(clf.predict(outlier) == -1)[0]) /
            float(ntest) * 100 for outlier in outliers])

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = np.zeros(9)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            xo = metrics.compute_overlap(xt)
            if xo >= clf_th:
                clf_x_te += 1
            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                yo = metrics.compute_overlap(yt)
                if yo < clf_th:
                    clf_y_te[j] += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors; the base results are per trial and the
        # novelty results are per class, per trial
        sp_x_results[nt] = 100 - clf_x_te
        svm_x_results[nt] = 100 - svm_x_te
        for j in xrange(9):
            sp_y_results[j][nt] = 100 - clf_y_te[j]
            svm_y_results[j][nt] = 100 - svm_y_te[j]

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        for i, j in enumerate(xrange(1, 10)):
            sp._log_stats('SP % Correct Novelty Class {0}'.format(j),
                clf_y_te[i])
            sp._log_stats('SVM % Correct Novelty Class {0}'.format(j),
                svm_y_te[i])
        sp._log_stats('SP % Mean Correct Novelty Class', np.mean(clf_y_te))
        sp._log_stats('SVM % Mean Correct Novelty Class', np.mean(svm_y_te))
        sp._log_stats('SP % Adjusted Score', (np.mean(clf_y_te) * clf_x_te)
            / 100)
        sp._log_stats('SVM % Adjusted Score', (np.mean(svm_y_te) * svm_x_te)
            / 100)

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.svm import OneClassSVM
from scipy import stats

# Get the data
dataset = load_boston()
data = dataset["data"][:, [5, 12]]  # Banana-shaped data
contamination = 0.261
gamma = 0.1

# Fit the model
clf = OneClassSVM(nu=contamination, gamma=gamma)
clf.fit(data)

# Perform outlier detection
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = inlier_predicted_data.shape[0]
num_outliers_predicted = outlier_predicted_data.shape[0]

# Plot decision function values
xr = np.linspace(3, 10, 500)
yr = np.linspace(-5, 45, 500)
xx, yy = np.meshgrid(xr, yr)
zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
zz = zz.reshape(xx.shape)
scores = clf.decision_function(data)
threshold = stats.scoreatpercentile(scores, 100 * contamination)
# The original snippet is truncated after this call; passing zz as the third
# argument is the natural completion.
plt.contourf(xx, yy, zz)
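# A short follow-up sketch (an addition, not part of the original): use the
# percentile threshold computed above to draw the decision boundary and show
# the predicted inliers and outliers on the same axes.
plt.contour(xx, yy, zz, levels=[threshold], linewidths=2, colors='red')
plt.scatter(inlier_predicted_data[:, 0], inlier_predicted_data[:, 1],
            c='white', edgecolors='k', s=20)
plt.scatter(outlier_predicted_data[:, 0], outlier_predicted_data[:, 1],
            c='black', s=20)
plt.title('One-class SVM: %d inliers, %d outliers'
          % (num_inliers_predicted, num_outliers_predicted))
plt.show()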
    def Predict(self):
        if self.ID < 0:
            self.ErrorMessage.setIcon(QMessageBox.Information)
            self.ErrorMessage.setText("You are not logged in")
            self.ErrorMessage.setWindowTitle("Warning!")
            self.ErrorMessage.exec_()
        elif self.String == self.Accounts[self.ID].AccountPassword:
            y = []
            for i in range(len(self.Accounts)):
                if self.Accounts[self.ID].AccountPassword == \
                        self.Accounts[i].AccountPassword:
                    for x in range(len(self.Accounts[i].TrainData)):
                        y.append(self.Accounts[i].AccountName)
            sts = len(list(set(y)))
            self.ProcessData()
            Xset = []
            Yset = []
            sz = len(self.Accounts[self.ID].AccountPassword) * 2
            for j in range(len(self.Accounts[self.ID].TrainData)):
                Xset.append(array(self.Accounts[self.ID].TrainData)[j][sz:])
                Yset.append(1)
            Xset = array(Xset)
            Yset = array(Yset)
            trainx, testx, trainy, testy = train_test_split(Xset, Yset,
                                                            test_size=0.3,
                                                            random_state=2)
            trainx = array(trainx)
            X = []
            multiy = []
            multi2y = []
            if sts > 1:
                for i in range(len(self.Accounts)):
                    if self.Accounts[self.ID].AccountPassword == \
                            self.Accounts[i].AccountPassword and self.ID != i:
                        hold = []
                        for k in range(len(self.Accounts[i].TrainData)):
                            # Note: 16 is hard-coded here, unlike the sz-based
                            # slicing above
                            hold.append(self.Accounts[i].TrainData[k][16:])
                        X = X + hold
                        for x in range(len(self.Accounts[i].TrainData)):
                            multiy.append(-1)
                            multi2y.append(0)
                X = array(X)
                multiy = array(multiy)
                multi2y = array(multi2y)
                testx = np.concatenate((testx, X))
                testymone = np.concatenate((testy, multiy))
                testymzero = np.concatenate((testy, multi2y))
            if sts == 1:
                testymone = testy
                testymzero = testy
            Osvm = OneClassSVM(kernel='rbf', gamma="auto").fit(trainx)
            Ypredict = Osvm.predict(testx)
            score = f1_score(testymone, Ypredict, pos_label=1)
            kmeans = KMeans(n_clusters=2, random_state=0).fit(trainx)
            Ypredict = kmeans.predict(testx)
            score1 = f1_score(testymzero, Ypredict, pos_label=1)
            brc = Birch(n_clusters=2, threshold=0.01).fit(trainx)
            Ypredict = brc.predict(testx)
            score2 = f1_score(testymzero, Ypredict, pos_label=1)
            IsF = IsolationForest(contamination=0.01)
            IsF.fit(trainx)
            Ypredict = IsF.predict(testx)
            score3 = f1_score(testymone, Ypredict, pos_label=1)
            ev = EllipticEnvelope(contamination=0.01)
            ev.fit(trainx)
            Ypredict = ev.predict(testx)
            score4 = f1_score(testymone, Ypredict, pos_label=1)
            if Osvm.predict([self.Dwell + self.Flight]) == 1:
                OsvmResult = 'pass'
            else:
                OsvmResult = 'fail'
            if kmeans.predict([self.Dwell + self.Flight]) == 1:
                kmResult = 'pass'
            else:
                kmResult = 'fail'
            if brc.predict([self.Dwell + self.Flight]) == 1:
                brcResult = 'pass'
            else:
                brcResult = 'fail'
            if IsF.predict([self.Dwell + self.Flight]) == 1:
                IsFResult = 'pass'
            else:
                IsFResult = 'fail'
            if ev.predict([self.Dwell + self.Flight]) == 1:
                evResult = 'pass'
            else:
                evResult = 'fail'
            #print(score, score1, score2, score3, score4)
            self.TrainText.setText(
                "Score/Model" + " \n" +
                str(round(score, 2)) + " Osvm: " + OsvmResult + " \n" +
                str(round(score1, 2)) + " Km: " + kmResult + " \n" +
                str(round(score2, 2)) + " Brc: " + brcResult + " \n " +
                str(round(score3, 2)) + " ISF: " + IsFResult + " \n" +
                str(round(score4, 2)) + " Ev: " + evResult)
            #if sts > 1:
            #    self.CompareText.setText(self.Accounts[self.ID].AccountPassword)
            #    self.Compare()
            # prediction = self.clf.predict([self.Dwell+self.Flight])
            # str1 = str(prediction)
            # self.TrainText.setText(str(prediction))
            self.Reset()
        else:
            self.ErrorMessage.setIcon(QMessageBox.Information)
            self.ErrorMessage.setText("Your password is wrong")
            self.ErrorMessage.setWindowTitle("Warning!")
            self.ErrorMessage.exec_()
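# The five pass/fail branches above repeat the same pattern. A compact
# alternative (a sketch, not the original code) keeps the fitted models in a
# dict and evaluates them in one loop; the names here are illustrative only.
def evaluate_models(models, sample):
    """models: {name: fitted estimator}; sample: one feature vector.
    Returns {name: 'pass'|'fail'} using the same ==1 convention as above."""
    results = {}
    for name, model in models.items():
        results[name] = 'pass' if model.predict([sample])[0] == 1 else 'fail'
    return results

# Example usage:
# results = evaluate_models({'Osvm': Osvm, 'Km': kmeans, 'Brc': brc,
#                            'ISF': IsF, 'Ev': ev},
#                           self.Dwell + self.Flight)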
import numpy as np
import matplotlib.pyplot as plt
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

trainX, trainY = train_df[['mean', 'zc']], train_df['label']
testX, testY = test_df[['mean', 'zc']], test_df['label']

sc = StandardScaler()
sc.fit(trainX)
trainX = sc.transform(trainX)
testX = sc.transform(testX)

model = OneClassSVM()
model.fit(trainX)
y_pred = model.predict(testX)
# Map the SVM's -1 (outlier) predictions to 1 and +1 (inlier) to 0
pred = np.where(y_pred == -1, 1, 0)

def create_power_spectral(data):
    N = data.shape[1]
    dt = 10 / N
    F = np.abs(np.fft.fft(data) / (N / 2))
    fq = np.linspace(0, 1 / dt, N)
    return F[:, :int(N / 2) + 1], fq[:int(N / 2) + 1]

F, freq = create_power_spectral(train)
plt.plot(freq, F[0])

melspec = librosa.feature.melspectrogram(train[0])
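# Assuming the label convention above (1 = anomaly, 0 = normal), a quick check
# of the mapped predictions against the held-out labels might look like this
# (an addition, not part of the original):
from sklearn.metrics import classification_report
print(classification_report(testY, pred))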
    def predict_rate_features(self, pkt_featurizer):
        group_id = pkt_featurizer.pkt_type
        features = pkt_featurizer.features
        arrival_time = pkt_featurizer.arrival_time
        try:
            if len(self.time_delta3[group_id]) <= 1:
                raise ValueError
            td1 = arrival_time - self.time_data[group_id][-1]
            td2 = td1 - self.time_delta1[group_id][-1]
            td3 = td2 - self.time_delta2[group_id][-1]
            """
            if self.plot:
                self.t_fig.cla()
                self.prep_figure(self.t_fig, "Time", "Pkt", grid=True)
                self.t_fig.scatter(self.time_data[group_id],
                                   range(len(self.time_data[group_id])))
            """
            dbscan1 = DBSCAN()
            dbscan2 = DBSCAN()
            dbscan3 = DBSCAN()
            td1_training = np.array(list(self.time_delta1[group_id]) +
                                    [td1]).reshape(-1, 1)
            td2_training = np.array(list(self.time_delta2[group_id]) +
                                    [td2]).reshape(-1, 1)
            td3_training = np.array(list(self.time_delta3[group_id]) +
                                    [td3]).reshape(-1, 1)
            labels1 = dbscan1.fit_predict(td1_training)
            labels2 = dbscan2.fit_predict(td2_training)
            labels3 = dbscan3.fit_predict(td3_training)
            db_predict1 = labels1[-1] == -1
            db_predict2 = labels2[-1] == -1
            db_predict3 = labels3[-1] == -1
            if self.plot:
                self.plot_1d_dbscan(td1_training, labels1,
                    list(self.time_data[group_id])[(len(self.time_data[group_id]) -
                    len(self.time_delta1[group_id])):] + [arrival_time],
                    self.td1_fig_dbscan, "", "Pkt/Time",
                    "Pkt Rate DBSCAN Clustering - Anomalous Pkts in Black")
                self.plot_1d_dbscan(td2_training, labels2,
                    list(self.time_data[group_id])[(len(self.time_data[group_id]) -
                    len(self.time_delta2[group_id])):] + [arrival_time],
                    self.td2_fig_dbscan, "", "Pkt/Time^2")
                self.plot_1d_dbscan(td3_training, labels3,
                    list(self.time_data[group_id])[(len(self.time_data[group_id]) -
                    len(self.time_delta3[group_id])):] + [arrival_time],
                    self.td3_fig_dbscan, "Time", "Pkt/Time^3")
            scaler1 = preprocessing.StandardScaler()
            scaler2 = preprocessing.StandardScaler()
            scaler3 = preprocessing.StandardScaler()
            time_training1 = scaler1.fit_transform(
                np.array(self.time_delta1[group_id]).reshape(-1, 1))
            time_features1 = scaler1.transform(np.array(td1).reshape(1, -1))
            time_training2 = scaler2.fit_transform(
                np.array(self.time_delta2[group_id]).reshape(-1, 1))
            time_features2 = scaler2.transform(np.array(td2).reshape(1, -1))
            time_training3 = scaler3.fit_transform(
                np.array(self.time_delta3[group_id]).reshape(-1, 1))
            time_features3 = scaler3.transform(np.array(td3).reshape(1, -1))
            time_classifier1 = OneClassSVM().fit(time_training1)
            time_prediction1 = time_classifier1.predict(time_features1)
            time_classifier2 = OneClassSVM().fit(time_training2)
            time_prediction2 = time_classifier2.predict(time_features2)
            time_classifier3 = OneClassSVM().fit(time_training3)
            time_prediction3 = time_classifier3.predict(time_features3)
            if self.plot:
                self.plot_1d_svm(self.time_delta1[group_id], time_classifier1,
                    list(self.time_data[group_id])[(len(self.time_data[group_id]) -
                    len(self.time_delta1[group_id])):], scaler1,
                    self.td1_fig_svm, "", "Pkt/Time",
                    "Pkt Rate One Class SVM Classification")
                self.plot_1d_svm(self.time_delta2[group_id], time_classifier2,
                    list(self.time_data[group_id])[(len(self.time_data[group_id]) -
                    len(self.time_delta2[group_id])):], scaler2,
                    self.td2_fig_svm, "", "Pkt/Time^2")
                self.plot_1d_svm(self.time_delta3[group_id], time_classifier3,
                    list(self.time_data[group_id])[(len(self.time_data[group_id]) -
                    len(self.time_delta3[group_id])):], scaler3,
                    self.td3_fig_svm, "Time", "Pkt/Time^3")
        except (KeyError, IndexError, ValueError) as e:
            print e
            db_predict1, db_predict2, db_predict3 = 0, 0, 0
        return db_predict1, db_predict2, db_predict3
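# td1/td2/td3 above are successive differences of the arrival times (rate,
# acceleration, jerk of the packet stream). A sketch of the bookkeeping this
# method assumes; the deque-backed history stores and the updater are
# hypothetical, not taken from the original class:
from collections import defaultdict, deque

time_data = defaultdict(deque)
time_delta1 = defaultdict(deque)
time_delta2 = defaultdict(deque)
time_delta3 = defaultdict(deque)

def update_time_history(group_id, arrival_time):
    # Each level is the first difference of the level above it.
    if time_data[group_id]:
        td1 = arrival_time - time_data[group_id][-1]
        if time_delta1[group_id]:
            td2 = td1 - time_delta1[group_id][-1]
            if time_delta2[group_id]:
                time_delta3[group_id].append(td2 - time_delta2[group_id][-1])
            time_delta2[group_id].append(td2)
        time_delta1[group_id].append(td1)
    time_data[group_id].append(arrival_time)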
# Read the data
data_path1 = './data/13A17ProAll.xlsx'
data = []
read_xsls(data_path1)
clf = OneClassSVM(gamma='auto', nu=0.001).fit(data)

Y = []
data_xsls = xlrd.open_workbook("./data/13A17ProAll.xlsx")
sheet_name = data_xsls.sheets()[0]
#count_nrows = sheet_name.nrows
for i in range(30000, 60000):
    a = []
    for j in range(9):
        a.append(sheet_name.cell_value(i, j + 1))
    Y.append(clf.predict([a]))

# Smooth the predictions with a 20-sample moving average
Z = []
for i in range(len(Y)):
    n = 0
    if i < 19:
        Z.append(Y[i])
    else:
        for j in range(20):
            n += (Y[i - j] / 20)
        Z.append(n)

X = []
for i in range(len(Y)):
    X.append(i + 30000)
plt.plot(X, Z)
plt.show()
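# The smoothing loop above is a 20-sample moving average. An equivalent,
# vectorized sketch with numpy (an addition, not part of the original):
import numpy as np

y = np.array(Y).ravel().astype(float)
kernel = np.ones(20) / 20.0
z_smooth = np.convolve(y, kernel, mode='valid')  # leading 19 samples dropped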
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1',
                    ntrials=10, verbose=True, seed=123456789):
    """
    Run a single experiment, locally.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param exp_name: The name of the experiment.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.

    @return: A tuple containing the percentage errors for the SP's training
    and testing results and the SVM's training and testing results,
    respectively.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5
    log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
                           'novelty_experiments', exp_name)

    # Configure the SP
    config = {
        'ninputs': 100,
        'trim': 1e-4,
        'disable_boost': True,
        'seed': seed,
        'pct_active': None,
        'random_permanence': True,
        'pwindow': 0.5,
        'global_inhibition': True,
        'ncolumns': 200,
        'nactive': 50,
        'nsynapses': 75,
        'seg_th': 15,
        'syn_th': 0.5,
        'pinc': 0.001,
        'pdec': 0.001,
        'nepochs': 10,
        'log_dir': log_dir
    }

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in xrange(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
            * x_ds.base_class.astype('i')).sum() / 40.)
        print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
            outlier_base.astype('i')).sum() / 40.)
        print 'Overlap between two classes: {0}'.format(np.dot(
            x_ds.base_class.astype('i'), outlier_base.astype('i')))

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)
    c_y_te = 1 - metrics.compute_distance(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i in xrange(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2
        config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)
        c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('Input Novelty Class Test Correlation', c_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
        sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
            print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
                u_sp_x_te, u_sp_y_te)
            print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                o_sp_x_te, o_sp_y_te)
            print fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te,
                c_sp_x_tr, c_sp_x_te, c_sp_y_te)

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        c_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
            c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest
        c_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness',
            u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Correlation',
            c_sp_base_to_y_te)

        # Print the results
        if verbose:
            print '\nDescription\tx_tr->x_te\tx_tr->y_te'
            print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
                u_sp_base_to_y_te)
            print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
                o_sp_base_to_y_te)
            print 'Correlation:\t{0:2.4f}\t{1:2.4f}'.format(c_sp_base_to_x_te,
                c_sp_base_to_y_te)

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * 100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * 100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
            print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
            print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
            print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
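# A hypothetical usage sketch (an addition, not part of the original):
# summarize the per-trial error arrays returned by base_experiment.
if __name__ == '__main__':
    sp_x, sp_y, svm_x, svm_y = base_experiment(noverlap_bits=0, verbose=False)
    print 'SP base error    : {0:2.2f} +/- {1:2.2f}'.format(sp_x.mean(),
        sp_x.std())
    print 'SP novelty error : {0:2.2f} +/- {1:2.2f}'.format(sp_y.mean(),
        sp_y.std())
    print 'SVM base error   : {0:2.2f} +/- {1:2.2f}'.format(svm_x.mean(),
        svm_x.std())
    print 'SVM novelty error: {0:2.2f} +/- {1:2.2f}'.format(svm_y.mean(),
        svm_y.std())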
# Merge X1_test_n and X0_outliers_n into X_TEST_n
X_TEST_n = np.concatenate((X1_test_n, X0_outliers_n))
# Merge Y1_test and Y0
Y_TEST = np.concatenate((Y1_test, Y0))

pca = PCA(n_components=0.95)
reducer = pca.fit(X1_train_n)
X1_train_n_reduced = reducer.transform(X1_train_n)
X_TEST_n_reduced = reducer.transform(X_TEST_n)

clf = OneClassSVM(gamma='auto', nu=0.5)
clf.fit(X1_train_n_reduced)
Y1_pred_train = clf.predict(X1_train_n_reduced)
Y_pred_TEST = clf.predict(X_TEST_n_reduced)

# EVALUATION
# TRAIN SET
# Confusion matrix
confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.5)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
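# With n_components=0.95, PCA keeps as many components as needed to explain
# 95% of the training variance. A quick check of what was selected (an
# addition, not part of the original):
print('Components kept:', pca.n_components_)
print('Explained variance ratio sum: %.4f'
      % pca.explained_variance_ratio_.sum())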
# Merge X1_test_n and X0_outliers_n into X_TEST_n
X_TEST_n = np.concatenate((X1_test_n, X0_outliers_n))
# Merge Y1_test and Y0
Y_TEST = np.concatenate((Y1_test, Y0))

clf = OneClassSVM(gamma='auto', nu=0.1)
clf.fit(X1_train_n)
Y1_pred_train = clf.predict(X1_train_n)
Y_pred_TEST = clf.predict(X_TEST_n)

# EVALUATION
# TRAIN SET
# Confusion matrix
confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        # The snippet is truncated here; the identical block above suggests
        # the same annotation call:
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
# Drop the rows flagged as outliers by Isolation Forest. Collect the flagged
# positions first: dropping inside the prediction loop would shift the
# positional indices and remove the wrong rows.
outlier_positions = []
IFlen = []
for count, x in enumerate(IF.predict(df)):
    if x < 0:
        outlier_positions.append(count)
        IFlen.append(x)
df = df.drop(df.index[outlier_positions])
dfOnlyClasses = dfOnlyClasses.drop(dfOnlyClasses.index[outlier_positions])
#print(df)
#print(dfOnlyClasses)

# Outlier detection using one-class SVM
OCSVM = OneClassSVM(gamma='auto').fit(dataFrame.drop(columns="class"))
# Negative values are outliers
outliersWithOCSVM = [
    x for x in OCSVM.predict(dataFrame.drop(columns="class")) if x < 0
]
print('number of outliers using Isolation Forest:')
print(len(IFlen))
print('number of outliers using one-class SVM:')
print(len(outliersWithOCSVM))

# In[2]:

# Import what's needed to build the ANN
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
    1 if i > 1 else 0 for i in lof.negative_outlier_factor_ * -1
]

#-------------------------------------------------------------------------------------------------#
#------------------------------------------One-Class SVM------------------------------------------#
#-------------------------------------------------------------------------------------------------#
# nu is set to the expected proportion of outliers; gamma to 1/n_features
expected_perc_outliers = round(sum(d.diagnosis) / len(d), 1)
boundary_smoothness = 1 / len(d.columns[3:12])
ocsvm = OneClassSVM(kernel='rbf', nu=expected_perc_outliers,
                    gamma=boundary_smoothness, random_state=14)
ocsvm.fit(d.iloc[:, 3:12])
ocsvm_outliers = np.where(ocsvm.predict(d.iloc[:, 3:12]) == -1)[0].tolist()
print("Indices of outliers found by One-Class SVM: \n", ocsvm_outliers)
d['ocsvm_outlier'] = [
    1 if i == -1 else 0 for i in ocsvm.predict(d.iloc[:, 3:12])
]

#-------------------------------------------------------------------------------------------------#
#---------------------------------------Isolation Forest------------------------------------------#
#-------------------------------------------------------------------------------------------------#
expected_perc_outliers = round(sum(d.diagnosis) / len(d), 1)
isoforest = IsolationForest(n_estimators=99,
                            contamination=expected_perc_outliers,
                            max_features=1.0, random_state=14)
isoforest.fit(d.iloc[:, 3:12])
# plt.imshow(X_train[1].reshape(resize_size))

# In[120]:

clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.01)

# In[121]:

clf.fit(X_train)

# In[122]:

print "Error training: %d/%d" % (X_train[clf.predict(X_train) == -1].shape[0],
    X_train.shape[0])
print "Error validation: %d/%d" % (X_val[clf.predict(X_val) == -1].shape[0],
    X_val.shape[0])

# In[108]:

import cPickle as pickle

# In[123]:

pickle.dump(clf, open('full_retinal_img_clf.pkl', 'wb'))

# In[124]: