class svm_model():
    """Thin wrapper around a one-class SVM novelty detector."""

    def train(self, X, ker):
        """Fit a ``OneClassSVM`` with kernel ``ker`` on the samples ``X``.

        The fitted estimator is stored on ``self.model``.
        """
        # ``random_state`` was dropped: OneClassSVM ignored it and current
        # scikit-learn releases no longer accept the argument at all.
        self.model = OneClassSVM(kernel=ker, shrinking=True)
        self.model.fit(X)

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        ``train`` must have been called first, otherwise ``self.model`` does
        not exist and an ``AttributeError`` is raised.
        """
        return self.model.predict(X)
def _five_number_summary(series):
    """Return ``[max, Q3, median, Q1, min]`` of a 1-D numeric sequence."""
    return np.percentile(series, [100, 75, 50, 25, 0]).tolist()


def main():
    """Train a one-class SVM on box-plot features of random series.

    1000 random integer series of varying length are each reduced to a
    five-number summary (max, 75th percentile, median, 25th percentile, min).
    A default ``OneClassSVM`` is fitted on those summaries and then applied
    to 10 much longer series drawn from a wider value range.

    Returns:
        The array of predictions for the 10 test series.
    """
    n = 1000

    # The series have different lengths, so they are kept in a plain list:
    # wrapping ragged sequences in np.array() raises on recent NumPy.
    train_series = [
        np.random.randint(0, 5000, size=np.random.randint(20, 150))
        for _ in range(n)
    ]

    # making all the data into 5 dimensions (howto: boxplot)
    # NOTE: an alternative featurization based on a pairwise DTW distance
    # matrix used to live here as commented-out code; it was never executed.
    x_train = np.array([_five_number_summary(s) for s in train_series])

    # build model with x
    model = OneClassSVM()
    model.fit(x_train)

    # create test dataset
    test_series = [
        np.random.randint(0, 10000, size=np.random.randint(20000, 30000))
        for _ in range(10)
    ]

    # transform test dataset
    x_test = np.array([_five_number_summary(s) for s in test_series])

    # predict test dataset
    return model.predict(x_test)
def fit(self, X, Y, W):
    """Fit a one-class SVM on ``X`` and wrap it in ``OneClassSVMClassifier``.

    Args:
        X: Training samples.
        Y: Unused; one-class training needs no targets. Kept for interface
            compatibility.
        W: Optional per-sample weights (anything with ``reshape``), or
            ``None`` for unweighted training.

    Returns:
        An ``OneClassSVMClassifier`` built from the fitted estimator.
    """
    clf = OneClassSVM(kernel=self.kernel, degree=self.degree,
                      gamma=self.gamma, coef0=self.coef0, tol=self.tol,
                      nu=self.nu, shrinking=self.shrinking,
                      cache_size=self.cache_size, max_iter=self.max_iter)
    if W is not None:
        # Pass the weights by keyword: the second positional argument of
        # OneClassSVM.fit is the ignored ``y``, so a positional W would be
        # silently dropped.
        return OneClassSVMClassifier(
            clf.fit(X, sample_weight=W.reshape(-1)))
    return OneClassSVMClassifier(clf.fit(X))
class Cluster(object):
    '''Named collection of feature vectors with a PCA + one-class SVM
    novelty detector trained on them.'''

    def __init__(self, name):
        self.name = name
        self.raw_dataset = []   # raw sessions, consumed by extract_features
        self.dataset = []       # extracted feature vectors
        self.dataset_red = []   # feature vectors after reduce_dim

    def get_featurevec(self, data):
        '''Takes in data in the form of an array of EmoPackets, and outputs a
        list of feature vectors, one per (overlapping) bin.'''
        stride = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        # Floor division keeps the bin count an integer on Python 3 as well;
        # a non-positive count simply yields no bins.
        num_bins = (len(data) // stride
                    - int(dsp.BIN_SIZE / dsp.STAGGER) + 1)
        return [dsp.get_features(data[i * stride:i * stride + size])
                for i in range(num_bins)]

    def add_data(self, raw):
        '''Appends the feature vectors of new data to the dataset.
        Expects a list of EmoPackets. Does not retrain by itself: call
        reduce_dim() and train() afterwards.'''
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        '''Does feature extraction for all of the datasets.'''
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.dataset)
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)

    def train(self):
        '''Trains the classifier on the reduced dataset.'''
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        '''Says whether or not the bin is novel. Expects an array of
        EmoPackets; only its first bin is classified. The point is then
        added to the reduced dataset and the classifier is retrained.
        Requires reduce_dim() and train() to have been called.'''
        # The original read an undefined name ``data`` here instead of ``pt``.
        features = np.atleast_2d(self.get_featurevec(pt)[0])
        X = self.pca.transform(features)
        ans = self.svm.predict(X)
        # dataset_red is an ndarray after reduce_dim(), so it has no append().
        self.dataset_red = np.vstack([self.dataset_red, X])
        self.train()
        return ans

    def save(self):
        '''Saves this classifier to a data directory next to this module.'''
        this_dir = os.path.split(__file__)[0]
        DATA_PATH = os.path.join(this_dir, "data", self.name + '.pkl')
        # The context manager closes the file even if pickling fails.
        with open(DATA_PATH, "wb") as dumpfile:
            pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
def runClassifier(self, _driverId, numComponents=0):
    """Score every feature vector with a one-class SVM and label it.

    Reads ``self.featuresHash`` and stores its keys in ``self.ids``. The
    features are optionally reduced with ``self.dimRed`` when
    ``self.runDimRed`` is set. ``self.label`` is set to a list of ints:
    1 where the decision score is above the ``self.outliers_fraction``
    percentile of all scores, 0 otherwise.

    Args:
        _driverId: Unused by this method.
        numComponents: Forwarded to ``self.dimRed``.
    """
    # Materialize the dict views: on Python 3 they are lazy objects that the
    # estimator cannot consume, and ``map`` would leave an iterator in label.
    X = list(self.featuresHash.values())
    self.ids = list(self.featuresHash.keys())
    if self.runDimRed:
        X = self.dimRed(X, numComponents)
    clf = OCSVM(nu=self.nu, gamma=self.gamma)
    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * self.outliers_fraction)
    self.label = [int(flag) for flag in y_pred > threshold]
def select_best_support_vectors(data, nu=0.01,
                                all_gammas=2.0 ** np.arange(-10, 10, 1)):
    """Pick the RBF ``gamma`` whose model best matches the target ``nu``.

    For every candidate gamma a ``OneClassSVM`` is fitted on ``data``. The
    error is the squared deviation of the training outlier fraction from
    ``nu`` plus the squared deviation of the support-vector fraction from
    ``nu``.

    Args:
        data: Training samples.
        nu: Target outlier / support-vector fraction.
        all_gammas: Candidate gamma values. The default uses a float base,
            because an integer base with negative integer exponents makes
            NumPy raise when the function is defined.

    Returns:
        ``(best_gamma, all_errors)``, with one error per candidate.

    Raises:
        ValueError: If ``all_gammas`` is empty (raised by ``np.argmin``).
    """
    n_samples = len(data)
    all_errors = []
    for gamma in all_gammas:
        clf = OneClassSVM(nu=nu, gamma=gamma)
        clf.fit(data)
        out_of_class_count = np.sum(clf.predict(data) == -1)
        support_vectors_count = len(clf.support_vectors_)
        error = (float(out_of_class_count) / n_samples - nu) ** 2
        error += (float(support_vectors_count) / n_samples - nu) ** 2
        all_errors.append(error)
    index = np.argmin(all_errors)
    return all_gammas[index], all_errors
def embed_dat_matrix_two_dimensions(low_dimension_data_matrix, y=None,
                                    labels=None, density_colormap='Blues',
                                    instance_colormap='YlOrRd'):
    """Scatter-plot 2-D points over a one-class-SVM density background.

    The first two columns of the (standardized) matrix are plotted with
    ``plt``. The background is a filled contour of the decision scores of a
    ``OneClassSVM`` fitted on the points.

    Args:
        low_dimension_data_matrix: Array with at least two columns.
        y: Colours (or values mapped through ``instance_colormap``) for the
            points; defaults to ``'white'``.
        labels: Optional sequence with one annotation per row.
        density_colormap: Colormap name for the contour background.
        instance_colormap: Colormap name for the scatter points.
    """
    from sklearn.preprocessing import scale
    from sklearn.svm import OneClassSVM

    low_dimension_data_matrix = scale(low_dimension_data_matrix)
    xs = low_dimension_data_matrix[:, 0]
    ys = low_dimension_data_matrix[:, 1]

    # make mesh
    step_num = 50
    # step size in the mesh
    h = min((xs.max() - xs.min()) / step_num,
            (ys.max() - ys.min()) / step_num)
    b = h * 10  # border size
    x_min, x_max = xs.min() - b, xs.max() + b
    y_min, y_max = ys.min() - b, ys.max() + b
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # induce a one class model to estimate densities
    gamma = max(x_max - x_min, y_max - y_min)
    clf = OneClassSVM(gamma=gamma, nu=0.1)
    clf.fit(low_dimension_data_matrix)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    grid = np.c_[xx.ravel(), yy.ravel()]
    if hasattr(clf, "decision_function"):
        score_matrix = clf.decision_function(grid)
    else:
        score_matrix = clf.predict_proba(grid)[:, 1]

    # Put the result into a color plot. np.min/np.max give scalar bounds
    # whatever the shape of the score array.
    levels = np.linspace(np.min(score_matrix), np.max(score_matrix), 40)
    score_matrix = score_matrix.reshape(xx.shape)

    if y is None:
        y = 'white'
    plt.contourf(xx, yy, score_matrix, cmap=plt.get_cmap(density_colormap),
                 alpha=0.9, levels=levels)
    plt.scatter(xs, ys, alpha=.5, s=70, edgecolors='gray', c=y,
                cmap=plt.get_cmap(instance_colormap))

    # labels (loop names chosen so they neither clobber the ``y`` argument
    # nor shadow the builtin ``id``)
    if labels is not None:
        for row, (px, py) in enumerate(low_dimension_data_matrix[:, :2]):
            plt.annotate(labels[row], xy=(px, py), xytext=(0, 0),
                         textcoords='offset points')
def svm(data, fraction=0.05, kernel='poly', degree=3, gamma=0, coeff=0):
    """Return the 1-based positions of the samples a one-class SVM rejects.

    Args:
        data: Samples to fit and score.
        fraction: Passed to the estimator as ``nu``.
        kernel: Kernel name.
        degree: Polynomial degree (used by the ``'poly'`` kernel).
        gamma: Kernel coefficient. The legacy value ``0`` is mapped to
            ``'auto'`` because current scikit-learn rejects a zero gamma.
        coeff: Independent kernel term, passed as ``coef0``.

    Returns:
        Integer array of shape ``(k, 1)`` holding the 1-based row numbers of
        the samples predicted as outliers (label ``-1``).
    """
    detector = OneClassSVM(kernel=kernel, degree=degree,
                           gamma=gamma if gamma else 'auto',
                           nu=fraction, coef0=coeff)
    detector.fit(data)
    score = detector.predict(data)
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)
    # Keep only the rows flagged as anomalies, i.e. drop every inlier (+1).
    return numeration[score == -1]
def outlier_detect(data_frame):
    """Return the ``hourly`` values of the rows a one-class SVM flags.

    Args:
        data_frame: pandas DataFrame with the five count columns listed
            below and an ``hourly`` column.

    Returns:
        List of ``hourly`` values for the rows predicted as outliers (-1).
    """
    # pandas to numpy - digestible by scikit
    columns = ['blm_tag_count', 'protest_count', 'justice_count',
               'riot_count', 'breathe_count']
    features = data_frame[columns].values
    clf = OneClassSVM(nu=0.008, gamma=0.05)
    clf.fit(features)
    y_pred = clf.predict(features)
    # Plain boolean mask: wrapping it in a list is not a valid boolean index
    # on current NumPy.
    mask = y_pred == -1
    oak_array = np.asarray(data_frame.hourly)
    return list(oak_array[mask])
def select_best_outlier_fraction_cross_val(data, nu=0.05,
                                           all_gammas=2 ** np.linspace(-10, 10, 50),
                                           folds_count=7):
    """Cross-validated search for the gamma that best reproduces ``nu``.

    For each candidate gamma the error is averaged over the folds. Per fold
    it is the squared deviation of the held-out outlier fraction from ``nu``
    plus the squared deviation of the support-vector fraction from ``nu``.

    Args:
        data: 2-D array of samples (indexed as ``data[rows, :]``).
        nu: Target outlier fraction.
        all_gammas: Candidate gamma values. The default is a 50-point
            logarithmic grid; the previous ``np.arange(-10, 10, 50)`` held a
            single integer exponent and failed at definition time.
        folds_count: Number of folds.

    Returns:
        ``(best_index, all_errors)``: the index into ``all_gammas`` with the
        smallest mean error, and the list of mean errors.
    """
    all_errors = []
    # NOTE(review): legacy KFold(n, n_folds=...) signature; it must match
    # whichever KFold the file imports.
    kf_iterator = KFold(len(data), n_folds=folds_count)
    for gamma in all_gammas:
        error = 0
        for train, test in kf_iterator:
            train_data = data[train, :]
            test_data = data[test, :]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            outlier_fraction = np.mean(prediction == -1)
            # NOTE(review): the support-vector fraction is taken relative to
            # the full data set, not the training fold - confirm intent.
            sv_fraction = float(clf.support_vectors_.shape[0]) / len(data)
            error += (nu - outlier_fraction) ** 2 + (sv_fraction - nu) ** 2
        all_errors.append(error / folds_count)
    # Select over all candidates; the original took argmin of the last
    # scalar error, which is always 0.
    best_index = np.argmin(all_errors)
    return int(best_index), all_errors
class OneClassSVMDetector(BaseOutlier):
    """Univariate anomaly detector backed by ``OneClassSVM``."""

    @staticmethod
    def get_attributes():
        """Return the tunable attributes with their defaults / choices."""
        return {
            "nu": 0.1,
            "kernel": ['rbf', 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
            "gamma": 0.1,
        }

    def __init__(self, nu=0.1, kernel='rbf', gamma=0.1):
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma

    def fit(self, data=None):
        """Fit the detector on a single series of values.

        ``data`` is stored on ``self.data``. When it is a pandas object its
        gaps are interpolated in place, which mutates the caller's object.

        Returns:
            ``self``.
        """
        self.data = data
        self.check_finite(data)
        if self._is_using_pandas(data):
            self.data.interpolate(inplace=True)
        self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel,
                               gamma=self.gamma)
        # np.asarray first: pandas objects do not offer ``reshape``.
        self.clf.fit(np.asarray(data).reshape(-1, 1))
        return self

    def predict(self, X_test):
        """Return a DataFrame of the points predicted as anomalies.

        The frame has the columns ``timestamp`` and ``anoms``. Positions of
        the outliers found in ``X_test`` are looked up in the data given to
        ``fit``.

        NOTE(review): this presumably expects ``X_test`` to be aligned with
        the fitted data (as in ``fit_predict``) - confirm with callers.
        """
        y_pred = self.clf.predict(np.asarray(X_test).reshape(-1, 1))
        outlier_idx = np.flatnonzero(y_pred == -1)
        d = {
            'timestamp': self.data.index[outlier_idx],
            'anoms': self.data.iloc[outlier_idx]
        }
        anoms = pd.DataFrame(d)
        self.anomaly_idx = anoms.index
        self.anom_val = anoms['anoms']
        return anoms

    def fit_predict(self, data=None):
        """Fit on ``data`` and return its anomalies."""
        self.fit(data)
        return self.predict(data)

    def plot(self):
        """Plot the fitted series with the detected anomalies in red.

        Requires ``predict`` (or ``fit_predict``) to have been called.

        Returns:
            The matplotlib figure.
        """
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(1, 1)
        ax.plot(self.data, 'b')
        ax.plot(self.anomaly_idx, self.anom_val, 'ro')
        ax.set_title('Detected Anomalies')
        ax.set_ylabel('Count')
        f.tight_layout()
        return f
def cross_validate():
    """Train a one-class SVM on TARGET == 0 rows and report test metrics.

    For tinkering with the model. Reads ``./data/train.csv`` and holds out
    as many random TARGET == 0 rows as there are TARGET == 1 rows. The model
    is trained on the remaining zeros and evaluated on the held-out zeros
    plus all ones. The confusion matrix and the accuracy are printed.
    """
    # read data
    all_df = pd.read_csv('./data/train.csv', index_col='ID')

    # split data
    zeros_df = all_df[all_df.TARGET == 0]
    ones_df = all_df[all_df.TARGET == 1]
    num_ones = ones_df.shape[0]
    msk = np.random.permutation(len(zeros_df)) < num_ones
    train_df = zeros_df[~msk]
    test_df = pd.concat([zeros_df[msk], ones_df])

    train_X = np.array(train_df.drop('TARGET', axis=1))
    test_X = np.array(test_df.drop('TARGET', axis=1))
    test_Y = np.array(test_df.TARGET)  # true target values

    # init svm
    print('training svm')
    my_svm = OneClassSVM(verbose=True)
    my_svm.fit(train_X)

    # predict
    print('predicting')
    raw_predictions = my_svm.predict(test_X)
    # The estimator answers +1 (inlier) / -1 (outlier) while TARGET is 0 / 1.
    # Map outliers to 1 so predictions and targets share one label space.
    predictions = np.where(raw_predictions == -1, 1, 0)

    conf_matrix = confusion_matrix(test_Y, predictions, labels=[0, 1])
    print('confusion matrix:')
    print(pd.DataFrame(conf_matrix, columns=[0, 1]))
    print('accuracy:')
    print(np.mean(test_Y == predictions))
def find_anomaly(label1, label2, winsize):
    """Print the windows of a bipolar channel that look most anomalous.

    The signal is the difference of the two named electrodes of the
    module-level ``eeg`` object. It is split into windows, six summary
    features are computed per window, and a polynomial one-class SVM is
    fitted on them. The 20 windows with the largest absolute decision score
    are printed.

    Args:
        label1: Name of the first electrode (looked up in ``eeg.chan_lab``).
        label2: Name of the second electrode.
        winsize: Window length; presumably seconds, since the sample count
            is divided by ``eeg.sample_rate`` and the offset is reported in
            minutes - TODO confirm.
    """
    print("Find anomaly in channel", label1 + '-' + label2 + '...', file=sys.stderr)
    print("-"*80)
    print("Channel [" + label1 + '-' + label2 + ']')
    print("-"*80)

    # find difference
    electrode1 = eeg.chan_lab.index(label1)
    electrode2 = eeg.chan_lab.index(label2)
    wave = eeg.X[electrode1] - eeg.X[electrode2]

    print("Splitting into windows...", file=sys.stderr)
    # array_split needs an integer section count of at least one.
    n_windows = max(1, int(len(wave) / eeg.sample_rate / winsize))
    wave_windows = np.array_split(wave, n_windows)

    print("Extracting features...", file=sys.stderr)

    def extract_features(wave_window):
        """Return [max, min, std, sum, sum of positives, sum of |x|]."""
        window = np.asarray(wave_window)
        return [window.max(), window.min(), np.std(window), window.sum(),
                window[window > 0].sum(), np.abs(window).sum()]

    # A list comprehension, because np.array(map(...)) wraps the lazy map
    # object instead of building a 2-D array on Python 3.
    Examples = np.array([extract_features(w) for w in wave_windows])

    print("Training model, assuming no more than", CONTAMINATION, "anomaly...", file=sys.stderr)
    od = OneClassSVM(nu=CONTAMINATION, kernel='poly', gamma=0.05, max_iter=100000)
    od.fit(Examples)
    # Flatten so 1-D and column-shaped score arrays are handled alike.
    decisions = np.ravel(od.decision_function(Examples))

    print("Most likely windows with anomaly:")
    # find most likely windows, in desc order
    # NOTE(review): ranking by absolute score also surfaces strongly
    # *normal* windows - confirm this is intended.
    largest_indices = np.argsort(-np.absolute(decisions))[:20]
    for large_index in largest_indices:
        print(large_index*winsize/60, "min (score:", decisions[large_index], ")")

    sys.stdout.flush()
def determine_test_similarity(self, model):
    """Fit one RBF one-class SVM per entry of ``model``.

    ``model`` is indexed with ``0 .. len(model) - 1``; each entry is the
    training data of one estimator.

    Returns:
        A pair of dicts keyed by that index: the estimators, and the values
        returned by their ``fit`` calls.
    """
    clf_OCSVM = {
        idx: OneClassSVM(kernel='rbf', nu=0.1, gamma=.023)
        for idx in range(len(model))
    }
    model_OCSVM = {
        idx: estimator.fit(model[idx])
        for idx, estimator in clf_OCSVM.items()
    }
    return clf_OCSVM, model_OCSVM
def plot_scatter(X_dict, y_dict, col1, col2, max_error, max_filled_gap, insens, f_colors = ['yellow', 'red', 'blue'], nu=0.98, high=0.95):
    """Scatter two feature columns with failure points and decision lines.

    Draws the no-failure points (per ``get_mask_norm``) in light grey and
    the pre-failure points of every plane with failures in its own colour.
    Adds the upper quantile thresholds of both columns (green) and the
    separating line of a linear one-class SVM fitted on the planes with
    failures (red).

    Args:
        X_dict: Mapping plane -> DataFrame of features.
        y_dict: Mapping plane -> failure indicator (must support ``sum``).
        col1: Column plotted on the x axis.
        col2: Column plotted on the y axis.
        max_error: Forwarded to ``get_mask_pref`` / ``get_mask_norm``.
        max_filled_gap: Unused by this function.
        insens: Forwarded to ``get_mask_norm``.
        f_colors: Colours for the planes with failures, cycled if there are
            more planes than colours.
        nu: ``nu`` of the linear one-class SVM (previously hard-coded).
        high: Upper quantile of the ``QuantileBinarizer`` (previously
            hard-coded).
    """
    planes = sorted(X_dict.keys())
    planes_with_failures = sorted(
        key for key in X_dict if y_dict[key].sum() > 0)

    ocsvm = OneClassSVM(kernel='linear', nu=nu)
    X_train = pd.concat({plane: X_dict[plane][[col1, col2]].dropna()
                         for plane in planes_with_failures})
    ocsvm.fit(X_train.values)
    qb = QuantileBinarizer(low=0.0, high=high, each_side=False)
    qb.fit(X_train)

    mask_norm = pd.concat(
        {plane: get_mask_norm(y_dict[plane], max_error, insens)
         for plane in planes},
        axis=0)

    plt.figure(figsize=(15, 15), dpi=100)
    plt.xlabel(col1)
    plt.ylabel(col2)

    all_points = pd.concat(X_dict)  # built once, used for both axes
    plt.scatter(all_points[col1].loc[mask_norm],
                all_points[col2].loc[mask_norm],
                c='lightgrey', zorder=1, s=6)

    for i, plane in enumerate(planes_with_failures):
        pref_mask = get_mask_pref(y_dict[plane], max_error)
        plt.scatter(X_dict[plane][col1].loc[pref_mask],
                    X_dict[plane][col2].loc[pref_mask],
                    c=f_colors[i % len(f_colors)], zorder=2, s=30)

    x_min, x_max, y_min, y_max = plt.axis('tight')
    plt.axvline(qb._thresholds[col1]['high'], c='green')
    plt.axhline(qb._thresholds[col2]['high'], c='green')

    # NOTE(review): the line is drawn as w0*x + w1*y = intercept_; check
    # the sign convention of ``intercept_`` for the scikit-learn in use.
    w0, w1 = ocsvm.coef_[0][0], ocsvm.coef_[0][1]
    plt.plot([x_min, x_max],
             [(ocsvm.intercept_ - w0 * x_min) / w1,
              (ocsvm.intercept_ - w0 * x_max) / w1],
             c='red')
def predict_header_features(self, pkt_featurizer):
    """Classify a packet's header features against its group's history.

    The history stored in ``self.training_data`` for the packet type is
    vectorized and scaled, and a default ``OneClassSVM`` is fitted on it.
    The packet's own features are then scored.

    Returns:
        ``(result, distance)``: the prediction and decision-function value
        for the packet, or ``(0, 0)`` when a ``KeyError`` is raised (for
        example when the group has no training data).
    """
    group_id = pkt_featurizer.pkt_type
    features = pkt_featurizer.features
    arrival_time = pkt_featurizer.arrival_time  # read but not used here

    try:
        history = self.training_data[group_id]

        vectorizer = DictVectorizer()
        vectorizer.fit(history)
        history_matrix = vectorizer.transform(history)
        packet_matrix = vectorizer.transform(features)

        scaler = preprocessing.StandardScaler(with_mean=False)
        history_matrix = scaler.fit_transform(history_matrix)
        packet_matrix = scaler.transform(packet_matrix)

        classifier = OneClassSVM()
        classifier.fit(history_matrix)
        result = classifier.predict(packet_matrix)
        distance = classifier.decision_function(packet_matrix)
    except KeyError:
        return 0, 0

    return result, distance
class TwoStage(object):
    """Open-set classifier: a one-class SVM gate plus a random forest.

    Samples the one-class SVM rejects are labelled ``"zother"``; the
    remaining samples keep the random forest's prediction.
    """

    def __init__(self, *args, **kwargs):
        super(TwoStage, self).__init__(*args, **kwargs)
        self._oneCls = OneClassSVM(nu=NU, gamma=GAMMA)
        self._clf = RandomForestClassifier(n_estimators=30)
        self._scaler = StandardScaler()

    def fit(self, data, labels):
        """Fit the scaler, the novelty gate and the classifier.

        Returns:
            ``self``.
        """
        sdata = self._scaler.fit_transform(data)
        self._oneCls.fit(sdata)
        self._clf.fit(sdata, labels)
        return self

    def predict(self, data):
        """Predict labels, replacing rejected samples with ``"zother"``.

        Returns:
            ``(cls, classes)``: an object array of predicted labels, and the
            list of known classes followed by ``"zother"``.
        """
        sdata = self._scaler.transform(data)
        is_known_cls = self._oneCls.predict(sdata)
        # Object dtype, so "zother" is stored intact: a fixed-width string
        # array would truncate it and a numeric array would reject it.
        cls = self._clf.predict(sdata).astype(object)
        cls[is_known_cls == -1] = "zother"
        classes = list(self._clf.classes_) + ["zother"]
        return cls, classes
class NoveltySeparator(BaseEstimator):
    """Regressor that routes samples through a novelty detector.

    A one-class SVM splits the samples into inliers and outliers, and a
    separate ridge regression is fitted and applied for each group.
    """

    def get_params(self, deep=True):
        """Return an empty parameter dict (there is nothing to tune)."""
        return {}

    def fit(self, X, y):
        """Fit the detector and both per-group regressors.

        Args:
            X: 2-D feature array; column 0 is compared against ``y``.
            y: Target array.

        Returns:
            ``self``.
        """
        # lets treat users spending something in the rest of the month as
        # outliers
        inliers = (y - X[:, 0]) < 0.1

        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])
        # predicted
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        # The original refitted est_inliers here, leaving est_outliers
        # unfitted and est_inliers trained on the wrong group.
        self.est_outliers.fit(X[outliers], y[outliers])
        return self

    def predict(self, X):
        """Predict each sample with the regressor of its detected group."""
        y = np.zeros(X.shape[0])
        labels = self.detector.predict(X).reshape(X.shape[0])
        inliers = labels == 1
        outliers = labels == -1
        # Skip empty groups: the regressors cannot predict on zero rows.
        if inliers.any():
            y[inliers] = self.est_inliers.predict(X[inliers])
        if outliers.any():
            y[outliers] = self.est_outliers.predict(X[outliers])
        return y
def predict_pkt_length_features(self, pkt_featurizer):
    """Flag a packet whose length DBSCAN treats as noise.

    The packet length is appended to the lengths recorded for its packet
    type in ``self.pkt_lengths`` and clustered with DBSCAN. A one-class SVM
    is also fitted on the scaled historical lengths; its prediction only
    feeds the optional plot and is not returned.

    Returns:
        Truthy when DBSCAN labels the new packet as noise (-1); ``0`` when
        a ``KeyError`` or ``IndexError`` is raised.
    """
    group_id = pkt_featurizer.pkt_type
    try:
        dbscan = DBSCAN()
        pkt_lengths = np.array(
            list(self.pkt_lengths[group_id]) + [pkt_featurizer.len_bytes]
        ).reshape(-1, 1)
        labels = dbscan.fit_predict(pkt_lengths)
        # The new packet is the last row.
        dbscan_prediction = labels[-1] == -1
        if self.plot:
            self.plot_1d_dbscan(pkt_lengths, labels, range(len(pkt_lengths)), self.pkt_lengths_fig_dbscan, "", "Pkt Length", "Pkt Length DBSCAN Clustering - Anomalous Pkts in Black")

        one_class_svm = OneClassSVM()
        scaler = preprocessing.StandardScaler()
        pkt_lengths_scaled = scaler.fit_transform(
            np.array(self.pkt_lengths[group_id]).reshape(-1, 1))
        features_scaled = scaler.transform(
            np.array(pkt_featurizer.len_bytes).reshape(1, -1))
        one_class_svm.fit(pkt_lengths_scaled)
        # NOTE(review): computed but never returned or combined with the
        # DBSCAN verdict - confirm whether that is intended.
        svm_prediction = one_class_svm.predict(features_scaled)
        if self.plot and len(pkt_lengths_scaled) > 2:
            self.plot_1d_svm(self.pkt_lengths[group_id], one_class_svm, range(len(self.pkt_lengths[group_id])), scaler, self.pkt_lengths_fig_svm, "Pkt", "Pkt Length", "Pkt Length One Class SVM Classification")
    except (KeyError, IndexError) as e:
        # Function-call form prints the same text on Python 2 and parses on
        # Python 3.
        print(e)
        dbscan_prediction = 0
    return dbscan_prediction
def slice_probability_space_selection(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50), rho=0.05, outlier_distribution = np.random.rand, folds_count=7):
    """Cross-validated gamma search using synthetic outliers.

    For each candidate gamma and fold, a ``OneClassSVM`` is fitted on the
    training part. The error adds the (scaled) fraction of held-out samples
    rejected to the (scaled) fraction of synthetic outliers accepted.

    Args:
        data: 2-D array of samples (indexed as ``data[rows, :]``).
        nu: ``nu`` of every estimator.
        all_gammas: Candidate gamma values.
        rho: Weight of the synthetic-outlier term relative to the inliers.
        outlier_distribution: Callable taking the dimensions of ``data`` as
            positional arguments and returning an array of that shape.
        folds_count: Number of folds.

    Returns:
        ``(index, all_errors)``: the index into ``all_gammas`` with the
        smallest mean error, and the list of mean errors.
    """
    # NOTE(review): legacy KFold(n, n_folds=...) signature; it must match
    # whichever KFold the file imports.
    kf_iterator = KFold(len(data), n_folds=folds_count)
    # Loop-invariant: the synthetic outliers span 8 standard deviations.
    outlier_scale = 8 * np.std(data)
    all_errors = []
    for gamma in all_gammas:
        error = 0.0
        for train, test in kf_iterator:
            train_data = data[train, :]
            test_data = data[test, :]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            inlier_metric_part = np.mean(prediction == -1)
            inlier_metric_part = inlier_metric_part / (1 + rho) / len(data)
            outliers = outlier_distribution(*data.shape) - 0.5
            outliers *= outlier_scale
            outlier_metric_part = np.mean(clf.predict(outliers) == 1) * rho / (1 + rho) / len(outliers)
            error += inlier_metric_part + outlier_metric_part
        all_errors.append(error / folds_count)
    index = np.argmin(all_errors)
    return int(index), all_errors
def remove_outliers_SVM(self):
    """Drop the rows a one-class SVM places below its decision boundary.

    Fits an RBF ``OneClassSVM`` on ``self.DataArray`` and removes every row
    with a negative decision value from both ``self.DataArray`` and
    ``self.TargetArray``. The data shape is printed before and after.
    """
    # Function-call prints give the same output on Python 2 and also parse
    # on Python 3.
    print("Running SVM to remove outliers...")

    svm = OneClassSVM(kernel='rbf', nu=0.1, degree=3, verbose=1)
    svm.fit(self.DataArray)
    # Flatten so 1-D and column-shaped score arrays are handled alike.
    decision = np.ravel(svm.decision_function(self.DataArray))

    # If a value is below the decision hyperplane, eliminate it
    _indices = np.flatnonzero(decision < 0)

    print(self.DataArray.shape)
    self.DataArray = np.delete(self.DataArray, _indices, axis=0)
    self.TargetArray = np.delete(self.TargetArray, _indices, axis=0)
    print(self.DataArray.shape)
def oneClass(self):
    """Fit a default ``OneClassSVM`` on ``self.arr`` and score it.

    Returns:
        The predictions for ``self.arr``. The original computed them and
        threw them away, returning ``None``.
    """
    model = OneClassSVM()
    model.fit(self.arr)
    return model.predict(self.arr)
def get_model(self):
    """Grid-search ``nu``/``gamma`` for a one-class SVM and pickle the best.

    Reads the training and the two test sample files, vectorizes the test
    samples with ``TfIdfVector``, and fits a ``OneClassSVM`` for each
    parameter combination. Precision, recall and F1 are printed per
    combination together with the running maxima. Finally the model is
    refitted with the best-F1 parameters, pickled to
    ``ModuleTrain/cache/model.pkl`` under ``self.root_path``, and
    ``self.complete`` is set to True.
    """
    start = datetime.now()
    print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))
    train_example = []
    xss_example = []
    non_xss_example = []
    # Read the training set (curated XSS payloads)
    self.read_txt(self.train_path, train_example)
    # Read the normal-request sample set
    self.read_txt(self.test_none_xss_path, non_xss_example)
    # Read the attack-request sample set
    self.read_txt(self.test_xss_path, xss_example)
    # Vectorize the training samples
    tf_idf_vector = TfIdfVector()
    # NOTE(review): attribute access, not a call - presumably a property
    # holding the fitted training matrix; train_example is not passed to
    # the vectorizer here. Verify against TfIdfVector.
    train_vector = tf_idf_vector.fit_vector
    # Vectorize the attack / benign test samples.
    # NOTE(review): "normal" here holds the XSS samples, i.e. the class the
    # one-class model is trained on; "abnormal" holds the benign requests.
    test_normal_vector = tf_idf_vector.transform(xss_example)
    test_abnormal_vector = tf_idf_vector.transform(non_xss_example)
    y = [1] * (len(train_example))
    # Parameter grid of nu and gamma values to iterate over
    grid = {
        'gamma': np.logspace(-8, 1, 10),
        'nu': np.linspace(0.01, 0.20, 20)
    }
    # Kernel function (rbf, linear, poly)
    kernel = 'rbf'
    # Best precision, recall and F1 seen so far
    max_F1 = 0
    max_Re = 0
    max_Pr = 0
    # gamma at which the best precision, recall and F1 were reached
    gamma_r_F1 = 0.01
    gamma_r_Re = 0.01
    gamma_r_Pr = 0.01
    # nu at which the best precision, recall and F1 were reached
    nu_r_F1 = 0
    nu_r_Re = 0
    nu_r_Pr = 0
    svdd = OneClassSVM(kernel=kernel)
    zero_count = 0
    re_gamma = 0
    # total_loop and process_count are maintained but never read below.
    total_loop = len(ParameterGrid(grid))
    process_count = 0
    for z in ParameterGrid(grid):
        process_count += 1
        # Skip the remaining nu values of a gamma that has already produced
        # four zero-score results; the counter resets when gamma changes.
        if re_gamma == z.get('gamma'):
            if zero_count >= 4:
                continue
        else:
            zero_count = 0
        svdd.set_params(**z)
        svdd.fit(train_vector, y)
        k = svdd.get_params()
        # Test on the attack-request samples
        f = svdd.predict(test_normal_vector)
        TP = f.tolist().count(1)  # True positive
        FN = f.tolist().count(-1)  # False Negative
        # Test on the non-attack samples
        f = svdd.predict(test_abnormal_vector)
        FP = f.tolist().count(1)  # False positive
        Precision = 0 if TP == 0 else (TP / (TP + FP))  # Precision
        Recall = 0 if TP == 0 else (TP / (TP + FN))  # Recall
        if Recall == 0 or Precision == 0:
            F1_score = 0
            zero_count += 1
            re_gamma = k.get('gamma')
        else:
            F1_score = 2 * Precision * Recall / (Precision + Recall
                                                 )  # F1 value
        if F1_score > max_F1:
            max_F1 = F1_score
            nu_r_F1 = k.get('nu')
            gamma_r_F1 = k.get('gamma')
        if Recall > max_Re:
            max_Re = Recall
            nu_r_Re = k.get('nu')
            gamma_r_Re = k.get('gamma')
        if Precision > max_Pr:
            max_Pr = Precision
            nu_r_Pr = k.get('nu')
            gamma_r_Pr = k.get('gamma')
        print(
            "========================== [{}] ===========================".
            format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        print(
            "nu: ",
            k.get('nu'),
            'gamma',
            k.get('gamma'), )
        print("Precision: {}%".format(Precision * 100))
        print("Recall: {}%".format(Recall * 100))
        print("F1 score: {}".format(F1_score))
        print("========================== [{}] ===========================".
              format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        print(
            "MAX Precision: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Pr, nu_r_Pr, gamma_r_Pr))
        print(
            "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Re, nu_r_Re, gamma_r_Re))
        print(
            "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_F1, nu_r_F1, gamma_r_F1))
    total_second = datetime.now() - start
    print("Cost {}s.".format(total_second.total_seconds()))
    # Refit with the best-F1 parameters and persist the model.
    with open(os.path.join(self.root_path, "ModuleTrain/cache/model.pkl"),
              'wb') as file:
        svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
        svdd.fit(train_vector, y)
        pickle.dump(svdd, file)
    self.complete = True
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1',
                    ntrials=10, verbose=True, seed=123456789):
    """
    Run a single experiment, locally.

    Each trial fits an SP region on the base class and compares its
    overlap-based novelty detection with a linear one-class SVM trained on
    the raw inputs.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param exp_name: The name of the experiment.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.

    @return: A tuple containing the percentage errors for the SP's training
    and testing results and the SVM's training and testing results,
    respectively.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5
    log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
                           'novelty_experiments', exp_name)

    # Configure the SP
    config = {
        'ninputs': 100,
        'trim': 1e-4,
        'disable_boost': True,
        'seed': seed,
        'pct_active': None,
        'random_permanence': True,
        'pwindow': 0.5,

        'global_inhibition': True,

        'ncolumns': 200,
        'nactive': 50,

        'nsynapses': 75,
        'seg_th': 15,

        'syn_th': 0.5,

        'pinc': 0.001,
        'pdec': 0.001,

        'nepochs': 10,

        'log_dir': log_dir
    }

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in range(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[
        :noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        print("\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
            * x_ds.base_class.astype('i')).sum() / 40.))
        print("Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
            outlier_base.astype('i')).sum() / 40.))
        print('Overlap between two classes: {0}'.format(np.dot(
            x_ds.base_class.astype('i'), outlier_base.astype('i'))))

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)
    c_y_te = 1 - metrics.compute_distance(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i in range(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2
        config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)
        c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('Input Novelty Class Test Correlation', c_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
        sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)

        # Print the results. The last field is {6}: the original repeated
        # {5}, so the sp_y_te column showed the sp_x_te value.
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print('\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te')
            print(fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te,
                               u_sp_x_tr, u_sp_x_te, u_sp_y_te))
            print(fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                               o_sp_x_te, o_sp_y_te))
            print(fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te,
                               c_sp_x_tr, c_sp_x_te, c_sp_y_te))

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        c_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
            c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest
        c_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness',
                      u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation',
                      c_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness',
                      u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap',
                      o_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Correlation',
                      c_sp_base_to_y_te)

        # Print the results
        if verbose:
            print('\nDescription\tx_tr->x_te\tx_tr->y_te')
            print('Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(
                u_sp_base_to_x_te, u_sp_base_to_y_te))
            print('Overlap:\t{0:2.4f}\t{1:2.4f}'.format(
                o_sp_base_to_x_te, o_sp_base_to_y_te))
            print('Correlation:\t{0:2.4f}\t{1:2.4f}'.format(
                c_sp_base_to_x_te, c_sp_base_to_y_te))

        # Create an SVM. ``random_state`` is no longer passed: OneClassSVM
        # ignored it and current scikit-learn rejects the argument.
        clf = OneClassSVM(kernel='linear', nu=0.1)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
            100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
            100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print('\nSP Base Class Detection : {0:2.2f}%'.format(clf_x_te))
            print('SP Novelty Class Detection : {0:2.2f}%'.format(clf_y_te))
            print('SVM Base Class Detection : {0:2.2f}%'.format(svm_x_te))
            print('SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te))

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
# X = X[indices] # y = y[indices] X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] # # training only on normal data: # X_train = X_train[y_train == 0] # y_train = y_train[y_train == 0] print('OneClassSVM processing...') model = OneClassSVM(cache_size=500) tstart = time() model.fit(X_train) fit_time += time() - tstart tstart = time() scoring = -model.decision_function(X_test) # the lower,the more normal predict_time += time() - tstart fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring) if fit_time + predict_time > max_time: raise TimeoutError f = interp1d(fpr_, tpr_) tpr += f(x_axis) tpr[0] = 0. precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]
if run_lof_svm == 0: lof_scores = iso_scores osvm_scores = iso_scores elif j == 0: print('\n******LOF*******\n') start = time.time() lof = LocalOutlierFactor() lof.fit(X) end = time.time() time_all[j, 1] = end - start lof_scores = lof.negative_outlier_factor_ print('\n******1-class SVM*******\n') start = time.time() osvm = OneClassSVM(kernel='rbf') osvm.fit(X) end = time.time() time_all[j, 2] = end - start osvm_scores = osvm.score_samples(X) print('\n******Our Algo*******\n') start = time.time() #n_samples = int(t1/50) n_samples = 100 kwargs = { 'max_depth': 10, 'n_trees': 50, 'max_samples': n_samples, 'max_buckets': 3, 'epsilon': 0.1, 'sample_axis': 1,
import numpy as np
from utils import Get_training_data, Get_testing_data
from sklearn.decomposition import KernelPCA
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt

# Training data is an iterable of 2-D groups; every group is projected
# separately and the projected rows are pooled into one training matrix.
X = Get_training_data()

# A single RBF KernelPCA object is re-fitted on each group (fit_transform),
# keeping 8 components per row.
transformer = KernelPCA(n_components=8, kernel='rbf')
X_pca = []
for x in X:
    print(np.array(x).shape)
    transformed = transformer.fit_transform(x)
    # Pool the projected rows of this group with those of earlier groups.
    X_pca.extend(transformed)

X_pca = np.array(X_pca)
print(np.array(X_pca).shape)

# Visual sanity check on the first two components.
plt.scatter(X_pca[:, 0], X_pca[:, 1], label="train data")
plt.show()

# Fit the one-class SVM on the pooled projections and print its verdict
# (+1 inlier / -1 outlier) on the very same rows.
clf = OneClassSVM(gamma='auto')
clf.fit(X_pca)
print(clf.predict(X_pca))
print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, gscv.predict(X_test) print(classification_report(y_true, y_pred)) print(confusion_matrix(y_test, y_pred)) print() #%% # Novelty detection by One Class SVM with optimized hyperparameter from my_library import optimize_gamma optgamma = gscv.best_params_['gamma'] range_g = 2**np.arange(-20, 1, dtype=float) optgamma = optimize_gamma(X_train, range_g) clf = OneClassSVM(nu=0.003, kernel=gscv.best_params_['kernel'], gamma=optgamma) clf.fit(X_train) y_pred = gscv.predict(X_test) # prediction from my_library import ad_knn # Applicability Domain (inside: +1, outside: -1) ad_svm = clf.predict(X_test) # outliers = -1 ad_knn = ad_knn(X_train, X_test) results = np.c_[y_pred, y_test, ad_knn, ad_svm, X_test] df = pd.DataFrame(results, columns=list('ABCDEF')) df_knn = df[df.C == -1] df_svm = df[df.D == -1] print('AD svm =/= AD knn') print(df[df.C != df.D])
#print(tokens[token_start:token_finish]) #print(sims_flat[most_similar]) #print(p_values[0][f_dict_keys.index(f)]) return_dict[f]=0.75*sims_flat[most_similar]+0.25*p_values[0][f_dict_keys.index(f)] return return_dict if __name__ == "__main__": ocr_inst=OcrValidation() #ocr_inst.setup_model() #ocr_inst.feature_dict.pop("Sign-off") sum_rep=rp.SumRepresentation(ocr_inst.vocabulary,ocr_inst.feature_dict) cvec=CountVectorizer(vocabulary=ocr_inst.vocabulary,binary=True) d2v_train=pickle.load(open("doc2vec.p","rb")) d2v=rp.Doc2Vec(d2v_train) ocr_inst.doc2vec=d2v ocr_inst.exemplar_vec=ocr_inst.doc2vec.model.infer_vector([ocr_inst.texts[1]]) model=OneClassSVM(nu=0.05) ocr_inst.evaluate_mulitple([cvec,sum_rep,d2v],[model]) train_set=sum_rep.fit_transform(ocr_inst.texts[:75]) ocr_inst.train_set=train_set model_sum=OneClassSVM(nu=0.05) model_sum.fit(train_set) ocr_inst.model=model_sum ocr_inst.representation=sum_rep
'../data/label.csv') tst_feature = np.asarray(tstset['feature']) #load label with taxi==0, revert label to be taxi==1 tst_label = np.asarray(tstset['label']) featmean = np.mean(tr_feature, axis=0) featstd = np.std(tr_feature, axis=0) tr_feature -= featmean tr_feature /= featstd tst_feature -= featmean tst_feature /= featstd #model = RandomForestClassifier(n_estimators=20,criterion='entropy') model = OneClassSVM() model.fit(tr_feature) tr_accuracy = np.mean(model.predict(tr_feature) == tr_label) tst_res = model.predict(tst_feature) == tst_label tst_accuracy = np.mean(tst_res) print tst_res tst_pred = model.predict(tst_feature) print tst_pred proba = map(lambda x: max(x), tst_pred) tst_log = [] for each in zip(tst_res, proba): tst_log.append({'p': each[1], 'acc': each[0]}) records = sorted(tst_log, key=operator.itemgetter('p'), reverse=True) for i in range(1, len(records)): r = map(lambda x: x['acc'], records[0:i])
print('Data Loaded, {} pristine vectors'.format(data_length)) idx = np.arange(0, data_length) np.random.shuffle(idx) X_train = pristine_emb[idx[:int(args.train_prop * data_length)]] X_test = pristine_emb[idx[int(args.train_prop * data_length):]] print('Starting training on {} train vectors with {} test vectors'.format( X_train.shape[0], X_test.shape[0])) classifier = OneClassSVM(nu=0.0001, kernel='rbf', gamma=0.5 / 2048, cache_size=2000, verbose=True) classifier.fit(X_train) print('Finishing training') y_pred_train = classifier.predict(X_train) y_pred_test = classifier.predict(X_test) y_pred_outliers = classifier.predict(forged_emb) n_error_train = y_pred_train[y_pred_train == -1].size n_error_test = y_pred_test[y_pred_test == -1].size n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size print('Error train: {}/{} --> {}%'.format( n_error_train, X_train.shape[0], 100 * n_error_train / X_train.shape[0])) print('Error test: {}/{} --> {}%'.format( n_error_test, X_test.shape[0], 100 * n_error_test / X_test.shape[0])) print('Error forged: {}/{} --> {}%'.format(
def remove_outliers(features, max_fraction=0.1, min_fraction=0.25, verbose=False):
    """
    Drop rows that a sweep of one-class SVMs repeatedly flags as outliers.

    The approach is unsupervised, so no single nu/gamma setting is trusted.
    An RBF one-class SVM is fitted for every combination of 20 nu values
    (0.01 .. 1.0) and 10 gamma values (2**-15, 2**-13, ..., 2**3). A
    combination only counts when the fraction of rows it flags is strictly
    below ``max_fraction``; settings that flag, say, 90% of the rows are
    unrealistic and ignored. For the combinations that count, we tally how
    often each row was flagged. A row is removed when it was flagged in at
    least ``min_fraction`` of the counted combinations.

    :param features: Data frame indexed by subject; it is modified in place
        (rows are dropped) and also returned.
    :param max_fraction: A parameter combination is only considered when its
        outlier fraction is strictly below this value
    :param min_fraction: Minimum share of considered combinations in which a
        subject must be flagged to be removed
    :param verbose: If true, print how many subjects are removed.
    :return: The same ``features`` object with outlying subjects removed
    """
    X, _ = util.get_xy(features, target_column='diagnosis', exclude_columns=['age', 'gender', 'diagnosis'])

    flag_counts = {}
    nr_ok_fractions = 0
    gammas = [2**exponent for exponent in range(-15, 4, 2)]

    for nu in np.linspace(0.01, 1.0, num=20):
        for gamma in gammas:

            # Train classifier and label the training rows themselves
            classifier = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
            classifier.fit(X)
            y_pred = classifier.predict(X)

            # Row positions this setting considers outliers
            flagged = [i for i, verdict in enumerate(y_pred) if verdict == -1]

            # Ignore settings that flag too large a share of the rows
            if len(flagged) / float(len(y_pred)) >= max_fraction:
                continue

            nr_ok_fractions += 1
            for i in flagged:
                subject = features.index[i]
                flag_counts[subject] = flag_counts.get(subject, 0) + 1

    # A subject is an outlier when flagged often enough across the
    # settings that passed the max_fraction filter
    outliers = [
        subject for subject, hits in flag_counts.items()
        if hits / float(nr_ok_fractions) >= min_fraction
    ]

    # Remove outlying subjects
    if verbose:
        print('Removing {} outliers...'.format(len(outliers)))
    features.drop(outliers, axis=0, inplace=True)

    return features
train_activations = get_activations(new_model, get_inputs(train_path + '/*.jpg'), RELEVANT_LAYER_NAME) train_activations.to_csv(addr['ocs_train_activations_name']) test_activations = get_activations( new_model, get_test_inputs(test_path, class1Name, class2Name), RELEVANT_LAYER_NAME) test_activations.to_csv(addr['ocs_test_activations_name']) y_true = [1.] * shock_len + [-1.] * nonshock_len for kernel in ['linear', 'poly', 'rbf', 'sigmoid']: for nu in np.linspace(0.1, 0.9, num=9): for gamma in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]: res_list = [] ocs = OneClassSVM(nu=nu, kernel=kernel, gamma=1.0 / (gamma)) ocs.fit(train_activations) y_pred = ocs.predict(test_activations) y_scores = ocs.decision_function(test_activations) precision, recall, thresholds = precision_recall_curve( y_true, y_scores) print(kernel, nu, gamma) print(metrics.accuracy_score(y_true, y_pred), metrics.precision_score(y_true, y_pred), metrics.recall_score(y_true, y_pred)) import gc gc.collect() sys.stdout = oldStdout
label=1) oc.readDataSet(equalLength=False, checkData=False) oc.dumpTeTrData(dumpName="anomaly.pkl") TrainFeat, TrainLabel, TestFeat, TestLabel = oc.loadTeTrDump( dumpName="anomaly.pkl") data = np.concatenate([TestFeat, TrainFeat]) label = np.concatenate([TestLabel, TrainLabel]) normal = data[label == 0] anomal = data[label == 1] training = normal[0:int(2 / 3 * len(normal))] test = normal[int(2 / 3 * len(normal))::] from sklearn.svm import OneClassSVM model = OneClassSVM(kernel='linear') model.fit(training) preds = model.predict(test) preds = np.reshape(preds, len(preds)) print("False Negatives: ", np.sum(preds == -1) / len(preds)) print("True Positives: ", np.sum(preds == 1) / len(preds)) preds = model.predict(anomal) preds = np.reshape(preds, len(preds)) print("False Positives: ", np.sum(preds == 1) / len(preds)) print("True Negatives: ", np.sum(preds == -1) / len(preds))
def outlier_SVM(df):
    """Return the rows of ``df`` that an RBF one-class SVM labels as outliers.

    The detector (gamma=0.005, nu=0.05) is fitted on ``df`` and then asked to
    label the same rows; rows predicted as ``-1`` are returned.
    """
    detector = OneClassSVM(kernel='rbf', gamma=0.005, nu=0.05)
    detector.fit(df)
    is_outlier = detector.predict(df) == -1
    return df[is_outlier]
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM

X_dist, time = get_events_time(disturbed_sequences[:])
print("======== SEQUENCES =======")

# Evaluation set: regular test sequences followed by the disturbed ones.
sequence = X_test[:] + X_dist[:]
# Ground truth must use the same coding as OneClassSVM.predict:
# +1 for inliers (regular sequences), -1 for outliers (disturbed ones).
# Coding the disturbed sequences as 0 would leave the positive class
# (pos_label=-1) absent from the ground truth.
label = len(X_test[:]) * [1] + len(X_dist[:]) * [-1]

# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class
model.fit(X_train)
# detect outliers in the test set
yhat = model.predict(sequence)
# calculate score: F1 of the outlier (-1) class
score = f1_score(label, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
# saveobj(f_history,h)
print("####### End tranining ########")

# %%
# model = load_model(f_current_model)
# param = loadobj(f_current_config)
# NOTE: from here on `model` is the loaded model, not the one-class SVM above.
model = load_model(f_model)
param = loadobj(f_config)
def _OneClassSVM(X):
    """Fit an RBF one-class SVM (nu=0.1, gamma=0.1) on ``X`` and label ``X``.

    Returns whatever the fitted estimator's ``predict`` yields for the
    training rows themselves.
    """
    settings = {"nu": 0.1, "kernel": "rbf", "gamma": 0.1}
    detector = OneClassSVM(**settings)
    detector.fit(X)
    labels = detector.predict(X)
    return labels
print(Indx,file=fRank) fMaxAcc = -100.0 sMAXparam='' clfMax = '' idxmax = 0 for indx in (range(arTraining_std.shape[1]),):#Indx): for kernel in ['rbf']:#,'sigmoid']:#['poly', 'rbf', 'sigmoid']: for nu in [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.15,0.2,0.25,0.3,0.35,0.4]: for gam in [0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.3,0.5]:#,1,10,100]: #print ('nu=',nu,'gamma=',gam,'kernel=',kernel) param = str(nu)+' '+str(gam) + ' '+ kernel clsf.set_params(**{'nu':nu,'gamma':gam,'kernel':kernel}) #gsC lsf = GridSearchCV(clsf,dParams,cv=tCVIndxs,scoring='scorer') #print('Training The Model') arTr = arTraining_std[:,indx] clsf.fit(arTr) #cls f.fit(arCVData,arCVLab) #print('Prediction') arV = arValidation_std[:,indx] y_pre = clsf.predict(arV) param = param + ' ' + str(indx).replace('\n','') #print (accuracy_score(y_valid_ref,y_pre)) fCurAcc = f1_score(y_valid_ref,y_pre)#accuracy_score(y_valid_ref,y_pre) xIn = accuracy_score(y_valid_ref[:iNumInClass],y_pre[:iNumInClass]) + math.exp(-100) #print('x=',x) fscoreIn = math.log(xIn) fscoreOut = 0.0 for sPhone in dPhoneIndx: iStart,iEnd = dPhoneIndx[sPhone] x = accuracy_score(y_valid_ref[iStart:iEnd],y_pre[iStart:iEnd]) fscoreTemp = math.log(x+math.exp(-100))
def eval(cfg, model, train_dataset, val_dataset, criterion, publisher="test"):
    """Evaluate one-class anomaly detection on the encoder's global features.

    Global features of ``train_dataset`` are extracted with ``model.encoder``
    and used to fit a one-class SVM, which then labels the global features of
    ``val_dataset``. As side effects, the first reconstruction and input of
    the last batch of each split are written to ``.ply`` files, and a t-SNE
    plot of the validation features is written to ``vis_embed.png``.

    Args:
        cfg: Configuration object; ``batch_size``, ``nworkers`` and ``device``
            are read from it.
        model: Network exposing ``eval()``, ``encoder`` and ``decoder``.
        train_dataset: Dataset of ``(inputs, targets)`` items. The label of its
            first item is taken as the "true" (normal) class.
        val_dataset: Dataset of ``(inputs, targets)`` items to evaluate on.
        criterion: Mapping providing a ``"chamfer_distance"`` callable.
        publisher: Not used by this function.

    Returns:
        tuple: ``(acc, dist_loss)``. ``acc`` is the percentage of validation
        samples whose SVM prediction equals their +1/-1 label. ``dist_loss``
        is the mean reconstruction loss (``dist1 + dist2``) over the
        validation samples of the true class (NaN if there are none).
    """
    model.eval()

    # get global features using a training dataset
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size,
                              num_workers=cfg.nworkers, pin_memory=True)
    train_loader = tqdm(train_loader, ncols=100, desc="get train GF")
    train_global_features = []
    with torch.no_grad():
        for inputs, targets in train_loader:
            inputs = inputs.to(cfg.device, non_blocking=True)
            # keep only the first three channels after moving channels to dim 1
            inputs = torch.transpose(
                inputs, 1, 2)[:, :3]  # inputs.shape: Batch_size, num_channels, num_points
            # targets = targets.to(cfg.device, non_blocking=True)

            # model encoder processing
            outputs, _, _ = model.encoder(inputs)

            # add a global feature to a list
            train_global_features.append(PytorchTools.t2n(outputs))

    train_global_features = np.concatenate(
        train_global_features, axis=0)  # shape (num_train_data, 1024)

    # get reconstructions for ply data (last training batch only); no
    # gradients are needed for a file dump
    with torch.no_grad():
        reconstructions = model.decoder(outputs)

    # save reconstructions as ply
    rgb = np.full((reconstructions.shape[1], 3), 255, dtype=np.int32)
    xyz = PytorchTools.t2n(reconstructions[0])
    write_ply("train_reconstruction.ply", xyz, rgb)
    inputs = torch.transpose(inputs, 1, 2)
    gt_xyz = PytorchTools.t2n(inputs[0])
    write_ply("train_input.ply", gt_xyz, rgb)

    # get global features using a eval dataset
    val_loader = DataLoader(val_dataset, batch_size=cfg.batch_size,
                            num_workers=cfg.nworkers, pin_memory=True)
    val_loader = tqdm(val_loader, ncols=100, desc="get eval GF")
    val_global_features = []
    eval_labels = []
    loss_list = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(cfg.device, non_blocking=True)
            inputs = torch.transpose(
                inputs, 1, 2)[:, :3]  # inputs.shape: Batch_size, num_channels, num_points
            # targets = targets.to(cfg.device, non_blocking=True)

            # model encoder processing
            outputs, _, _ = model.encoder(inputs)

            # get reconstructions for loss of true data
            reconstructions = model.decoder(outputs)

            # compute loss (per sample: mean over points of both directions)
            inputs = torch.transpose(inputs, 1, 2)
            dist1, dist2 = criterion["chamfer_distance"](inputs, reconstructions)
            dist1 = np.mean(PytorchTools.t2n(dist1), axis=1)
            dist2 = np.mean(PytorchTools.t2n(dist2), axis=1)
            dist_loss = dist1 + dist2

            # add dist_losses to a list
            loss_list.append(dist_loss)

            # add a global feature to a list
            val_global_features.append(PytorchTools.t2n(outputs))

            # get eval labels
            eval_labels.append(targets)

    val_global_features = np.concatenate(
        val_global_features, axis=0)  # shape (num_eval_data, 1024)
    eval_labels = np.squeeze(np.concatenate(eval_labels, axis=0), axis=-1)  # shape (num_data)
    loss_list = np.concatenate(loss_list, axis=0)

    # save reconstructions as ply (last validation batch only)
    rgb = np.full((reconstructions.shape[1], 3), 255, dtype=np.int32)
    xyz = PytorchTools.t2n(reconstructions[0])
    write_ply("test_reconstruction.ply", xyz, rgb)
    gt_xyz = PytorchTools.t2n(inputs[0])
    write_ply("test_input.ply", gt_xyz, rgb)

    # use one class classification
    classifier = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
    classifier.fit(train_global_features)
    pred_labels = classifier.predict(val_global_features)

    # visualize data using embeddings (with the original class labels)
    write_tsne("vis_embed.png", val_global_features, eval_labels)

    # get training data label
    _, true_label = train_dataset[0]

    # Boolean mask of validation samples belonging to the true class. It is
    # computed once, before the labels are overwritten, so both the relabel
    # and the loss selection use the same notion of "true".
    true_mask = np.asarray(eval_labels == true_label, dtype=bool)

    # convert eval labels other than true labels to -1
    eval_labels[~true_mask] = -1
    # convert true labels to 1
    eval_labels[true_mask] = 1

    # get loss of true data; select with the boolean mask -- indexing with the
    # +1/-1 label array itself would pick elements 1 and -1 over and over
    dist_loss = np.mean(loss_list[true_mask])

    # get a accuracy
    acc = np.mean(pred_labels == eval_labels) * 100

    return acc, dist_loss
data_nor = data_pre[data_pre.normal1 == 1] data_abn = data_pre[data_pre.normal1 == -1] ax = pyplot.gca() data_nor.plot(x='timestamp_int', y='value', ax=ax,color='blue',marker='o') data_abn.plot(kind='scatter', x='timestamp_int', y='value', ax = ax, marker='x', color='r') pyplot.show() data_pre = data_pre.drop(['timestamp'], axis=1) min_max_scaler = preprocessing.StandardScaler() np_scaled = min_max_scaler.fit_transform(data_pre) # train one class SVM model = OneClassSVM(nu=0.95 * 0.01) data = pandas.DataFrame(np_scaled) model.fit(data) data_pre['normal2'] = pandas.Series(model.predict(data)) data_pre['normal2'] = data_pre['normal2'].map( {1: 0, -1: 1} ) print(data_pre['normal2'].value_counts()) fig, ax = pyplot.subplots() a = data_pre.loc[data_pre['normal2'] == 1, ['timestamp_int', 'value']] ax.plot(data_pre['timestamp_int'], data_pre['value'], color='blue',marker='.',linestyle=' ') ax.scatter(a['timestamp_int'], a['value'], color='red',marker='x') pyplot.show()
class OneClassSVM(object):
    """Unsupervised anomaly detector: min-max scaling followed by ``SVM``."""

    def __init__(self, kernel='rbf', gamma='scale', tol=0.001, nu=0.5, shrinking=True, max_iter=1000):
        """ Build the underlying estimator; nothing is fitted yet.

        All arguments are forwarded unchanged to ``SVM`` (presumably
        scikit-learn's one-class SVM imported under that name -- see
        https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html).

        Arguments
        ---------
            kernel : str or callable, default='rbf'
                Kernel type handed to the estimator.
            gamma : str or float, default='scale'
                Kernel coefficient handed to the estimator.
            tol : float, default=0.001
                Tolerance for the stopping criterion.
            nu : float, default=0.5
                Handed to the estimator as ``nu``.
            shrinking : bool, default=True
                Handed to the estimator as ``shrinking``.
            max_iter : int, default=1000
                Iteration limit handed to the estimator.
        """
        svm_options = dict(kernel=kernel, gamma=gamma, tol=tol, nu=nu,
                           shrinking=shrinking, max_iter=max_iter)
        self.model = SVM(**svm_options)
        # Set by fit(); predict() needs it.
        self.transformer = None

    def fit(self, x):
        """ Fit the scaler and the estimator.

        Arguments
        ---------
            x: ndarray, the event count matrix of shape num_instances-by-num_events;
               any trailing dimensions are flattened per instance.
        """
        print('OneClassSVM Fit')
        flat = x.reshape((len(x), -1))
        self.transformer = get_transformer(flat, 'minmax')
        scaled = self.transformer.transform(flat)
        self.model.fit(scaled)

    def predict(self, x):
        """ Label instances as normal (0) or anomalous (1).

        Must be called after ``fit``, which creates the scaler used here.

        Arguments
        ---------
            x: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,);
                    0 where the estimator's prediction is positive, 1 otherwise.
        """
        print('OneClassSVM Predict')
        flat = x.reshape((len(x), -1))
        scaled = self.transformer.transform(flat)
        raw_labels = self.model.predict(scaled)
        # Estimator output > 0 means "inlier"; flip to the 0/1 anomaly coding.
        return np.where(raw_labels > 0, 0, 1)
regularization_string = "_012" X0 = dataset.dataset("./tica/tica%d%s.h5" % (tica_lagtime, regularization_string)) slicer = featurizer.FirstSlicer(2) X = slicer.transform(X0) Xf = np.concatenate(X) hexbin(Xf[:, 0], Xf[:, 1], bins='log') Xf_train = Xf[::100] svm = OneClassSVM() svm.fit(Xf_train) kde = sklearn.neighbors.kde.KernelDensity() kde.fit(Xf) scores = map(lambda x: kde.score(x), X) ind0 = (Xf[:, 0] > 0.75) & (Xf[:, 0] < 0.92) & (Xf[:, 1] > 0.63) & (Xf[:, 1] < 1.10) Xf0 = Xf[ind0] Xf0.shape kde0 = sklearn.neighbors.kde.KernelDensity() kde0.fit(Xf0) scores = map(lambda x: kde0.score(x), X)
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )
ax.set_zlim( [ 0., 1. ] )
ax.set_xlim( [ 0., 500. ] )
ax.set_ylim( [ 0., 1. ] )
fig.show()

# 40 evenly spaced view angles in [0, 360); the last of the 41 points is
# dropped because 360 degrees is the same view as 0
angles = np.linspace(0,360,41)[:-1]
rotanimate(ax, angles,'movie.gif',delay=20, width = 6., height = 5.)

# do outlier search using one-class SVM
# Standardize the first feature COLUMN (the one plotted on the x axis), so it
# is on a scale comparable with the other two features. `data` is laid out as
# (samples, features) -- it is indexed as data[ rows, 0..2 ] below and passed
# to fit() -- so scaling data[ 0, : ] would rescale one sample, not a feature.
data[ :, 0 ] = preprocessing.scale( data[ :, 0 ] )
model = OneClassSVM( gamma = .001, nu = .1 )
fit = model.fit( data )
preds = model.predict( data )
inlier = np.where( preds == 1. )[ 0 ]
outlier = np.where( preds == -1. )[ 0 ]

# inliers in blue, outliers in black
fig = plt.figure()
ax = fig.add_subplot( 111, projection = '3d' )
ax.scatter( data[ inlier, 0 ], data[ inlier, 1 ], data[ inlier, 2 ], c = 'b' )
ax.scatter( data[ outlier, 0 ], data[ outlier, 1 ], data[ outlier, 2 ], c = 'k' )
ax.set_xlabel( '$P^2/A$' )
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )
ax.set_ylim( [0., 1 ] )
ax.set_zlim( [ 0., 1. ] )
# In[4]: y_satellite.iloc[:, 0].value_counts().plot.bar() plt.savefig('img.png') plt.show() # In[5]: y_satellite[0].value_counts() # In[6]: gamma_values, err_values_gamma = [], [] for g in np.linspace(0.0000015, 0.00015, 10): onesvm = OneClassSVM(nu=y_satellite.mean(), gamma=g) onesvm.fit(satellite) yhat = onesvm.predict(satellite) yhat = ((yhat - 1) * -1) / 2 acc = accuracy_score(y_satellite, yhat) err = 1 - acc gamma_values.append(g) err_values_gamma.append(err) # In[7]: plt.subplots(figsize=(10, 5)) plt.plot(gamma_values, err_values_gamma, 'o-') plt.xlabel('gamma') plt.ylabel('error') plt.show()
def one_class_svm(n, g, n_train=50):
    """Train an RBF one-class SVM on the first rows and score it on the rest.

    The CSV at ``selected_features_path`` is loaded, its first column is
    discarded, and the class distribution is printed. The first
    ``number_of_features`` columns are the features and the following column
    is the target. The model is fitted on the first ``n_train`` rows and
    evaluated on the remaining ones, where a prediction of ``-1`` (outlier)
    is mapped to 1 and anything else to 0 before comparison with the target.

    Args:
        n: ``nu`` parameter of the one-class SVM.
        g: ``gamma`` parameter of the RBF kernel.
        n_train: Number of leading rows used for training; the rest are used
            for validation. Defaults to 50.

    Returns:
        float: Accuracy on the validation rows (also printed).
    """
    data_set = pandas.read_csv(selected_features_path)
    data_set.pop(data_set.columns[0])

    # class distribution
    print(data_set.groupby('Class').size())

    # Split-out validation dataset (positional split, no shuffling)
    array = data_set.values
    X = array[:, 0:number_of_features]
    Y = array[:, number_of_features]
    X_train, X_validation = X[:n_train], X[n_train:]
    Y_validation = Y[n_train:]

    model = OneClassSVM(nu=n, kernel='rbf', gamma=g)
    model.fit(X_train)
    preds = model.predict(X_validation)

    # Recode predictions to the target coding: outlier (-1) -> 1, inlier -> 0
    correct_preds = [1 if pred == -1 else 0 for pred in preds]
    correct_targs = list(Y_validation)

    print('\n')
    accuracy = metrics.accuracy_score(correct_targs, correct_preds)
    print("accuracy: ", accuracy)

    # Return a plain Python float rather than a NumPy scalar
    return np.float64(accuracy).item()
def classifier(data):
    """Flag novel records with a one-class SVM and report exon statistics.

    Each record contributes the feature triple
    (coverage, num_exons, distance_to_next). The features are standardized,
    an RBF one-class SVM is fitted on a random sample of 20000 rows, a
    scatter plot of a 2000-row sample is saved to "DistanceVCoverage.pdf",
    and then every record is classified. The "id" of each record predicted
    as -1 is printed to stdout; progress and summary counts go to stderr.

    data: indexable sequence of mappings with the keys "coverage",
        "num_exons", "distance_to_next" and "id".

    Returns nothing.
    """
    from sklearn.covariance import EllipticEnvelope
    from sklearn.svm import OneClassSVM
    from sklearn.datasets import load_boston
    from sklearn import preprocessing
    # Get data

    # Define "classifiers" to be used
    legend1 = {}
    legend2 = {}
    # `evaluation` and `X` are built from the same columns in the same order:
    # index 0 = coverage, 1 = num_exons, 2 = distance_to_next
    evaluation = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data]
    X = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data]
    X = preprocessing.scale(X)
    evaluation = preprocessing.scale(evaluation)
    # Learn a frontier for outlier detection with several classifiers
    # NOTE(review): random.sample raises ValueError when fewer than 20000
    # (resp. 2000) rows are available -- confirm inputs are always that large
    sample = random.sample(X, 20000)
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(evaluation, 2000)
    print >> sys.stderr, "fitting data"
    clf.fit(sample)
    print >> sys.stderr, "predicting data"
    Y = clf.predict(test)
    print >> sys.stderr, "plotting data"
    fig, axes = subplots()
    # one scatter call per point: blue = predicted +1, red = anything else
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][2], test[i][1], c=color)
    # NOTE(review): the points plotted are (distance_to_next, num_exons) per
    # the column order above, but the axes are labelled coverage/distance --
    # either the indices or the labels look wrong; confirm the intent
    #ylim([50,2000])
    #num exons
    ylabel("distance")
    #xlim([3,10])
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")
    # a second, empty figure is created; its plotting code is disabled by the
    # string literal below
    fig, axes = subplots()
    """
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][1], test[i][0], c=color)
    #xlim([0,10])
    #num exons
    xlabel("number of exons")
    #ylim([3,15])
    ylabel("coverage")
    savefig("ExonsvsCoverage.pdf")
    """
    # classify every record; row i of `evaluation` corresponds to data[i]
    full_test = clf.predict(evaluation)
    novel, regular = [],[]
    for i in range(len(full_test)):
        result = full_test[i]
        if result == -1:
            print data[i]["id"]
            novel.append(data[i]["num_exons"])
        else:
            regular.append(data[i]["num_exons"])
    # restrict the exon-count averages to records with more than one exon
    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    print >> sys.stderr, "novel, regular"
    print >> sys.stderr, len(novel), len(regular)
    print >> sys.stderr, mean(multi_exon_novel), mean(multi_exon_regular), len(multi_exon_novel), len(multi_exon_regular)
def base_experiment(config, pct_noise=0.15, noverlap_bits=0, ntrials=10,
                    verbose=False, seed=123456789):
    """
    Run a single experiment, locally.

    A base-class dataset and a novelty-class dataset are generated; for each
    trial an SP region and a linear one-class SVM are fitted on the base
    training data and both are scored on the base and novelty test sets.
    The per-trial error percentages are pickled to 'results.pkl' inside
    config['log_dir'] (created if missing). Nothing is returned.

    @param config: The configuration parameters. Its 'log_dir' entry is read
    and its 'seed' entry is overwritten on every trial.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5

    # Build the directory, if needed
    base_dir = config['log_dir']
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    # The outlier base pattern takes (nactive - noverlap_bits) bits that the
    # base class does not use plus noverlap_bits bits that it does use
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in xrange(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
        False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        # 40. equals nbits * pct_active (100 * 0.4) as set above; presumably
        # the same as x_ds.nactive -- TODO confirm
        print "\nBase class' test noise: {0:2.2f}".format(
            1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.)
        print "Outlier's class noise: {0:2.2f}".format(
            1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.)
        print 'Overlap between two classes: {0}'.format(
            np.dot(x_ds.base_class.astype('i'), outlier_base.astype('i')))

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)

    # Initialize the overall results (one error percentage per trial)
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i, seed2 in enumerate(generate_seeds(ntrials, seed)):
        # Create the SP
        config['seed'] = seed2
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
            print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
                u_sp_x_te, u_sp_y_te)
            print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                o_sp_x_te, o_sp_y_te)

        # Get average representation of the base class
        # Binarize the column means: entries >= 0.5 become 1; everything
        # still below 1 afterwards (i.e. the former < 0.5 entries) becomes 0
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            # Pair the base representation with one test output so the
            # metrics are computed on exactly two rows
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness',
            u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness',
            u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)

        # Print the results
        if verbose:
            print '\nDescription\tx_tr->x_te\tx_tr->y_te'
            print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
                u_sp_base_to_y_te)
            print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
                o_sp_base_to_y_te)

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        # Percent of base test samples predicted +1 and of novelty test
        # samples predicted -1
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
            100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
            100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            # Base sample is correct when its overlap reaches clf_th;
            # novelty sample is correct when its overlap stays below it
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th: clf_x_te += 1
            if yo < clf_th: clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
            print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
            print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
            print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)

    # Save the results
    # Tuple order: SP base errors, SP novelty errors, SVM base errors,
    # SVM novelty errors
    with open(os.path.join(base_dir, 'results.pkl'), 'wb') as f:
        cPickle.dump((sp_x_results, sp_y_results, svm_x_results,
            svm_y_results), f, cPickle.HIGHEST_PROTOCOL)
def main():
    """Iteratively refine 2-D classes starting from a similarity matrix.

    Command-line options: --ptcls (particle file), --simmx (initial
    similarity matrix), --npca, --niter, --outlier, --ncls and --nref.
    --nref is parsed but not used anywhere in this function.

    Each of the --niter rounds:
      1. loads the current similarity matrix, transposes it and standardises
         every row (zero mean, unit standard deviation);
      2. projects the rows onto --npca principal components and rescales the
         projection by its global standard deviation;
      3. ranks the rows by a one-class SVM decision function and keeps the
         best ``1 - --outlier`` fraction;
      4. k-means clusters the kept rows into --ncls centres; discarded rows
         get the label -1;
      5. writes the labels to ``clsmx_NN.hdf`` and runs ``e2classaverage.py``
         and ``e2simmx.py``; the new ``simmx_NN.hdf`` feeds the next round.

    Side effects: writes ``test_pca_NN``, ``clsmx_NN.hdf`` and whatever the
    two external programs produce, all relative to the working directory.
    """
    usage = "refine2d using simmx information "
    parser = EMArgumentParser(usage=usage, version=EMANVERSION)
    parser.add_argument("--ptcls", type=str, help="particle file", default=None)
    parser.add_argument("--simmx", type=str, help="simmx", default=None)
    parser.add_argument("--npca", type=int, help="number of pca factors", default=10)
    parser.add_argument("--niter", type=int, help="number of iterations", default=5)
    parser.add_argument("--outlier", type=float, help="outlier fraction", default=0.1)
    parser.add_argument("--ncls", type=int, help="number of centers", default=128)
    parser.add_argument("--nref", type=int, help="number of references", default=32)
    (options, args) = parser.parse_args()
    logid = E2init(sys.argv)

    simmxfile = options.simmx
    for itr in range(options.niter):
        # Start from the current similarity matrix.
        print("Pre-processing simmx")
        simmx = EMData(simmxfile)
        pts = simmx.numpy().T.copy()
        for row in pts:
            # Rows are views into pts, so this standardises pts in place.
            row -= np.mean(row)
            row /= np.std(row)
        # The builtin float replaces the removed np.float alias (same dtype);
        # astype already returns a fresh array.
        pts = pts.astype(float)

        print("Doing PCA")
        pca = PCA(options.npca)
        pts_pca = pca.fit_transform(pts)
        # bs is the same array object as pts_pca: the in-place rescaling
        # below is therefore also what np.savetxt writes out.
        bs = pts_pca
        bs /= np.std(bs)
        print("{} {}".format(bs.shape, pts.shape))
        np.savetxt("test_pca_{:02d}".format(itr), pts_pca)

        print("Removing outliers")
        outliers_fraction = options.outlier
        svm = OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                          kernel="rbf", gamma=0.1)
        svm.fit(bs)
        y_pred = svm.decision_function(bs).ravel()
        nkeep = int(len(bs) * (1 - outliers_fraction))
        # Highest decision values first; keep the nkeep best-scoring rows.
        st = np.argsort(y_pred)[::-1][:nkeep]

        print("Clustering")
        centroids, _ = kmeans(bs[st], options.ncls)
        assigned, _ = vq(bs[st], centroids)
        # -1 marks rows that were rejected as outliers.
        labels = np.zeros(len(bs)) - 1
        labels[st] = assigned

        print("Class averaging")
        clsmx = EMData(1, len(labels))
        for i, label in enumerate(labels):
            clsmx.set_value_at(0, i, label)
        clsmxfile = "clsmx_{:02d}.hdf".format(itr)
        clsmx.write_image(clsmxfile)

        clsout = "classes_{:02d}.hdf".format(itr)
        run(("e2classaverage.py --input={} --classmx={} --output={} --force"
             " --center xform.center --iter=5"
             " --align=rotate_translate_flip:maxshift=32 --averager=mean"
             " --keep=.6 --cmp=ccc --aligncmp=ccc --normproc=normalize"
             " --parallel=thread:12").format(options.ptcls, clsmxfile, clsout))

        # The similarity matrix computed here is the input of the next round.
        simmxfile = "simmx_{:02d}.hdf".format(itr)
        run(("e2simmx.py {} {} {} --align rotate_translate_flip"
             " --aligncmp ccc --cmp ccc --saveali"
             " --parallel thread:12").format(options.ptcls, clsout, simmxfile))

    E2end(logid)
X0_outliers_n = scaler.transform(X0) #UNIRE X1_test_n E X0_outliers_n in X_TEST_n X_TEST_n = np.concatenate((X1_test_n, X0_outliers_n)) #UNIRE Y1_test E Y0 Y_TEST = np.concatenate((Y1_test, Y0)) pca = PCA(n_components=0.95) reducer = pca.fit(X1_train_n) X1_train_n_reduced = reducer.transform(X1_train_n) X_TEST_n_reduced = reducer.transform(X_TEST_n) clf = OneClassSVM(gamma='auto', nu=0.5) clf.fit(X1_train_n_reduced) Y1_pred_train = clf.predict(X1_train_n_reduced) Y_pred_TEST = clf.predict(X_TEST_n_reduced) #VALUTAZIONE #TRAIN SET #matrice di confusione confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train) fig, ax = plt.subplots(figsize=(2.5, 2.5)) ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.5) for i in range(confmat.shape[0]):
# Excerpt of a flat benchmarking script: alpha_min, alpha_max, lim_inf,
# lim_sup, n_generated, n_features, X_train, X_test, ocsvm_max_train,
# n_samples_train, t, t_max, volume_support and em() come from earlier in the
# file (not visible here).

# axis_alpha is not used again within this excerpt.
axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
# Uniform reference sample of shape (n_generated, n_features) drawn between
# lim_inf and lim_sup; scored by every detector below.
unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))
# fit:
print('IsolationForest processing...')
iforest = IsolationForest()
iforest.fit(X_train)
s_X_iforest = iforest.decision_function(X_test)
print('LocalOutlierFactor processing...')
lof = LocalOutlierFactor(n_neighbors=20)
lof.fit(X_train)
# NOTE(review): recent scikit-learn only exposes
# LocalOutlierFactor.decision_function with novelty=True - confirm the
# library version this script targets.
s_X_lof = lof.decision_function(X_test)
print('OneClassSVM processing...')
ocsvm = OneClassSVM()
# The SVM is trained on at most ocsvm_max_train rows (and never more than
# n_samples_train - 1).
ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
# reshape(1, -1)[0] flattens the scores into a 1-D array.
s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
# Score the uniform reference sample with the same three fitted detectors.
s_unif_iforest = iforest.decision_function(unif)
s_unif_lof = lof.decision_function(unif)
s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
plt.subplot(121)
# em() is called once per detector with the uniform-sample scores and the
# test-set scores; it returns three values per detector.
auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support, s_unif_iforest, s_X_iforest, n_generated)
auc_lof, em_lof, amax_lof = em(t, t_max, volume_support, s_unif_lof, s_X_lof, n_generated)
# The remaining arguments of this call lie beyond the end of the excerpt.
auc_ocsvm, em_ocsvm, amax_ocsvm = em(t, t_max, volume_support, s_unif_ocsvm, s_X_ocsvm,
class SVMClassifier:
    """One-class SVM trained on the feature vectors of a single label.

    The label seen during training is remembered as ``user_id``; ``test``
    treats vectors carrying that label as positives and everything else as
    negatives.
    """

    def __init__(self, nu=0.1, kernel="rbf", gamma=0.1):
        """Store the hyper-parameters and build the underlying OneClassSVM."""
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma
        # Label the model was trained on; stays None until train() succeeds.
        self.user_id = None
        self.svm = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)

    def get_parameters_string(self):
        """Return ``"<kernel>_<nu>_<gamma>"``."""
        return self.kernel + "_" + str(self.nu) + "_" + str(self.gamma)

    def print_details(self):
        """Print the text returned by get_details()."""
        # Single-argument call form prints the same on Python 2 and 3.
        print(self.get_details())

    def get_details(self):
        """Return a multi-line, human-readable summary of the parameters."""
        lines = [
            "--------------------------",
            "Classifier type: SVM",
            "Parameters",
            "Kernel :" + self.kernel,
            "nu :" + str(self.nu),
            "gamma :" + str(self.gamma),
            "--------------------------",
        ]
        return "\n".join(lines) + "\n"

    def train(self, dataset):
        """Fit the SVM on ``dataset.feature_vectors``.

        Every feature vector must carry the same ``label``. If they do not,
        a message is printed, nothing is fitted and ``user_id`` is left
        untouched. Always returns None.
        """
        vectors = dataset.feature_vectors
        first_label = vectors[0].label
        if any(vector.label != first_label for vector in vectors):
            print("Training set vectors should be of the same label!!!")
            return None
        self.user_id = first_label
        self.svm.fit([vector.values for vector in vectors])

    def test(self, dataset):
        """Predict ``dataset.feature_vectors`` and count the outcomes.

        A vector is a positive when its label equals ``user_id``. Returns
        ``[[tp, tn, fp, fn]]`` (a one-element list). Predictions other than
        1 and -1 are not counted.
        """
        vectors = dataset.feature_vectors
        # Each row is the value list of one labelled sample.
        samples = [vector.values for vector in vectors]
        predictions = self.svm.predict(samples)

        tp = tn = fp = fn = 0
        for vector, predicted in zip(vectors, predictions):
            is_user = vector.label == self.user_id
            if predicted == 1:
                if is_user:
                    tp += 1
                else:
                    fp += 1
            elif predicted == -1:
                if is_user:
                    fn += 1
                else:
                    tn += 1
        return [[tp, tn, fp, fn]]
# Excerpt of a flat script: org_dataset (a DataFrame) and
# org_dataset_features are defined earlier in the file (not visible here).

# The last column of org_dataset holds the class label.
org_dataset_label = org_dataset[len(org_dataset.columns) - 1]
# Rows of org_dataset split by label value 0, 1 and 2.
data_0 = org_dataset.loc[org_dataset[len(org_dataset.columns) - 1] == 0]
data_1 = org_dataset.loc[org_dataset[len(org_dataset.columns) - 1] == 1]
data_2 = org_dataset.loc[org_dataset[len(org_dataset.columns) - 1] == 2]
# Rows labelled 2 followed by rows labelled 0; label-1 rows are left out.
majority_class = data_2.append(data_0)
org_dataset_X = np.array(org_dataset_features)
org_dataset_y = np.ravel(np.array(org_dataset_label))
# 70/30 split; none of the four results is used again within this excerpt.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    org_dataset_X, org_dataset_y, test_size=0.30)
#OC SVM
clf = OneClassSVM()
# NOTE(review): majority_class and org_dataset are row selections of the full
# frame, so the label column is among the columns the SVM is fitted on and
# predicts from - confirm this is intended.
clf.fit(majority_class)
preds = clf.predict(org_dataset)
org = np.array(org_dataset)
#subset1 and larger SPLIT
i = 0
count_1 = 0
count_2 = 0
subset1_X = []
subset1_y = []
larger_X = []
larger_y = []
larger = []
# Walk over the predictions; count_1 counts the rows predicted as -1.
# The excerpt ends here, so the rest of the loop body may be missing.
for i in list(range(0, len(preds))):
    if preds[i] == -1:
        count_1 = count_1 + 1
# Creates and saves model of data as fit by the OC SVM
# args (as read by the code below):
#   argv[1]  CSV of training samples the model is fitted on
#   argv[2]  CSV of samples to predict
#   argv[3]  CSV of true labels for argv[2]; this path is ALSO used as the
#            prefix of every output file (<argv[3]>.joblib,
#            <argv[3]>selfResults.csv, <argv[3]>selfRecall.csv)
# NOTE(review): an earlier version of this header listed the arguments as
# "data file, label file, name for model" - confirm which layout is intended.
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score, precision_score, recall_score
import sys
from joblib import dump, load
import csv

if len(sys.argv) < 4:
    # Fail with a usage line instead of a bare IndexError on sys.argv.
    sys.exit("usage: %s TRAIN_CSV DATA_CSV LABELS_CSV" % sys.argv[0])

train = np.loadtxt(sys.argv[1], delimiter=",")
data = np.loadtxt(sys.argv[2], delimiter=",")
labels = np.loadtxt(sys.argv[3], delimiter=",")

clf = OneClassSVM(kernel='rbf', gamma='scale')
clf.fit(train)
dump(clf, sys.argv[3] + '.joblib')

predicted = clf.predict(data)

# Results of self test. The whole prediction array is written as ONE cell
# (its string form), exactly as before.
# NOTE(review): NumPy abbreviates the string form of long arrays, so large
# prediction vectors may be truncated in this file - confirm consumers.
with open(sys.argv[3] + 'selfResults.csv', "w+") as selfResults:
    csv.writer(selfResults).writerow([predicted])

# Results of self test recall
with open(sys.argv[3] + 'selfRecall.csv', "w+") as selfRecall:
    csv.writer(selfRecall).writerow([recall_score(labels, predicted)])
# Fits a one-class SVM on aggregated training time series and reports the
# predictions for the aggregated test time series.
import numpy as np
from rop_dataextract import *
from sklearn.svm import OneClassSVM
import sys

# Arguments forwarded unchanged to aggrTimeseries().
MAX_EVENT_COUNTERS = 4
TIME_DELTA = 10000
CLUSTER_POINTS = 32
TRAIN_POINTS = 100000
# NOTE(review): -1 presumably means "no limit" for aggrTimeseries - confirm.
TEST_POINTS = -1

svm = OneClassSVM()

# getSetNames / aggrTimeseries presumably come from the rop_dataextract star
# import above; they are not defined in this script.
train_set, test_set = getSetNames(sys.argv)

# print(...) with a single argument behaves the same on Python 2 and 3.
print("aggregating data...")
obs = aggrTimeseries(train_set, TRAIN_POINTS, CLUSTER_POINTS,
                     MAX_EVENT_COUNTERS, TIME_DELTA)
print(len(obs))

print("fitting model...")
svm.fit(obs)

print("aggregating test...")
test = aggrTimeseries(test_set, TEST_POINTS, CLUSTER_POINTS,
                      MAX_EVENT_COUNTERS, TIME_DELTA)

print("testing...")
prediction = svm.predict(test)
# Sum of the predicted values, then the number of predictions.
print(sum(prediction))
print(len(prediction))
# Excerpt of a flat script: kf, X, y, clf, stats, time and start are defined
# earlier in the file (not visible here).

# The return value is discarded.
kf.get_n_splits(X)
# Hyper-parameter search space. Within this excerpt it is referenced only by
# the disabled RandomizedSearchCV line below, as is n_inter.
param_dist = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'nu': stats.uniform(.0, .99),
    'shrinking': [True, False]
}
n_inter = 20
# clf = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_inter, cv=5, scoring="accuracy")
print(kf)
# One fit/predict round per split produced by kf.
for train_index, test_index in kf.split(X):
    # "Rodada" is Portuguese for "round".
    print("Rodada")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # clf is re-fitted on every split and rebound to the value fit() returns.
    clf = clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    # "Error" here is the number of samples predicted as -1; y_test is not
    # consulted.
    n_error_train = y_pred_train[y_pred_train == -1].size
    n_error_test = y_pred_test[y_pred_test == -1].size
    print("Train error: {:d}".format(n_error_train))
    print("Test error: {:d}".format(n_error_test))
# NOTE(review): the source lost its indentation; the timing is assumed to
# happen once, after the loop - confirm.
end = time.time()
print("It took: %.2f seconds" % (end - start))
def classifier(data):
    """Flag unusual records with a one-class SVM and report them.

    data is a sequence of mappings with the keys "coverage", "num_exons",
    "distance_to_next" and "id". The three numeric fields are scaled with
    preprocessing.scale, a OneClassSVM (nu=.1, rbf kernel) is fitted on a
    random sample of up to 20000 rows, and a second random sample of up to
    2000 rows is plotted to "DistanceVCoverage.pdf" (blue = predicted 1,
    red = anything else).

    Finally every record is predicted: the "id" of each record predicted as
    -1 is printed to stdout, and summary counts / mean exon numbers of the
    multi-exon records go to stderr. Returns None.
    """
    from sklearn.svm import OneClassSVM
    from sklearn import preprocessing

    # Columns: 0 = coverage, 1 = num_exons, 2 = distance_to_next.
    rows = [[val["coverage"], val["num_exons"], val["distance_to_next"]]
            for val in data]
    scaled = preprocessing.scale(rows)
    # random.sample wants a real sequence, and raises ValueError when asked
    # for more items than exist, so hand it a list and cap the sample sizes.
    scaled_rows = list(scaled)

    # Learn a frontier for outlier detection
    sample = random.sample(scaled_rows, min(20000, len(scaled_rows)))
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(scaled_rows, min(2000, len(scaled_rows)))

    sys.stderr.write("fitting data\n")
    clf.fit(sample)
    sys.stderr.write("predicting data\n")
    Y = clf.predict(test)

    sys.stderr.write("plotting data\n")
    fig, axes = subplots()
    colors = ['blue' if flag == 1 else 'red' for flag in Y]
    # NOTE(review): x is column 2 (distance_to_next) and y is column 1
    # (num_exons), yet the axes are labelled "coverage" / "distance" and the
    # file is named DistanceVCoverage - confirm which columns are intended.
    axes.scatter([row[2] for row in test], [row[1] for row in test], c=colors)
    ylabel("distance")
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")

    full_test = clf.predict(scaled)
    novel, regular = [], []
    for record, result in zip(data, full_test):
        if result == -1:
            print(record["id"])
            novel.append(record["num_exons"])
        else:
            regular.append(record["num_exons"])

    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    sys.stderr.write("novel, regular\n")
    sys.stderr.write("%s %s\n" % (len(novel), len(regular)))
    sys.stderr.write("%s %s %s %s\n" % (
        mean(multi_exon_novel), mean(multi_exon_regular),
        len(multi_exon_novel), len(multi_exon_regular)))
def base_experiment(config, ntrials=1, seed=123456789):
    """
    Run a single experiment, locally.

    Digit 0 of the MNIST training split is the base class (800 training and
    200 test samples); digits 1-9 provide 200 novelty samples each. For every
    trial an SPRegion and a linear OneClassSVM are fitted on the base-class
    training samples and scored on the base-class test samples and on each
    novelty class. All intermediate statistics are logged through
    sp._log_stats.

    @param config: The configuration parameters to use for the SP. Its 'seed'
    entry is overwritten in place on every trial.

    @param ntrials: The number of times to repeat the experiment.

    @param seed: The random seed to use.

    @return: A tuple (sp_x_results, sp_y_results, svm_x_results,
    svm_y_results) of percentage errors. The *_x_* items are arrays with one
    base-class error per trial. The *_y_* items are lists with one entry per
    trial, each entry being an array with one error per novelty class.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    clf_th = 0.5

    # Seed numpy
    np.random.seed(seed)

    # Get the data (the test split returned by load_mnist is not used)
    (tr_x, tr_y), _ = load_mnist()
    tr_x_0 = np.random.permutation(tr_x[tr_y == 0])
    x_tr = tr_x_0[:ntrain]
    x_te = tr_x_0[ntrain:ntrain + ntest]
    outliers = [np.random.permutation(tr_x[tr_y == digit])[:ntest]
                for digit in range(1, 10)]
    n_novel = len(outliers)

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te, o_y_te, c_y_te = [], [], []
    for outlier in outliers:
        u_y_te.append(metrics.compute_uniqueness(outlier))
        o_y_te.append(metrics.compute_overlap(outlier))
        c_y_te.append(1 - metrics.compute_distance(outlier))

    # Initialize the overall results. The novelty results are written below
    # as one per-class vector per trial, so they are allocated the same way:
    # indexed by trial, each entry holding one value per novelty class.
    sp_x_results = np.zeros(ntrials)
    sp_y_results = [np.zeros(n_novel) for _ in range(ntrials)]
    svm_x_results = np.zeros(ntrials)
    svm_y_results = [np.zeros(n_novel) for _ in range(ntrials)]

    # Iterate across the trials:
    for nt in range(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = [sp.predict(outlier) for outlier in outliers]

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te, o_sp_y_te, c_sp_y_te = [], [], []
        for y in sp_y_te:
            u_sp_y_te.append(metrics.compute_uniqueness(y))
            o_sp_y_te.append(metrics.compute_overlap(y))
            c_sp_y_te.append(1 - metrics.compute_distance(y))

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        per_class = zip(u_y_te, o_y_te, c_y_te,
                        u_sp_y_te, o_sp_y_te, c_sp_y_te)
        for i, (in_u, in_o, in_c, sp_u, sp_o, sp_c) in enumerate(per_class, 1):
            sp._log_stats('Input Novelty Class {0} Uniqueness'.format(i), in_u)
            sp._log_stats('Input Novelty Class {0} Overlap'.format(i), in_o)
            sp._log_stats('Input Novelty Class {0} Correlation'.format(i),
                          in_c)
            sp._log_stats('SP Novelty Class {0} Uniqueness'.format(i), sp_u)
            sp._log_stats('SP Novelty Class {0} Overlap'.format(i), sp_o)
            sp._log_stats('SP Novelty Class {0} Correlation'.format(i), sp_c)

        # Get average representation of the base class: the column-wise mean
        # of the SP output, thresholded to 1 where it is >= 0.5 and to 0
        # everywhere else.
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type. Every test sample is paired
        # with the base representation; novelty sample i of every class is
        # paired with it in the same pass.
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp = np.zeros(n_novel)
        o_sp = np.zeros(n_novel)
        c_sp = np.zeros(n_novel)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)

            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                u_sp[j] += metrics.compute_uniqueness(yt)
                o_sp[j] += metrics.compute_overlap(yt)
                c_sp[j] += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        u_sp /= ntest
        o_sp /= ntest
        c_sp /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness',
                      u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation',
                      c_sp_base_to_x_te)
        for i in range(n_novel):
            j = i + 1
            sp._log_stats('Base Train to Novelty {0} Uniqueness'.format(j),
                          u_sp[i])
            sp._log_stats('Base Train to Novelty {0} Overlap'.format(j),
                          o_sp[i])
            sp._log_stats('Base Train to Novelty {0} Correlation'.format(j),
                          c_sp[i])

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance: percentage of base test samples
        # predicted 1, and percentage of each novelty class predicted -1.
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
            100
        svm_y_te = np.array([
            len(np.where(clf.predict(outlier) == -1)[0]) / float(ntest) * 100
            for outlier in outliers])

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = np.zeros(n_novel)
        for i, x in enumerate(sp_x_te):
            xt = np.vstack((sp_base_result, x))
            xo = metrics.compute_overlap(xt)
            if xo >= clf_th:
                clf_x_te += 1

            for j, yi in enumerate(sp_y_te):
                yt = np.vstack((sp_base_result, yi[i]))
                yo = metrics.compute_overlap(yt)
                if yo < clf_th:
                    clf_y_te[j] += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[nt] = 100 - clf_x_te
        sp_y_results[nt] = 100 - clf_y_te
        svm_x_results[nt] = 100 - svm_x_te
        svm_y_results[nt] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        for i in range(n_novel):
            j = i + 1
            sp._log_stats('SP % Correct Novelty Class {0}'.format(j),
                          clf_y_te[i])
            sp._log_stats('SVM % Correct Novelty Class {0}'.format(j),
                          svm_y_te[i])
        sp._log_stats('SP % Mean Correct Novelty Class', np.mean(clf_y_te))
        sp._log_stats('SVM % Mean Correct Novelty Class', np.mean(svm_y_te))
        sp._log_stats('SP % Adjusted Score',
                      (np.mean(clf_y_te) * clf_x_te) / 100)
        sp._log_stats('SVM % Adjusted Score',
                      (np.mean(svm_y_te) * svm_x_te) / 100)

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results