class svm_model():

    def train(self, X, ker):
        self.model = OneClassSVM(kernel=ker, shrinking=True, random_state=1)
        self.model.fit(X)

    def predict(self, X):
        return self.model.predict(X)
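# Usage sketch (illustrative, not from the original source): drives the svm_model
# wrapper above. The 200x5 random matrix and the 'rbf' kernel are assumptions, and
# OneClassSVM only accepts a random_state argument on older scikit-learn releases,
# so on 0.24+ drop that argument from train() before running this.
import numpy as np

demo_X = np.random.rand(200, 5)
demo_model = svm_model()
demo_model.train(demo_X, 'rbf')
print(demo_model.predict(demo_X))   # +1 = inlier, -1 = outlier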
def main():
    n = 1000
    data = []
    for i in range(n):
        data.append(np.array([np.random.randint(0, 5000)
                              for i in range(np.random.randint(20, 150))]))
    # rows have different lengths, so keep an explicit object array
    data = np.array(data, dtype=object)

    # making all the data into 5 dimensions
    # howto : boxplot
    x = []
    y = []
    for i in data:
        sorted_i = sorted(i)
        x.append([max(sorted_i), np.percentile(sorted_i, 75), np.median(sorted_i),
                  np.percentile(sorted_i, 25), min(sorted_i)])
        y.append(0)
    x = np.array(x)

    '''
    # making all the data into 5 dimensions
    # howto : distance
    start = time.time()
    data_i = 0
    cnt = 1
    x = np.zeros((n, n))
    for i in data:
        data_j = data_i
        for j in data[cnt:]:
            dist = dtw(i, j, dist=lambda i, j: norm(i - j, ord=1))[0]
            x[data_i][data_j+1], x[data_j+1][data_i] = dist, dist
            data_j += 1
        cnt += 1
        data_i += 1
    end = time.time()
    print(end - start)
    '''

    # build model with x
    model = OneClassSVM()
    model.fit(x)

    # create test dataset
    test = []
    for i in range(10):
        test.append(np.array([np.random.randint(0, 10000)
                              for i in range(np.random.randint(20000, 30000))]))
    test = np.array(test, dtype=object)

    # transform test dataset
    x = []
    y = []
    for i in test:
        sorted_i = sorted(i)
        x.append([max(sorted_i), np.percentile(sorted_i, 75), np.median(sorted_i),
                  np.percentile(sorted_i, 25), min(sorted_i)])
        y.append(0)
    x = np.array(x)

    # predict test dataset
    pred = model.predict(x)
def fit(self, X, Y, W):
    clf = OneClassSVM(kernel=self.kernel, degree=self.degree, gamma=self.gamma,
                      coef0=self.coef0, tol=self.tol, nu=self.nu,
                      shrinking=self.shrinking, cache_size=self.cache_size,
                      max_iter=self.max_iter)
    if W is not None:
        # W carries per-sample weights, which OneClassSVM accepts via the
        # sample_weight keyword
        return OneClassSVMClassifier(clf.fit(X, sample_weight=W.reshape(-1)))
    return OneClassSVMClassifier(clf.fit(X))
class Cluster(object):

    def __init__(self, name):
        self.name = name
        self.raw_dataset = []
        self.dataset = []
        self.dataset_red = []

    def get_featurevec(self, data):
        '''Takes in data in the form of an array of EmoPackets, and outputs
           a list of feature vectors.'''
        # CHECKED, all good :)
        # integer division so that range() below receives an int
        num_bins = (len(data) // int(dsp.SAMPLE_RATE * dsp.STAGGER) -
                    int(dsp.BIN_SIZE / dsp.STAGGER) + 1)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        starts = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        points = []
        for i in range(num_bins):
            points.append(dsp.get_features(data[i * starts:i * starts + size]))
        return points

    def add_data(self, raw):
        '''Allows the addition of new data. Will retrain upon addition.
           Expects a list of EmoPackets.'''
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        '''Does feature extraction for all of the datasets.'''
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.dataset)
        # RandomizedPCA comes from older scikit-learn; PCA(svd_solver='randomized')
        # is the modern equivalent
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)

    def train(self):
        '''Trains the classifier.'''
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        '''Says whether or not the bin is novel. Expects an array of EmoPackets.'''
        # a single feature vector, reshaped to the 2-D layout transform() expects
        X = self.pca.transform(np.array(self.get_featurevec(pt)[0]).reshape(1, -1))
        ans = self.svm.predict(X)
        # dataset_red is an ndarray, so stack the new row rather than append
        self.dataset_red = np.vstack((self.dataset_red, X))
        self.train()
        return ans

    def save(self):
        '''Saves this classifier to a data directory.'''
        this_dir, this_filename = os.path.split(__file__)
        DATA_PATH = os.path.join(this_dir, "data", self.name + '.pkl')
        dumpfile = open(DATA_PATH, "wb")
        pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
        dumpfile.close()
def determine_test_similarity(self, model):
    clf_OCSVM = {}
    model_OCSVM = {}
    for i in range(len(model)):
        clf = OneClassSVM(kernel='rbf', nu=0.1, gamma=.023)
        clf_OCSVM[i] = clf
        OCSVMmodel = clf.fit(model[i])
        model_OCSVM[i] = OCSVMmodel
    return clf_OCSVM, model_OCSVM
def runClassifier(self, _driverId, numComponents=0):
    # list() wrappers keep this working under Python 3, where dict views and map()
    # are lazy
    X = list(self.featuresHash.values())
    self.ids = list(self.featuresHash.keys())
    if self.runDimRed:
        X = self.dimRed(X, numComponents)
    clf = OCSVM(nu=self.nu, gamma=self.gamma)
    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * self.outliers_fraction)
    self.label = y_pred > threshold
    self.label = list(map(int, self.label))
def select_best_support_vectors(data, nu=0.01, all_gammas=2 ** np.arange(-10, 10, 1)):
    all_errors = []
    for gamma in all_gammas:
        clf = OneClassSVM(nu=nu, gamma=gamma)
        clf.fit(data)
        prediction = clf.predict(data)
        out_of_class_count = np.sum(prediction == -1)
        support_vectors_count = len(clf.support_vectors_)
        error = (float(out_of_class_count) / len(data) - nu) ** 2
        error += (float(support_vectors_count) / len(data) - nu) ** 2
        all_errors.append(error)
    index = np.argmin(all_errors)
    return all_gammas[index], all_errors
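# Usage sketch (illustrative; the toy data and nu value are assumptions): pick the
# gamma whose outlier rate and support-vector fraction both stay closest to nu.
import numpy as np

toy_data = np.random.randn(300, 4)
best_gamma, gamma_errors = select_best_support_vectors(toy_data, nu=0.05)
print("selected gamma:", best_gamma)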
def embed_dat_matrix_two_dimensions(low_dimension_data_matrix, y=None, labels=None, density_colormap='Blues', instance_colormap='YlOrRd'): from sklearn.preprocessing import scale low_dimension_data_matrix = scale(low_dimension_data_matrix) # make mesh x_min, x_max = low_dimension_data_matrix[:, 0].min(), low_dimension_data_matrix[:, 0].max() y_min, y_max = low_dimension_data_matrix[:, 1].min(), low_dimension_data_matrix[:, 1].max() step_num = 50 h = min((x_max - x_min) / step_num, (y_max - y_min) / step_num) # step size in the mesh b = h * 10 # border size x_min, x_max = low_dimension_data_matrix[:, 0].min() - b, low_dimension_data_matrix[:, 0].max() + b y_min, y_max = low_dimension_data_matrix[:, 1].min() - b, low_dimension_data_matrix[:, 1].max() + b xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # induce a one class model to estimate densities from sklearn.svm import OneClassSVM gamma = max(x_max - x_min, y_max - y_min) clf = OneClassSVM(gamma=gamma, nu=0.1) clf.fit(low_dimension_data_matrix) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, m_max] . [y_min, y_max]. if hasattr(clf, "decision_function"): score_matrix = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) else: score_matrix = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] # Put the result into a color plot levels = np.linspace(min(score_matrix), max(score_matrix), 40) score_matrix = score_matrix.reshape(xx.shape) if y is None: y = 'white' plt.contourf(xx, yy, score_matrix, cmap=plt.get_cmap(density_colormap), alpha=0.9, levels=levels) plt.scatter(low_dimension_data_matrix[:, 0], low_dimension_data_matrix[:, 1], alpha=.5, s=70, edgecolors='gray', c=y, cmap=plt.get_cmap(instance_colormap)) # labels if labels is not None: for id in range(low_dimension_data_matrix.shape[0]): label = labels[id] x = low_dimension_data_matrix[id, 0] y = low_dimension_data_matrix[id, 1] plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
def outlier_detect(data_frame): #pandas to numpy - digestible by scikit columns = ['blm_tag_count','protest_count','justice_count','riot_count','breathe_count'] features = data_frame[list(columns)].values clf = OneClassSVM(nu=0.008, gamma=0.05) clf.fit(features) y_pred = clf.predict(features) mask=[y_pred==-1] oak_array = np.asarray(data_frame.hourly) protest_predict = oak_array[mask] protest_hours = list(protest_predict) return protest_hours
def svm(data, fraction=0.05, kernel='poly', degree=3, gamma=0, coeff=0):
    svm = OneClassSVM(kernel=kernel, degree=degree, gamma=gamma, nu=fraction,
                      coef0=coeff)
    svm.fit(data)
    score = svm.predict(data)
    # 1-based row numbers for every sample
    numeration = np.array([[i] for i in range(1, len(data) + 1, 1)])
    # keep only the row numbers the model labels as outliers (-1)
    anomalies = numeration[score == -1]
    return anomalies
def select_best_outlier_fraction_cross_val(data, nu=0.05,
                                           all_gammas=2 ** np.arange(-10, 10, 50),
                                           folds_count=7):
    # note: np.arange(-10, 10, 50) yields only [-10], so this default grid holds a
    # single gamma; pass a denser grid (e.g. 2 ** np.linspace(-10, 10, 50)) to search
    all_errors = []
    kf_iterator = KFold(len(data), n_folds=folds_count)
    for gamma in all_gammas:
        error = 0
        for train, test in kf_iterator:
            train_data = data[train, :]
            test_data = data[test, :]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            outlier_fraction = np.mean(prediction == -1)
            error += (nu - outlier_fraction) ** 2 + \
                     (float(clf.support_vectors_.shape[0]) / len(data) - nu) ** 2
        all_errors.append(error / folds_count)
    # choose the gamma with the smallest averaged error
    best_index = np.argmin(all_errors)
    return int(best_index), all_errors
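# A minimal sketch of the same selection criterion written against the modern
# sklearn.model_selection.KFold API (the snippet above targets the legacy
# cross_validation.KFold signature). The toy data and gamma grid are assumptions.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import OneClassSVM

def cv_error_for_gamma(data, gamma, nu=0.05, folds=5):
    error = 0.0
    for train_idx, test_idx in KFold(n_splits=folds).split(data):
        clf = OneClassSVM(nu=nu, gamma=gamma).fit(data[train_idx])
        outlier_fraction = np.mean(clf.predict(data[test_idx]) == -1)
        error += (nu - outlier_fraction) ** 2 + \
                 (clf.support_vectors_.shape[0] / len(data) - nu) ** 2
    return error / folds

toy = np.random.randn(200, 3)
gammas = 2.0 ** np.linspace(-10, 10, 20)
errors = [cv_error_for_gamma(toy, g) for g in gammas]
print("best gamma:", gammas[int(np.argmin(errors))])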
class OneClassSVMDetector(BaseOutlier): @staticmethod def get_attributes(): return { "nu":0.1, "kernel":['rbf','linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], "gamma":0.1, } def __init__(self,nu=0.1,kernel='rbf',gamma=0.1): self.nu = nu self.kernel = kernel self.gamma = gamma def fit(self,data=None): self.data = data self.check_finite(data) if(self._is_using_pandas(data)==True): self.data.interpolate(inplace=True) # self.datareshap = data.reshape(-1,1) self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel, gamma=self.gamma) self.clf.fit(data.reshape(-1,1)) # print "done" return self def predict(self, X_test): y_pred_train = self.clf.predict(X_test.reshape(-1,1)) outlier_idx = np.where(y_pred_train == -1) inlier_idx = np.where(y_pred_train == 1) d = { 'timestamp': self.data.index[outlier_idx], 'anoms': self.data.iloc[outlier_idx] } anoms = pd.DataFrame(d) self.anomaly_idx = anoms.index self.anom_val = anoms['anoms'] return anoms def fit_predict(self, data=None): self.fit(data) return self.predict(data) def plot(self): import matplotlib.pyplot as plt f, ax = plt.subplots(1, 1) ax.plot(self.data, 'b') ax.plot(self.anomaly_idx, self.anom_val, 'ro') ax.set_title('Detected Anomalies') ax.set_ylabel('Count') f.tight_layout() return f
def cross_validate(): #for tinkering with the model #read data all_df = pd.read_csv('./data/train.csv',index_col = 'ID') #split data zeros_df = all_df[all_df.TARGET == 0] ones_df = all_df[all_df.TARGET == 1] num_ones = ones_df.shape[0] msk = np.random.permutation(len(zeros_df)) < num_ones zeros_train_df = zeros_df[~msk] zeros_test_df = zeros_df[msk] ones_test_df = ones_df train_df = zeros_train_df test_df = pd.concat([zeros_test_df,ones_test_df]) train_X = np.array(train_df.drop('TARGET', axis = 1)) train_Y = np.array(train_df.TARGET) test_X = np.array(test_df.drop('TARGET',axis = 1)) test_Y = np.array(test_df.TARGET) #true target values #init svm print('training svm') my_svm = OneClassSVM(verbose = True) my_svm.fit(train_X) #predict print('predicting') predictions = my_svm.predict(test_X) conf_matrix = confusion_matrix(test_Y,predictions) print('confusion matrix:') print(pd.DataFrame(conf_matrix,columns = [0,1])) print('accuracy:') print(sum(test_Y.reshape(predictions.shape) == predictions)/len(test_Y))
def find_anomaly(label1, label2, winsize): print("Find anomaly in channel", label1 + '-' + label2 + '...', file=sys.stderr) print("-"*80) print("Channel [" + label1 + '-' + label2 + ']') print("-"*80) # find difference electrode1 = eeg.chan_lab.index(label1) electrode2 = eeg.chan_lab.index(label2) wave = eeg.X[electrode1] - eeg.X[electrode2] # # import random # wave = [random.uniform(-20,20) for _ in range(400*30)] + [random.uniform(-2000,2000) for _ in range(5*30)] # wave = np.array(wave) print("Splitting into windows...", file=sys.stderr) wave_windows = np.array_split(wave, len(wave)/eeg.sample_rate/winsize) # wave_windows = np.array_split(wave, len(wave)/winsize) print("Extracting features...", file=sys.stderr) def extract_features(wave_window): max_val = max(wave_window) min_val = min(wave_window) stdev = np.std(wave_window) sum_val = sum(wave_window) sum_pos_val = sum([x for x in wave_window if x > 0]) sum_abs_val = sum([abs(x) for x in wave_window]) return [max_val, min_val, stdev, sum_val, sum_pos_val, sum_abs_val] Examples = np.array(map(extract_features, wave_windows)) print("Training model, assuming no more than", CONTAMINATION, "anomaly...", file=sys.stderr) od = OneClassSVM(nu=CONTAMINATION, kernel='poly', gamma=0.05, max_iter=100000) od.fit(Examples) decisions = od.decision_function(Examples) # print decisions # print max(decisions), min(decisions) print("Most likely windows with anomaly:") # find most likely windows, in desc order largest_indices = np.argsort((-np.absolute(decisions)).ravel())[:20] for large_index in largest_indices: print(large_index*winsize/60, "min (score:", decisions[large_index][0], ")") sys.stdout.flush()
def remove_outliers_SVM(self): ## Remove outliers using a OneClassSVM method print "Running SVM to remove outliers..." svm = OneClassSVM(kernel='rbf', nu=0.1, degree=3, verbose=1) fit = svm.fit(self.DataArray) decision = svm.decision_function(self.DataArray) _indices = [] # If a value is below the decision hyperplane, eliminate it for i in range(len(decision)): if decision[i] < 0: _indices.append(i) print self.DataArray.shape self.DataArray = np.delete(self.DataArray, _indices, axis=0) self.TargetArray = np.delete(self.TargetArray, _indices, axis=0) print self.DataArray.shape
def plot_scatter(X_dict, y_dict, col1, col2, max_error, max_filled_gap, insens, f_colors = ['yellow', 'red', 'blue'], nu=0.98, high=0.95): planes = sorted(X_dict.keys()) planes_with_failures = sorted([key for key in X_dict.keys() if y_dict[key].sum()>0]) ocsvm = OneClassSVM(kernel='linear', nu=0.98) X_train = pd.concat(dict([(plane, X_dict[plane][[col1, col2]].dropna()) for plane in planes_with_failures])) ocsvm.fit(X_train.values) qb = QuantileBinarizer(low=0.0, high=0.95, each_side=False) qb.fit(X_train) mask_pref = pd.concat(dict( [(plane, get_mask_pref(y_dict[plane], max_error)) for plane in planes]), axis=0) mask_norm = pd.concat(dict( [(plane, get_mask_norm(y_dict[plane], max_error, insens)) for plane in planes]), axis=0) fig = plt.figure(figsize=(15,15), dpi=100) # plt.xlabel('Norm of res. phase: %s, group: %s' % (col1[0], str(col_groups[col1[0]][int(col1[1][-1])]))) # plt.ylabel('Norm of res. phase: %s, group: %s' % (col2[0], str(col_groups[col2[0]][int(col2[1][-1])]))) plt.xlabel(col1) plt.ylabel(col2) plot_norm = plt.scatter(pd.concat(X_dict)[col1].loc[mask_norm], pd.concat(X_dict)[col2].loc[mask_norm], c='lightgrey', zorder=1, s=6) plot_pref = [] for i, plane in enumerate(planes_with_failures): plot_pref.append(plt.scatter(X_dict[plane][col1].loc[get_mask_pref(y_dict[plane], max_error)], X_dict[plane][col2].loc[get_mask_pref(y_dict[plane], max_error)], c=f_colors[i], zorder=2, s=30)) x_min, x_max, y_min, y_max = plt.axis('tight') plt.axvline(qb._thresholds[col1]['high'], c='green') plt.axhline(qb._thresholds[col2]['high'], c='green') plot_line = plt.plot([x_min, x_max], [(ocsvm.intercept_ - ocsvm.coef_[0][0] * x_min) / ocsvm.coef_[0][1], (ocsvm.intercept_ - ocsvm.coef_[0][0] * x_max) / ocsvm.coef_[0][1]], c='red') # # plt.legend((plot_norm, plot_pref), ('No-failure', 'Pre-failure'), # # scatterpoints=1, loc='upper right', ncol=1) # #plt.savefig('./scatter/pair_group_of_fours3.png')
def predict_header_features(self, pkt_featurizer): group_id = pkt_featurizer.pkt_type features = pkt_featurizer.features arrival_time = pkt_featurizer.arrival_time try: vectorizer = DictVectorizer() vectorizer.fit(self.training_data[group_id]) training_data_vectorized = vectorizer.transform(self.training_data[group_id]) features_vectorized = vectorizer.transform(features) scaler = preprocessing.StandardScaler(with_mean=False) training_data_vectorized = scaler.fit_transform(training_data_vectorized) features_vectorized = scaler.transform(features_vectorized) classifier = OneClassSVM() classifier.fit(training_data_vectorized) result = classifier.predict(features_vectorized) distance = classifier.decision_function(features_vectorized) except KeyError: result = 0 distance = 0 return result, distance
class TwoStage(object):

    def __init__(self, *args, **kwargs):
        super(TwoStage, self).__init__(*args, **kwargs)
        self._oneCls = OneClassSVM(nu=NU, gamma=GAMMA)
        self._clf = RandomForestClassifier(n_estimators=30)
        self._scaler = StandardScaler()

    def fit(self, data, labels):
        sdata = self._scaler.fit_transform(data)
        self._oneCls.fit(sdata)
        self._clf.fit(sdata, labels)
        return self

    def predict(self, data):
        sdata = self._scaler.transform(data)
        is_known_cls = self._oneCls.predict(sdata)
        cls = self._clf.predict(sdata)
        cls[is_known_cls == -1] = "zother"
        classes = list(self._clf.classes_) + ["zother"]
        return cls, classes
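# Usage sketch (illustrative): NU and GAMMA are module-level constants in the original
# code, so the values below are placeholders, as is the toy data. The string labels
# are six characters long so that the "zother" marker fits into the predicted label
# array without truncation.
import numpy as np

NU, GAMMA = 0.1, 0.1
train_X = np.random.randn(120, 4)
train_y = np.array(["classA"] * 60 + ["classB"] * 60)
stage = TwoStage().fit(train_X, train_y)
preds, class_names = stage.predict(np.random.randn(10, 4))
print(preds, class_names)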
class NoveltySeparator(BaseEstimator):

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # lets treat users spending something in the rest of the month as outliers
        inliers = y - X[:, 0]
        inliers = np.where(inliers < 0.1, True, False)

        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])

        # predicted
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        self.est_outliers.fit(X[outliers], y[outliers])

    def predict(self, X):
        y = np.zeros(X.shape[0])

        labels = self.detector.predict(X).reshape(X.shape[0])
        inliers = labels == 1
        outliers = labels == -1

        y[inliers] = self.est_inliers.predict(X[inliers])
        y[outliers] = self.est_outliers.predict(X[outliers])

        return y
def predict_pkt_length_features(self, pkt_featurizer): group_id = pkt_featurizer.pkt_type try: dbscan = DBSCAN() pkt_lengths = np.array(list(self.pkt_lengths[group_id])+[pkt_featurizer.len_bytes]).reshape(-1,1) labels = dbscan.fit_predict(pkt_lengths) dbscan_prediction = labels[-1] == -1 if self.plot: self.plot_1d_dbscan(pkt_lengths, labels, range(len(pkt_lengths)), self.pkt_lengths_fig_dbscan, "", "Pkt Length", "Pkt Length DBSCAN Clustering - Anomalous Pkts in Black") one_class_svm = OneClassSVM() scaler = preprocessing.StandardScaler() pkt_lengths_scaled = scaler.fit_transform(np.array(self.pkt_lengths[group_id]).reshape(-1,1)) features_scaled = scaler.transform(np.array(pkt_featurizer.len_bytes).reshape(1,-1)) one_class_svm.fit(pkt_lengths_scaled) svm_prediction = one_class_svm.predict(features_scaled) if self.plot and len(pkt_lengths_scaled) > 2: self.plot_1d_svm(self.pkt_lengths[group_id], one_class_svm, range(len(self.pkt_lengths[group_id])), scaler, self.pkt_lengths_fig_svm, "Pkt", "Pkt Length", "Pkt Length One Class SVM Classification") except (KeyError, IndexError) as e: print e dbscan_prediction = 0 return dbscan_prediction
def check_authors_vocabulary(category): """Use 80% of the authors as training set and the rest as test set. A good score validates the assumption that there is a defined vocabulary for a given category. """ print category with open('%s_tweets.json' % category, 'r') as f: tweets = json.load(f) tweets_by_author = defaultdict(list) for tweet in tweets['tweets']: tweets_by_author[tweet['author_name']].append(tweet) authors = tweets_by_author.keys() training_set_count = len(authors) * 80 / 100 training_authors = random.sample(authors, training_set_count) test_authors = list(set(authors) - set(training_authors)) train_set = [] test_set = [] for author, tweets in tweets_by_author.items(): if author in training_authors: train_set.extend([prepare_tweet(t['text']) for t in tweets]) else: test_set.extend([prepare_tweet(t['text']) for t in tweets]) vectorizer = CountVectorizer( max_features=10000, #stop_words='english', max_df=0.7) classifier = OneClassSVM() text_clf = Pipeline([('vect', vectorizer), ('tfidf', TfidfTransformer(sublinear_tf=True, norm='l2')), ('clf', classifier)]) text_clf = text_clf.fit(train_set) predicted = text_clf.predict(test_set) print np.mean(predicted == 1) print classification_report([1 for _ in range(len(test_set))], predicted)
def __init__(self, embedder, detector, G2V_nhid=128, G2V_wl_iter=2, FGSD_hist_bins=200, IF_n_trees=200, IF_sample_ratio=0.5, LOF_n_neighbors=20, LOF_n_leaf=30, normalize_embedding=False, **kwargs): embedders = { 'Graph2Vec': Graph2Vec(wl_iterations=G2V_wl_iter, dimensions=G2V_nhid, attributed=True, epochs=50), 'FGSD': FGSD(hist_bins=FGSD_hist_bins, hist_range=20) } detectors = { 'IF': IsolationForest(n_estimators=IF_n_trees, max_samples=IF_sample_ratio, contamination=0.1), 'LOF': LocalOutlierFactor(n_neighbors=LOF_n_neighbors, leaf_size=LOF_n_leaf, contamination=0.1), 'OCSVM': OneClassSVM(gamma='scale', nu=0.1) } assert embedder in embedders.keys() assert detector in detectors.keys() self.embedder = embedders[embedder] self.detector = detectors[detector] self.embedder_name = embedder self.detector_name = detector self.normalize_embedding = normalize_embedding
def thread_monitoring_pre_train(self): ######################################################## ## Normalize and apply PCA to the training data result = self.pca.fit_transform( self.scaler.fit_transform(self.anomaly_data)) ## First element of the Tuple is True or False either if it is a neighbourhood-based method or not self.anomaly_algorithms[0][2] = EllipticEnvelope( support_fraction=1, contamination=self.contamination) self.anomaly_algorithms[1][2] = DBSCAN(eps=self.avg_dist( result[:, 0], result[:, 1]), metric='euclidean', min_samples=2) self.anomaly_algorithms[2][2] = OneClassSVM(kernel='rbf', nu=self.contamination, gamma=0.05) ######################################################## ## Predict outliers - use DBSCAN (unsupervised technique) for first fitering of outliers ## get predictions for all training data DBSCAN_index = 1 predictions_temp = self.anomaly_algorithms[DBSCAN_index][ 2].fit_predict(result) ######################################################### ## Filter data - for each element of the training data filtered_anomaly = np.array([]) for temp_i in np.arange(len(self.anomaly_data)): ## If sample is not outlier if predictions_temp[temp_i] != -1: if len(filtered_anomaly) == 0: filtered_anomaly = self.anomaly_data[temp_i] else: filtered_anomaly = np.vstack( (filtered_anomaly, self.anomaly_data[temp_i])) ########################################################## ## Update data self.anomaly_data = filtered_anomaly ## Train algorithms self.thread_monitoring_train()
def test(): np.random.seed(42) dataset = datasets.load_iris() test_indices = np.random.choice(150, 10) test_set = dataset.data[test_indices, :] test_set = np.vstack((test_set, np.array([[0.0, 0.0, 0.0, 0.0]]))) test_set = np.vstack((test_set, np.array([[10.0, 10.0, 10.0, 10.0]]))) print "GMM" gmm = GMM(n_components=3, covariance_type='diag') bc_gmm = BackgroundCheck(estimator=gmm, mu=0.0, m=1.0) bc_gmm.fit(dataset.data) print bc_gmm.predict_proba(test_set) print "OneClassSVM" sv = OneClassSVM() bc_sv = BackgroundCheck(estimator=sv, mu=0.0, m=1.0) bc_sv.fit(dataset.data) print bc_sv.predict_proba(test_set)
def COSVM(training_data, testing_data, nu_list, kernel_list):
    # Build SVM model
    clf = OneClassSVM(nu=nu_list, kernel=kernel_list, gamma=0.1)
    clf.fit(training_data)
    y_pred_test = clf.predict(testing_data)
    n_error_test = y_pred_test[y_pred_test == -1].size
    testing_accuracy = 1 - 1.0 * n_error_test / testing_data.shape[0]
    # print(n_error_test)
    # print('final accuracy on testing data: ', testing_accuracy, '\n')
    test_score = clf.decision_function(testing_data)
    test_score = test_score.reshape(-1)
    return test_score
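# Usage sketch (illustrative; despite the parameter names, nu_list and kernel_list are
# passed straight through to OneClassSVM, so a single nu value and a single kernel
# string are expected). The toy data below is an assumption.
import numpy as np

demo_train = np.random.randn(200, 3)
demo_test = np.vstack([np.random.randn(20, 3), np.random.randn(5, 3) + 4.0])
demo_scores = COSVM(demo_train, demo_test, nu_list=0.1, kernel_list='rbf')
print(demo_scores.shape)   # one decision score per test row; lower means more anomalous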
def model(self, nu=[0.001, 0.01, 0.001], contamination=[0.001, 0.001, 0.001]):
    if self.verbose:
        print(datetime.now(), 'the model is being created ...')
    self.ocsvm_rbf = OneClassSVM(gamma='scale', kernel='rbf', nu=nu[0])
    self.ocsvm_sigmoid = OneClassSVM(gamma='auto', kernel='sigmoid', nu=nu[1])
    self.ocsvm_linear = OneClassSVM(kernel='linear', nu=nu[2])
    self.ifo = IsolationForest(contamination=contamination[0])
    self.lof = LocalOutlierFactor(contamination=contamination[1], novelty=True)
    self.ee = EllipticEnvelope(contamination=contamination[2])
    if self.verbose:
        print(datetime.now(), 'the model is ready.')
def data_learn(name="song_data"): """GET endpoint for training model Arguments: name -- the name of file to get the data from, default song_data """ # load data df = pd.read_pickle("./" + url_prefix + "data/" + name + ".pkl").drop( columns=['analysis_url', 'track_href', 'type', 'uri']) # drop "useless" columns df = df.drop(columns=['duration_ms', 'key', 'mode', 'time_signature']) # split data into train and test sets X_train, X_test, y_train, y_test = train_test_split( df.drop(columns=['name', 'id']), df['id'], test_size=0.30) # fit estimator #clf = IsolationForest(n_estimators = 500, contamination = 0.11) clf = make_pipeline(StandardScaler(), OneClassSVM(nu=0.11, gamma=0.04)) clf.fit(X_train, y_train) # Predict off test data and create array of outliers predictions = clf.predict(X_test) outliers = [id for id, predict in zip(y_test, predictions) if predict < 0] count = len(outliers) with open("./" + url_prefix + "model/" + "clf.pkl", 'wb') as fid: pickle.dump(clf, fid, 2) return jsonify({ 'success': True, "test_outliers": json.loads(df.loc[df['id'].isin(outliers)].to_json(orient='index')), "test_outliers_count": count, "test_outliers_%": count / len(y_test) * 100 })
def oneclass(c: Config): normal_traffic_array, traffic_scenario = load_normal_traffic_array(c) bridge_scenarios = [HealthyScenario()] + each_pier_scenarios(c) response_type = ResponseType.YTranslation points = [ Point(x=x, y=0, z=z) for x, z in itertools.product( np.linspace(c.bridge.x_min, c.bridge.x_max / 2, 20), np.linspace(c.bridge.z_min, c.bridge.z_max / 2, 3), ) ] results = [] for b, bridge_scenario in enumerate(bridge_scenarios): print_i(f"One class: bridge scenario {bridge_scenario.name}") responses = responses_to_traffic_array( c=c, traffic_array=normal_traffic_array, response_type=response_type, bridge_scenario=bridge_scenario, points=points, fem_runner=OSRunner(c), ).T print(len(normal_traffic_array)) print(responses.shape) # Fit on the healthy scenario. if b == 0: assert len(responses) == len(points) clfs = [] for r, rs in enumerate(responses): print_i(f"Training classifier {r} / {len(responses)}") clfs.append(OneClassSVM().fit(rs.reshape(-1, 1))) scenario_results = [] for p, _ in enumerate(points): print_i(f"Predicting points {p} / {len(points)}") prediction = clfs[p].predict(responses[p].reshape(-1, 1)) print(prediction) print(len(prediction[prediction < 0])) print(len(prediction[prediction > 0]))
def Estimators(num_estimators=100, max_samples=0.25, contamination=0.2, eps=0.2):
    ifsf = IsolationForest(max_samples=max_samples, random_state=0,
                           contamination=contamination,
                           n_estimators=num_estimators, n_jobs=-1)
    lofsf = LocalOutlierFactor(n_neighbors=15, metric='euclidean', algorithm='auto',
                               contamination=contamination, n_jobs=-1)
    ocsvm = OneClassSVM(nu=contamination, kernel="rbf", gamma=0.1)
    dbscan = DBSCAN(eps=eps, min_samples=10, metric='euclidean', algorithm='auto',
                    n_jobs=-1)
    return {"if": ifsf, "lof": lofsf, "dbs": dbscan, "svm": ocsvm}
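# Usage sketch (illustrative; the toy data is an assumption): build the estimator
# dictionary and run its OneClassSVM entry.
import numpy as np

estimators = Estimators(contamination=0.1)
demo_X = np.random.randn(150, 2)
print(estimators["svm"].fit(demo_X).predict(demo_X))   # +1 inlier / -1 outlier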
def tau_to_npy(a, domain='xs'): tau_a = np.array([]) for i in range(a.shape[1]): # print(i) temp1 = a[:, i].reshape(-1, 1) y_pred = OneClassSVM(nu=0.1).fit(temp1).predict(temp1) index = np.where(y_pred == 1)[0].tolist() length = len(index) average = np.sum(temp1[index]) / length tau_a = np.append(tau_a, average) tau_a = tau_a.reshape(1, -1) if domain == 'xs': np.save("Xs.npy", tau_a) elif domain == 'xt': np.save("Xt.npy", tau_a) elif domain == 'xs_add': np.save("Xs_add.npy", tau_a) elif domain == 'xt_add': np.save("Xt_add.npy", tau_a) else: print("data save error")
def main(): print('------------01') iris = load_iris() pca = PCA(n_components=2) data = pca.fit_transform(iris.data) print(type(data)) print(data) # nuで異常値の割合を指定。predictすると正常値=1,異常値=-1。 ocsvm = OneClassSVM(nu=0.1, gamma="auto") ocsvm.fit(data) preds = ocsvm.predict(data) print(preds) plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu) plt.show() print('------------02A') x = np.linspace(-5, 5, 500) y = np.linspace(-1.5, 1.5, 250) X, Y = np.meshgrid(x, y) print('X.ravel():') print(X.ravel()) print(X.shape) print(Y.shape) z1 = np.array([X.ravel(), Y.ravel()]) print(z1.shape) z2 = ocsvm.decision_function(np.array([X.ravel(), Y.ravel()]).T) print(z2.shape) # (250, 500) # (250, 500) # (2, 125000) # (125000,) # (250, 500) print(z2.reshape(X.shape).shape) df = ocsvm.decision_function(np.array([X.ravel(), Y.ravel()]).T).reshape(X.shape) plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu, alpha=0.8) r = max([abs(df.min()), abs(df.max())]) print('------------02B') print(df.min()) print(max([abs(df.min()), abs(df.max())])) print(df) plt.contourf(X, Y, df, 10, vmin=-r, vmax=r, cmap=plt.cm.RdBu, alpha=.5) plt.show()
def get_best_params(self, param_grid, n_iter=5): """ This function compute a GridSearchCV for different training sets inputs: n_iter: number of iterations of the GridSearchCV in different training sets param_grid: dictionary with the name and values of the parameter to change. ex: {"nu": [.2, .5, .7]} return: """ self.train_score, self.test_score = pd.DataFrame(), pd.DataFrame() self.train_score["best_nu"] = np.zeros(len(param_grid["nu"])) self.test_score["best_nu"] = np.zeros(len(param_grid["nu"])) #set index self.train_score = self.train_score.set_index(param_grid["nu"]) self.test_score = self.test_score.set_index(param_grid["nu"]) count_cv = 0 for i in range(n_iter): #self.X = self.X.sample(self.X.shape[0]) #shuffle pandas dataframe is very slow np.random.shuffle(self.X) self.ocsvm = OneClassSVM(kernel="rbf", gamma="auto") self.gsCV = GridSearchCV(self.ocsvm, param_grid=param_grid, cv=self.k_folds, scoring=self.ocsvm_score, return_train_score=True, n_jobs=3) #idd=False self.gsCV.fit(self.X, self.y) #self.cv_results["iter_"+str(i)] = self.gsCV.cv_results_ for cv in range(self.k_folds): self.train_score["score_cv_" + str(count_cv)] = self.gsCV.cv_results_[ "split" + str(cv) + "_train_score"] self.test_score["score_cv_" + str(count_cv)] = self.gsCV.cv_results_[ "split" + str(cv) + "_test_score"] count_cv += 1 self.train_score.loc[self.gsCV.best_params_["nu"], "best_nu"] += 1 self.test_score.loc[self.gsCV.best_params_["nu"], "best_nu"] += 1 return self.train_score, self.test_score
def update_event(self, input_called=-1):
    if input_called == 0:
        clf = OneClassSVM()
        if self.input(1) is not None:
            clf.set_params(**self.input(1))
        try:
            X = self.input(2)
            clf.fit(X)
        except Exception:
            # fitting is optional here; missing or malformed input data is ignored
            pass
        self.set_output_val(1, clf)
        self.exec_output(0)
def __init__(self):
    rospy.init_node('svm_imu_test')
    self.is_training = True
    rospy.Subscriber('/base_state', BaseState, self.base_state_CB, queue_size=1)
    self.pub = list()
    self.clf = list()
    for i in range(4):
        self.clf.append(OneClassSVM(nu=0.4, kernel="poly", gamma=0.4))
        self.pub.append(
            rospy.Publisher('/observer_' + str(i), sensorFusionMsg, queue_size=1))
    rospy.loginfo("Training period starting")
    rospy.Timer(rospy.Duration(30), self.timer_cb, oneshot=True)
    rospy.spin()
def outlier_rejection(X=None, y=None, method='IsolationForest', contamination=0.1):
    """This will be our function used to resample our dataset."""
    outlier_model = (
        IsolationForest(contamination=contamination),
        LocalOutlierFactor(contamination=contamination),
        OneClassSVM(nu=contamination),
        EllipticEnvelope(contamination=contamination),
    )
    outlier_model = {i.__class__.__name__: i for i in outlier_model}
    if X is None:
        return outlier_model.keys()
    model = outlier_model.get(method)
    if model is None:
        raise ValueError("method '{}' is invalid".format(method))
    y_pred = model.fit_predict(X)
    return X[y_pred == 1], y[y_pred == 1]
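# Usage sketch (illustrative; the toy data is an assumption): the method string must
# match a class name from the dictionary above, e.g. 'OneClassSVM', and rows predicted
# as outliers are dropped from both X and y.
import numpy as np

demo_X = np.vstack([np.random.randn(95, 2), np.random.randn(5, 2) + 6.0])
demo_y = np.zeros(len(demo_X))
X_clean, y_clean = outlier_rejection(demo_X, demo_y, method='OneClassSVM',
                                     contamination=0.05)
print(demo_X.shape, '->', X_clean.shape)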
def runOCSVMGridSearch(data_folder, cfg): #Gather the dataset #train_x[:len(train_x)/2] = (training) 70% of reg samples #train_x[len(train_x)/2:] = (training) 70% of facet samples #test_x[:len(test_x)/2] = (testing) 30% of reg samples #test_x[len(test_x)/2:] = (testing) 30% of facet samples train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) # PreProcess data train_x, test_x = preprocessData(train_x, test_x, "soft_scaling") parameters = {'nu': np.linspace(0.01, 1, 100)} # fit the model #clf = OneClassSVM(nu=0.95 * 0.5 + 0.05, kernel="rbf") clf = GridSearchCV(OneClassSVM(kernel="rbf"), parameters, cv=5, scoring='recall') clf.fit(train_x, train_y[:len(train_y) / 2]) #fit to normal samples only print("Best parameters set found on development set:") print(clf.best_params_) print("Grid scores on training set:") means = clf.cv_results_['mean_test_score'] stds = clf.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, clf.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) print("Classification results on test set:") y_true, y_pred = test_y, clf.predict(test_x) print y_true print y_pred print(classification_report(y_true, y_pred)) print accuracy_score(y_true, y_pred) print("accuracy: ", accuracy_score(y_true, y_pred)) print("precision: ", precision_score(y_true, y_pred)) print("recall: ", recall_score(y_true, y_pred)) print("area under curve (auc): ", roc_auc_score(y_true, y_pred))
def train_CSD_SVM(args): ''' Train a SVM outlier detector using real images :param real_img_dir: A directory contains real images :param svm_model_path: A path for saving trained model :return: ''' train_paths = list( map(lambda x: args.real_img_dir + x, os.listdir(args.real_img_dir))) logging.info("Training file paths: {}".format(len(train_paths))) train_feat = get_color_feat(train_paths) train_feat = np.squeeze(train_feat, axis=1) y_true = [1] * np.shape(train_feat)[0] # train SVM parameters = {'gamma': [0.001, 0.0001, 1 / 588, 0.01, 0.1]} svm_model = OneClassSVM(nu=0.1, kernel="rbf") clf = GridSearchCV(svm_model, parameters, cv=5, scoring='accuracy') clf.fit(train_feat, y_true) logging.info(clf.best_estimator_.get_params()) # save the model joblib.dump(clf.best_estimator_, args.svm_model_path) logging.info('model saved')
def detect_outlier(data, classifier="Robust Covariance", outlier_fraction=0.005): classifiers = { "Empirical Covariance": EllipticEnvelope(support_fraction=1., contamination=outlier_fraction), "Robust Covariance": EllipticEnvelope(contamination=outlier_fraction), "OCSVM": OneClassSVM(nu=outlier_fraction, gamma=0.05) } # colors = ['m', 'g', 'b'] legend = {} # Learn a frontier for outlier detection with several classifiers xx1, yy1 = np.meshgrid(np.linspace(5, 10, 500), np.linspace(10, 15, 500)) plt.figure(1) clf = classifiers[classifier] clf.fit(data) scores = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()]).reshape(xx1.shape) legend[classifier] = plt.contour(xx1, yy1, scores, levels=[0], linewidths=2, colors='m', linestyles='dashed') legend_key = list(legend.keys()) # Plot the results (= shape of the data points cloud) plt.figure(1) # two clusters plt.title("Identify potential outliers") plt.xlabel('log: Above grade (ground) living area square feet') plt.ylabel('log: Sales Price') plt.scatter(data.iloc[:, 0], data.iloc[:, 1], color='black') plt.xlim((xx1.min(), xx1.max())) plt.ylim((yy1.min(), yy1.max())) plt.legend([legend_key[0]]).legendHandles[0].set_color('m') plt.show()
def __init__(self, kernel, detector, labeled=True, WL_iter=5, PK_bin_width=1, LOF_n_neighbors=20, LOF_n_leaf=30, **kwargs): kernels = { 'WL': WeisfeilerLehman(n_iter=WL_iter, normalize=True, base_graph_kernel=VertexHistogram), 'PK': Propagation(t_max=WL_iter, w=PK_bin_width, normalize=True) if labeled else PropagationAttr( t_max=WL_iter, w=PK_bin_width, normalize=True), } detectors = { 'OCSVM': OneClassSVM(kernel='precomputed', nu=0.1), 'LOF': LocalOutlierFactor(n_neighbors=LOF_n_neighbors, leaf_size=LOF_n_leaf, metric='precomputed', contamination=0.1), # 'IF': current similarity forest also has problem } assert kernel in kernels.keys() assert detector in detectors.keys() self.kernel = kernels[kernel] self.detector = detectors[detector] self.kernel_name = kernel self.detector_name = detector self.labeled = labeled
def trainLocalModel(r, key, datas): r_list = datas requests = pipeline.group_requests( r_list, lambda r: '{} {}'.format(r.method, r.url)) for i, (k, v_list) in enumerate(sorted(requests.items())): d2 = pipeline.group_requests(v_list, lambda r: r.label_type) normal_request = d2.get('normal', []) anormal_request = d2.get('anormal', []) anormal_size = int(0.01 * len(normal_request)) train_list, _ = train_test_split(normal_request + anormal_request[-anormal_size:-1], random_state=RANDOM_STATE, train_size=TRAIN_SIZE) clf_svm = make_pipeline( make_union(*[class_() for class_ in TF_LIST]), OneClassSVM(kernel='sigmoid', nu=NU, gamma='auto')) clf_isolation = make_pipeline( make_union(*[class_() for class_ in TF_LIST]), IsolationForest(n_estimators=128, max_samples=400, max_features=0.7, random_state=rng)) #clf_lof = make_pipeline( # make_union(*[class_() for class_ in TF_LIST]), # LocalOutlierFactor(n_neighbors=20)) models = [clf_svm.fit(train_list), clf_isolation.fit(train_list)] #, clf_lof.fit(train_list)] return models
def fit_selected_classifier(X_data): """ Fits the selected classifier Parameters: X_data (np.ndarray) - input dataset Returns: None """ global classifier if sel_classifier == Classifier.OCSVM: #classifier = OneClassSVM(kernel='sigmoid', gamma='scale', nu=0.1).fit(X_data) classifier = OneClassSVM(gamma='scale', verbose=True).fit(X_data) if sel_classifier == Classifier.IFOREST: classifier = IsolationForest(random_state=0, contamination=0.1).fit(X_data) if sel_classifier == Classifier.EllipticEnvelope: classifier = EllipticEnvelope(random_state=0, contamination=0.1).fit(X_data) if sel_classifier == Classifier.LocalOutlierFactor: classifier = LocalOutlierFactor(contamination=0.1, novelty=True).fit(X_data)
def fit(self, params): if len(self.data) <= 1: self.DEBUG_INFO = "No samples are there!" self.changed("alert_generated") return X = np.asarray(self.data)[:,0:2] y = np.asarray(self.data)[:,2] if np.unique(y).size == 1: # only one-class self.clf = OneClassSVM(nu=params['nu'], gamma=params['gamma'], degree=params['degree'], coef0=params['coef0'], kernel=params['kernel']) self.clf.fit(X) else: self.clf = SVC(C=params['C'], gamma=params['gamma'], degree=params['degree'], coef0=params['coef0'], kernel=params['kernel']) self.clf.fit(X, y) self.is_fitted = True self.changed("model_fitted")
def __init__( self, trainable_invertible_ica, predictor_model, novelty_detector=OneClassSVM(nu=0.1, gamma="auto"), aug_max_iter: Optional[int] = None, augmentation_size: Optional[int] = None, ): """Build CausalMechanismTransfer object. Parameters ---------- trainable_invertible_ica : object Trainable invertible ICA model for estimating the mechanism function. Required to implement ``train()`` and ``inv()``. predictor_model : object Trainable predictor model to be trained on the augmented data. Needs to implement ``fit()`` and ``predict()``. aug_max_iter : int or None The maximum number of iterations for performing the augmentation. augmentation_size : int or None The size of the augmentation. Fully augmented if ``None``. Returns ---------- None : None """ self.trainable_invertible_ica = trainable_invertible_ica self.augmenter = ICATransferAugmenter( self.trainable_invertible_ica.get_invertible_ica_model(), novelty_detector=novelty_detector, max_iter=aug_max_iter) self.predictor_model = predictor_model self.augmentation_size = augmentation_size
def single_eval_one_class(X_train, X_test, y_test, species_train, args, class_list): if args.classifier == 'GaussianMixed': # define the gaussian mixed model, fit to training data and make predictions clf = GaussianMixed(class_list, threshold=args.threshold, mixmodel=True, epsilon=1e-6) clf.fit(X_train, species_train) y_preds, y_score = clf.predict(X_test) else: # define the one class svm, and fit it to the training data clf = OneClassSVM(kernel=args.classifier, gamma='auto') clf.fit(X_train) #make predictions and calculate the roc curve y_preds = clf.predict(X_test) y_score = clf.decision_function(X_test) # calculate the false and true positive rate, followed by the AUROC fpr, tpr, _ = roc_curve(y_test, y_score) roc_auc = auc(fpr, tpr) return y_preds, roc_auc, fpr, tpr
def _load_detection_models(self): self._detection_models = {} for ds_url in data_sets.DS_URL_LIST[TEST_CONFIG['DS_URL_SLICE']]: try: normal_list = self._get_from_data_server(ds_url, 'n') except ValueError as err: self.log_debug( '_load_detection_models', 'could not get req_list "{}": {}'.format(ds_url, err)) return train_list, _ = train_test_split(normal_list, random_state=RANDOM_STATE, train_size=TRAIN_SIZE) clf = make_pipeline( make_union(*[class_() for class_ in TF_LIST]), OneClassSVM(random_state=0, nu=NU, gamma=GAMMA)) clf.fit(train_list) key = str(normal_list[0]) self._detection_models[key] = clf self.log_debug('_load_detection_models', 'loaded "{}"'.format(key))
def fit_sklearn_model(embeddings, model_name, output_filename, n_neighbors=4): logger.info('final size of the collected embeddings: {}'.format( len(embeddings))) embedding_array = np.concatenate(embeddings) if model_name == 'local_outlier_factor': logger.info('using local outlier factor with n_neighbour {}'.format( n_neighbors)) clf = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True, contamination=0.1) elif model_name == 'isolation_forest': clf = IsolationForest(contamination=0.1) elif model_name == 'svm': clf = OneClassSVM(kernel='linear') else: raise ValueError('model {} not supported'.format(model_name)) clf.fit(embedding_array) logger.info('Saving OOD model to {}'.format(output_filename)) with open(output_filename, "wb") as out_stream: pickle.dump(clf, out_stream) return clf
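# Usage sketch (illustrative): fit_sklearn_model above expects a module-level `logger`
# and a list of embedding arrays; both are assumed here, and the output path is just a
# placeholder.
import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
demo_embeddings = [np.random.randn(100, 16), np.random.randn(50, 16)]
ood_model = fit_sklearn_model(demo_embeddings, 'svm', 'ood_svm.pkl')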
def __generate_probas(self, samples, resolution, affinity_matrix, number_of_questions): print( f"📞 Looks like there's a probability distribution ({self.name}) that wants to phone in an expert (that's " f"you)\n" ) clf = OneClassSVM(kernel='precomputed') samples_and_weights = {0: 0.5} for nq in range(number_of_questions): indices = list(samples_and_weights.keys()) if nq == 0: idx = np.random.choice(range(1, len(samples))) else: preds = clf.decision_function(affinity_matrix[:, indices]) idx = [i for i, _ in sorted(enumerate(preds), key=lambda x: x[1]) if i not in samples_and_weights][ 0] sample = samples[idx] print('Score the sample below with a number between 0 and 1 (higher is better)\n') if hasattr(sample, '_repr_html_'): print(sample) else: print(sample) weight = float(input('Score: ')) assert 0 <= weight <= 1 samples_and_weights[idx] = weight indices = list(samples_and_weights.keys()) clf.fit( affinity_matrix[indices, :][:, indices], sample_weight=list(samples_and_weights.values()) ) indices = list(samples_and_weights.keys()) preds = clf.decision_function(affinity_matrix[:, indices]) scores = KernelDiscretizedMethod.discretized_scores( resolution, samples, affinity_matrix, lambda mask, _idx: preds[mask].mean()) Z = logsumexp([s for s in scores.values()]) return {idx: s - Z for idx, s in scores.items()}
def main(): print "loading data" train = read_data(0) print train.shape label = np.ones(train.shape[0], ) print label.shape # train = train.tolist() # label = label.tolist() print "training the one-class SVM" model = OneClassSVM(kernel='rbf', nu=0.2, degree=3, gamma=0.009, shrinking=1) model.fit(train) print "predicting the test data" label_test = np.ones(200, ) test = read_data(1) pred1 = model.predict(train) print pred1[np.where(pred1 > 0)].sum() / pred1.shape[0] pred2 = model.predict(test) print pred2[np.where(pred2 > 0)].sum() / pred2.shape[0] # pred = np.zeros(200,) pred = [] for i in range(200): #print 'the iteration:', i, p_label[i] if pred2[i] == 1: pred.append('healthy') elif pred2[i] == -1: pred.append('dzs_1r+dzs_1l') print len(pred) np.save('corrcoef_predict_9.npy', pred)
def outlier_detection_SVM(): df_X, df_y = load_confirmed() df_X = remove_not_numeric(df_X) index = df_X[df_y == 1].index X = df_X.drop(index) y = df_y.drop(index) print(X) print(len(index), index) div = int(len(X) * 0.7) X_train = X[:div] X_test = X[div:] X_outliers = df_X.ix[index] print(X_outliers) pipe = Pipeline([("imputer", Imputer()), ("scaler", MinMaxScaler()), ("decomposition", PCA(n_components=100))]) X_train = pipe.fit_transform(X_train) clf = OneClassSVM(gamma=(1 / len(X_train)), nu=0.37) clf.fit(X_train) X_test = pipe.transform(X_test) X_outliers = pipe.transform(X_outliers) print(X_outliers) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) print("********** " + str(len(X_test)) + " " + str(len(y_pred_test))) y_pred_outliers = clf.predict(X_outliers) n_error_train = y_pred_train[y_pred_train == -1].size n_error_test = y_pred_test[y_pred_test == -1].size n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size print("train error:", n_error_train / len(y_pred_train)) print("test error:", n_error_test / len(y_pred_test)) print("outliers error:", n_error_outliers / len(y_pred_outliers)) return y_pred_train, y_pred_test, y_pred_outliers, y[div:]
def slice_probability_space_selection(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50), rho=0.05, outlier_distribution = np.random.rand, folds_count=7): kf_iterator = KFold(len(data), n_folds=folds_count) all_errors = [] for gamma in all_gammas: error = 0.0 clf = OneClassSVM(nu=nu, gamma=gamma) for train, test in kf_iterator: train_data = data[train,:] test_data = data[test,:] clf = OneClassSVM(nu=nu, gamma=gamma) clf.fit(train_data) prediction = clf.predict(test_data) inlier_metric_part = np.mean(prediction == -1) inlier_metric_part = inlier_metric_part / (1 + rho) / len(data) outliers = outlier_distribution(*data.shape) - 0.5 outliers *= 8 * np.std(data) outlier_metric_part = np.mean(clf.predict(outliers) == 1) * rho / (1 + rho) / len(outliers) error += inlier_metric_part + outlier_metric_part all_errors.append(error / folds_count) index = np.argmin(all_errors) #best_index = pd.Series(all_errors).pct_change().argmax() - 1 return int(index), all_errors
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1', ntrials=10, verbose=True, seed=123456789): """ Run a single experiment, locally. @param pct_noise: The percentage of noise to add to the dataset. @param noverlap_bits: The number of bits the base class should overlap with the novelty class. @param exp_name: The name of the experiment. @param ntrials: The number of times to repeat the experiment. @param verbose: If True print the results. @param seed: The random seed to use. @return: A tuple containing the percentage errors for the SP's training and testing results and the SVM's training and testing results, respectively. """ # Base parameters ntrain, ntest = 800, 200 nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4 clf_th = 0.5 log_dir = os.path.join(os.path.expanduser('~'), 'scratch', 'novelty_experiments', exp_name) # Configure the SP config = { 'ninputs': 100, 'trim': 1e-4, 'disable_boost': True, 'seed': seed, 'pct_active': None, 'random_permanence': True, 'pwindow': 0.5, 'global_inhibition': True, 'ncolumns': 200, 'nactive': 50, 'nsynapses': 75, 'seg_th': 15, 'syn_th': 0.5, 'pinc': 0.001, 'pdec': 0.001, 'nepochs': 10, 'log_dir': log_dir } # Seed numpy np.random.seed(seed) # Create the base dataset x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed) x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:] # Create the outlier dataset base_indexes = set(np.where(x_ds.base_class == 1)[0]) choices = [x for x in xrange(nbits) if x not in base_indexes] outlier_base = np.zeros(nbits, dtype='bool') outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits, False)] = 1 outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1 y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed) y_te = y_ds.data if verbose: print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.) print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.) 
print 'Overlap between two classes: {0}'.format(np.dot( x_ds.base_class.astype('i'), outlier_base.astype('i'))) # Metrics metrics = SPMetrics() # Get the metrics for the datasets u_x_tr = metrics.compute_uniqueness(x_tr) o_x_tr = metrics.compute_overlap(x_tr) c_x_tr = 1 - metrics.compute_distance(x_tr) u_x_te = metrics.compute_uniqueness(x_te) o_x_te = metrics.compute_overlap(x_te) c_x_te = 1 - metrics.compute_distance(x_te) u_y_te = metrics.compute_uniqueness(y_te) o_y_te = metrics.compute_overlap(y_te) c_y_te = 1 - metrics.compute_distance(y_te) # Initialize the overall results sp_x_results = np.zeros(ntrials) sp_y_results = np.zeros(ntrials) svm_x_results = np.zeros(ntrials) svm_y_results = np.zeros(ntrials) # Iterate across the trials: for i in xrange(ntrials): # Make a new seed seed2 = np.random.randint(1000000) config['seed'] = seed2 config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1) # Create the SP sp = SPRegion(**config) # Fit the SP sp.fit(x_tr) # Get the SP's output sp_x_tr = sp.predict(x_tr) sp_x_te = sp.predict(x_te) sp_y_te = sp.predict(y_te) # Get the metrics for the SP's results u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr) o_sp_x_tr = metrics.compute_overlap(sp_x_tr) c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr) u_sp_x_te = metrics.compute_uniqueness(sp_x_te) o_sp_x_te = metrics.compute_overlap(sp_x_te) c_sp_x_te = 1 - metrics.compute_distance(sp_x_te) u_sp_y_te = metrics.compute_uniqueness(sp_y_te) o_sp_y_te = metrics.compute_overlap(sp_y_te) c_sp_y_te = 1 - metrics.compute_distance(sp_y_te) # Log all of the metrics sp._log_stats('Input Base Class Train Uniqueness', u_x_tr) sp._log_stats('Input Base Class Train Overlap', o_x_tr) sp._log_stats('Input Base Class Train Correlation', c_x_tr) sp._log_stats('Input Base Class Test Uniqueness', u_x_te) sp._log_stats('Input Base Class Test Overlap', o_x_te) sp._log_stats('Input Base Class Test Correlation', c_x_te) sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te) sp._log_stats('Input Novelty Class Test Overlap', o_y_te) sp._log_stats('Input Novelty Class Test Correlation', c_y_te) sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr) sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr) sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr) sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te) sp._log_stats('SP Base Class Test Overlap', o_sp_x_te) sp._log_stats('SP Base Class Test Correlation', c_sp_x_te) sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te) sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te) sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te) # Print the results fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{5:2.4f}' if verbose: print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te' print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr, u_sp_x_te, u_sp_y_te) print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr, o_sp_x_te, o_sp_y_te) print fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te, c_sp_x_tr, c_sp_x_te, c_sp_y_te) # Get average representation of the base class sp_base_result = np.mean(sp_x_tr, 0) sp_base_result[sp_base_result >= 0.5] = 1 sp_base_result[sp_base_result < 1] = 0 # Averaged results for each metric type u_sp_base_to_x_te = 0. o_sp_base_to_x_te = 0. c_sp_base_to_x_te = 0. u_sp_base_to_y_te = 0. o_sp_base_to_y_te = 0. c_sp_base_to_y_te = 0. 
for x, y in zip(sp_x_te, sp_y_te): # Refactor xt = np.vstack((sp_base_result, x)) yt = np.vstack((sp_base_result, y)) # Compute the sums u_sp_base_to_x_te += metrics.compute_uniqueness(xt) o_sp_base_to_x_te += metrics.compute_overlap(xt) c_sp_base_to_x_te += 1 - metrics.compute_distance(xt) u_sp_base_to_y_te += metrics.compute_uniqueness(yt) o_sp_base_to_y_te += metrics.compute_overlap(yt) c_sp_base_to_y_te += 1 - metrics.compute_distance(yt) u_sp_base_to_x_te /= ntest o_sp_base_to_x_te /= ntest c_sp_base_to_x_te /= ntest u_sp_base_to_y_te /= ntest o_sp_base_to_y_te /= ntest c_sp_base_to_y_te /= ntest # Log the results sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te) sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te) sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te) sp._log_stats('Base Train to Novelty Test Uniqueness', u_sp_base_to_y_te) sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te) sp._log_stats('Base Train to Novelty Test Correlation', c_sp_base_to_y_te) # Print the results if verbose: print '\nDescription\tx_tr->x_te\tx_tr->y_te' print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te, u_sp_base_to_y_te) print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te, o_sp_base_to_y_te) print 'Correlation:\t{0:2.4f}\t{1:2.4f}'.format(c_sp_base_to_x_te, c_sp_base_to_y_te) # Create an SVM clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2) # Evaluate the SVM's performance clf.fit(x_tr) svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \ 100 svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \ 100 # Perform classification using overlap as the feature # -- The overlap must be above 50% clf_x_te = 0. clf_y_te = 0. for x, y in zip(sp_x_te, sp_y_te): # Refactor xt = np.vstack((sp_base_result, x)) yt = np.vstack((sp_base_result, y)) # Compute the accuracy xo = metrics.compute_overlap(xt) yo = metrics.compute_overlap(yt) if xo >= clf_th: clf_x_te += 1 if yo < clf_th: clf_y_te += 1 clf_x_te = (clf_x_te / ntest) * 100 clf_y_te = (clf_y_te / ntest) * 100 # Store the results as errors sp_x_results[i] = 100 - clf_x_te sp_y_results[i] = 100 - clf_y_te svm_x_results[i] = 100 - svm_x_te svm_y_results[i] = 100 - svm_y_te # Log the results sp._log_stats('SP % Correct Base Class', clf_x_te) sp._log_stats('SP % Correct Novelty Class', clf_y_te) sp._log_stats('SVM % Correct Base Class', svm_x_te) sp._log_stats('SVM % Correct Novelty Class', svm_y_te) # Print the results if verbose: print '\nSP Base Class Detection : {0:2.2f}%'.format(clf_x_te) print 'SP Novelty Class Detection : {0:2.2f}%'.format(clf_y_te) print 'SVM Base Class Detection : {0:2.2f}%'.format(svm_x_te) print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te) return sp_x_results, sp_y_results, svm_x_results, svm_y_results
def main(): usage="refine2d using simmx information " parser = EMArgumentParser(usage=usage,version=EMANVERSION) parser.add_argument("--ptcls", type=str,help="particle file", default=None) parser.add_argument("--simmx", type=str,help="simmx", default=None) parser.add_argument("--npca", type=int,help="number of pca factors", default=10) parser.add_argument("--niter", type=int,help="number of iterations", default=5) parser.add_argument("--outlier", type=float,help="outlier fraction", default=0.1) parser.add_argument("--ncls", type=int,help="number of centers", default=128) parser.add_argument("--nref", type=int,help="number of references", default=32) (options, args) = parser.parse_args() logid=E2init(sys.argv) simmxfile=options.simmx for itr in range(options.niter): ### start from the simmx print "Pre-processing simmx" e=EMData(simmxfile) pts=e.numpy().T.copy() for i in range(len(pts)): pts[i]-=np.mean(pts[i]) pts[i]/=np.std(pts[i]) pts=pts.astype(np.float).copy(); #e=from_numpy(pts.T.copy()) #e.write_image("simmx_tmp.hdf") #exit() print "Doing PCA" (nptcl, ncls) = pts.shape; #nfac=options.npca pca=PCA(options.npca) pts_pca=pca.fit_transform(pts) bs=pts_pca bs/=np.std(bs) print bs.shape,pts.shape np.savetxt("test_pca_{:02d}".format(itr),pts_pca) print "Removing outliers" outliers_fraction=options.outlier svm=OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1) svm.fit(bs) y_pred = svm.decision_function(bs).ravel() nkeep=int(len(bs)*(1-outliers_fraction)) st=np.argsort(y_pred)[::-1] st=st[:nkeep] print "Clustering" ncnt=options.ncls centroids,_ = kmeans(bs[st],ncnt) l,_ = vq(bs[st],centroids) labels=np.zeros(len(bs))-1 labels[st]=l print "Class averaging" e=EMData(1,len(labels)) for i in range(len(labels)): e.set_value_at(0,i,labels[i]) clsmxfile="clsmx_{:02d}.hdf".format(itr) e.write_image(clsmxfile) clsout="classes_{:02d}.hdf".format(itr) run("e2classaverage.py --input={} --classmx={} --output={} --force --center xform.center --iter=5 --align=rotate_translate_flip:maxshift=32 --averager=mean --keep=.6 --cmp=ccc --aligncmp=ccc --normproc=normalize --parallel=thread:12".format(options.ptcls,clsmxfile,clsout)) simmxfile="simmx_{:02d}.hdf".format(itr) run("e2simmx.py {} {} {} --align rotate_translate_flip --aligncmp ccc --cmp ccc --saveali --parallel thread:12".format(options.ptcls, clsout, simmxfile)) E2end(logid)
def decision_function(self, data):
    # OneClassSVM returns larger values for more normal points; the sign flip makes
    # larger mean more anomalous
    return -OneClassSVM.decision_function(self, data)
# indices = np.arange(X.shape[0]) # np.random.shuffle(indices) # shuffle the dataset # X = X[indices] # y = y[indices] X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] # # training only on normal data: # X_train = X_train[y_train == 0] # y_train = y_train[y_train == 0] print('OneClassSVM processing...') model = OneClassSVM(cache_size=500) tstart = time() model.fit(X_train) fit_time += time() - tstart tstart = time() scoring = -model.decision_function(X_test) # the lower,the more normal predict_time += time() - tstart fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring) if fit_time + predict_time > max_time: raise TimeoutError f = interp1d(fpr_, tpr_) tpr += f(x_axis) tpr[0] = 0.
X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] # training and testing only on normal data: X_train = X_train[y_train == 0] y_train = y_train[y_train == 0] X_test = X_test[y_test == 0] y_test = y_test[y_test == 0] # define models: iforest = IsolationForest() lof = LocalOutlierFactor(n_neighbors=20) ocsvm = OneClassSVM() lim_inf = X.min(axis=0) lim_sup = X.max(axis=0) volume_support = (lim_sup - lim_inf).prod() t = np.arange(0, 100 / volume_support, 0.01 / volume_support) axis_alpha = np.arange(alpha_min, alpha_max, 0.0001) unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features)) # fit: print('IsolationForest processing...') iforest = IsolationForest() iforest.fit(X_train) s_X_iforest = iforest.decision_function(X_test) print('LocalOutlierFactor processing...')
ax.set_xlabel( 'Ratio' ) ax.set_ylabel( 'Margin' ) ax.set_zlabel( 'Similarity of Neighboring Districts' ) ax.set_zlim( [ 0., 1. ] ) ax.set_xlim( [ 0., 500. ] ) ax.set_ylim( [ 0., 1. ] ) fig.show() angles = np.linspace(0,360,41)[:-1] # Take 20 angles between 0 and 360 rotanimate(ax, angles,'movie.gif',delay=20, width = 6., height = 5.) # do outlier search using one-class SVM data[ 0, : ] = preprocessing.scale( data[ 0, : ] ) model = OneClassSVM( gamma = .001, nu = .1 ) fit = model.fit( data ) preds = model.predict( data ) inlier = np.where( preds == 1. )[ 0 ] outlier = np.where( preds == -1. )[ 0 ] fig = plt.figure() ax = fig.add_subplot( 111, projection = '3d' ) ax.scatter( data[ inlier, 0 ], data[ inlier, 1 ], data[ inlier, 2 ], c = 'b' ) ax.scatter( data[ outlier, 0 ], data[ outlier, 1 ], data[ outlier, 2 ], c = 'k' ) ax.set_xlabel( '$P^2/A$' ) ax.set_ylabel( 'Margin' ) ax.set_zlabel( 'Similarity of Neighboring Districts' ) ax.set_ylim( [0., 1 ] )
def classifier(data): from sklearn.covariance import EllipticEnvelope from sklearn.svm import OneClassSVM from sklearn.datasets import load_boston from sklearn import preprocessing # Get data # Define "classifiers" to be used legend1 = {} legend2 = {} evaluation = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data] X = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data] X = preprocessing.scale(X) evaluation = preprocessing.scale(evaluation) # Learn a frontier for outlier detection with several classifiers sample = random.sample(X, 20000) clf = OneClassSVM(nu=.1, kernel='rbf') test = random.sample(evaluation, 2000) print >> sys.stderr, "fitting data" clf.fit(sample) print >> sys.stderr, "predicting data" Y = clf.predict(test) print >> sys.stderr, "plotting data" fig, axes = subplots() for i in range(len(test)): if Y[i] == 1: color = 'blue' else: color = 'red' axes.scatter(test[i][2], test[i][1], c=color) #ylim([50,2000]) #num exons ylabel("distance") #xlim([3,10]) xlabel("coverage") savefig("DistanceVCoverage.pdf") fig, axes = subplots() """ for i in range(len(test)): if Y[i] == 1: color = 'blue' else: color = 'red' axes.scatter(test[i][1], test[i][0], c=color) #xlim([0,10]) #num exons xlabel("number of exons") #ylim([3,15]) ylabel("coverage") savefig("ExonsvsCoverage.pdf") """ full_test = clf.predict(evaluation) novel, regular = [],[] for i in range(len(full_test)): result = full_test[i] if result == -1: print data[i]["id"] novel.append(data[i]["num_exons"]) else: regular.append(data[i]["num_exons"]) multi_exon_novel = [val for val in novel if val > 1] multi_exon_regular = [val for val in regular if val > 1] print >> sys.stderr, "novel, regular" print >> sys.stderr, len(novel), len(regular) print >> sys.stderr, mean(multi_exon_novel), mean(multi_exon_regular), len(multi_exon_novel), len(multi_exon_regular)