Example #1
class svm_model():
    def train(self, X, ker):
        self.model = OneClassSVM(kernel=ker, shrinking=True, random_state=1)
        self.model.fit(X)

    def predict(self, X):
        return self.model.predict(X)
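A minimal usage sketch for the wrapper above (not part of the original snippet); the import, the random feature matrix, and the kernel choice are illustrative assumptions, and the random_state argument used in train() requires an older scikit-learn release.

from sklearn.svm import OneClassSVM
import numpy as np

X_train = np.random.RandomState(0).randn(100, 5)  # hypothetical training features
clf = svm_model()
clf.train(X_train, 'rbf')      # fit the one-class model with an RBF kernel
labels = clf.predict(X_train)  # +1 for inliers, -1 for outliers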
Example #2
def main():
	n = 1000
	data = []
	for i in range(n):
		data.append(np.array([np.random.randint(0, 5000) for i in range(np.random.randint(20, 150))]))
	data = np.array(data)

	# making all the data into 5 dimensions
	# howto : boxplot
	x = []
	y = []
	for i in data:
		sorted_i = sorted(i)
		x.append([max(sorted_i), np.percentile(sorted_i, 75), np.median(sorted_i), np.percentile(sorted_i, 25), min(sorted_i)])
		y.append(0)
	x = np.array(x)

	'''
	# making all the data into 5 dimensions
	# howto : distance
	start = time.time()
	data_i = 0
	cnt = 1
	x = np.zeros((n, n))
	for i in data:
		data_j = data_i
		for j in data[cnt:]:
			dist = dtw(i, j, dist=lambda i, j: norm(i - j, ord=1))[0]
			x[data_i][data_j+1], x[data_j+1][data_i] = dist, dist
			data_j += 1
		cnt += 1
		data_i += 1
	end = time.time()
	print(end - start)
	'''

	# build model with x
	model = OneClassSVM()
	model.fit(x)

	# create test dataset
	test = []
	for i in range(10):
		test.append(np.array([np.random.randint(0, 10000) for i in range(np.random.randint(20000, 30000))]))
	test = np.array(test)

	# transform test dataset
	x = []
	y = []
	for i in test:
		sorted_i = sorted(i)
		x.append([max(sorted_i), np.percentile(sorted_i, 75), np.median(sorted_i), np.percentile(sorted_i, 25), min(sorted_i)])
		y.append(0)
	x = np.array(x)

	# predict test dataset
	pred = model.predict(x)

Example #3
 def fit(self, X, Y, W):
     clf = OneClassSVM(kernel=self.kernel, degree=self.degree,
                       gamma=self.gamma, coef0=self.coef0, tol=self.tol,
                       nu=self.nu, shrinking=self.shrinking,
                       cache_size=self.cache_size, max_iter=self.max_iter)
     if W is not None:
         return OneClassSVMClassifier(clf.fit(X, W.reshape(-1)))
     return OneClassSVMClassifier(clf.fit(X))
Example #4
File: learn.py Project: cmcneil/openepoc
class Cluster(object):

    def __init__(self, name):
        self.name = name
        self.raw_dataset = []
        self.dataset = []
        self.dataset_red = []
    
    def get_featurevec(self, data):
        '''Takes in data in the form of an array of EmoPackets, and outputs
            a list of feature vectors.'''
        # CHECKED, all good :)
        # integer division keeps num_bins an int so it can be passed to range()
        num_bins = (len(data) // int(dsp.SAMPLE_RATE * dsp.STAGGER) -
                    int(dsp.BIN_SIZE / dsp.STAGGER) + 1)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        starts = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        points = []
        for i in range(num_bins):
            points.append(dsp.get_features(data[i * starts:i * starts + size]))
        return points

    def add_data(self, raw):
        '''Allows the addition of new data. Will retrain upon addition.
            Expects a list of EmoPackets.'''
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        '''Does feature extraction for all of the datasets.'''
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        '''Reduces the dimension of the extracted feature vectors.'''
        X = np.array(self.dataset)
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)
        
    def train(self):
        '''Trains the classifier.'''
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        '''Says whether or not the bin is novel. Expects an array of EmoPackets'''
        # transform expects a 2-D array; use the pt argument (the original referenced an undefined `data`)
        X = self.pca.transform(np.array(self.get_featurevec(pt)[0]).reshape(1, -1))
        ans = self.svm.predict(X)
        self.dataset_red = np.vstack((self.dataset_red, X))
        self.train()
        return ans
                    
    def save(self):
        '''Saves this classifier to a data directory.'''
        this_dir, this_filename = os.path.split(__file__)
        DATA_PATH = os.path.join(this_dir, "data", self.name+'.pkl')
        dumpfile = open(DATA_PATH, "wb")
        pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
        dumpfile.close()
Example #5
 def determine_test_similarity(self, model):
     clf_OCSVM = {}
     model_OCSVM = {}
     for i in range(len(model)):
         clf = OneClassSVM(kernel='rbf', nu=0.1, gamma=.023)
         clf_OCSVM[i] = clf
         OCSVMmodel = clf.fit(model[i])
         model_OCSVM[i] = OCSVMmodel
     return clf_OCSVM, model_OCSVM
Example #6
    def runClassifier(self, _driverId, numComponents=0):
        X = self.featuresHash.values()
        self.ids = self.featuresHash.keys()
        if self.runDimRed:
            X = self.dimRed(X, numComponents)

        clf = OCSVM(nu=self.nu, gamma=self.gamma)
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(y_pred, 100 * self.outliers_fraction)
        self.label = y_pred > threshold
        self.label = map(int, self.label)
Example #7
def select_best_support_vectors(data, nu=0.01, all_gammas=2 ** np.arange(-10, 10, 1)):
    all_errors = []
    for gamma in all_gammas:
        clf = OneClassSVM(nu=nu, gamma=gamma)
        clf.fit(data)
        prediction = clf.predict(data)
        out_of_class_count = np.sum(prediction == -1)
        support_vectors_count = len(clf.support_vectors_)
        error = (float(out_of_class_count) / len(data) - nu) ** 2
        error += (float(support_vectors_count) / len(data) - nu) ** 2
        all_errors.append(error)
    index = np.argmin(all_errors)
    return all_gammas[index], all_errors
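A short, hypothetical driver for the gamma search above; the data matrix is synthetic and the imports are assumed to be available in the original module.

from sklearn.svm import OneClassSVM
import numpy as np

data = np.random.RandomState(0).randn(200, 2)            # hypothetical 2-D samples
best_gamma, errors = select_best_support_vectors(data, nu=0.01)
clf = OneClassSVM(nu=0.01, gamma=best_gamma).fit(data)   # refit with the selected gamma
print(best_gamma, min(errors))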
Example #8
def embed_dat_matrix_two_dimensions(low_dimension_data_matrix,
                                    y=None,
                                    labels=None,
                                    density_colormap='Blues',
                                    instance_colormap='YlOrRd'):
    from sklearn.preprocessing import scale
    low_dimension_data_matrix = scale(low_dimension_data_matrix)
    # make mesh
    x_min, x_max = low_dimension_data_matrix[:, 0].min(), low_dimension_data_matrix[:, 0].max()
    y_min, y_max = low_dimension_data_matrix[:, 1].min(), low_dimension_data_matrix[:, 1].max()
    step_num = 50
    h = min((x_max - x_min) / step_num, (y_max - y_min) / step_num)  # step size in the mesh
    b = h * 10  # border size
    x_min, x_max = low_dimension_data_matrix[:, 0].min() - b, low_dimension_data_matrix[:, 0].max() + b
    y_min, y_max = low_dimension_data_matrix[:, 1].min() - b, low_dimension_data_matrix[:, 1].max() + b
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # induce a one class model to estimate densities
    from sklearn.svm import OneClassSVM
    gamma = max(x_max - x_min, y_max - y_min)
    clf = OneClassSVM(gamma=gamma, nu=0.1)
    clf.fit(low_dimension_data_matrix)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, m_max] . [y_min, y_max].
    if hasattr(clf, "decision_function"):
        score_matrix = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        score_matrix = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot
    levels = np.linspace(min(score_matrix), max(score_matrix), 40)
    score_matrix = score_matrix.reshape(xx.shape)

    if y is None:
        y = 'white'

    plt.contourf(xx, yy, score_matrix, cmap=plt.get_cmap(density_colormap), alpha=0.9, levels=levels)
    plt.scatter(low_dimension_data_matrix[:, 0], low_dimension_data_matrix[:, 1],
                alpha=.5,
                s=70,
                edgecolors='gray',
                c=y,
                cmap=plt.get_cmap(instance_colormap))
    # labels
    if labels is not None:
        for id in range(low_dimension_data_matrix.shape[0]):
            label = labels[id]
            x = low_dimension_data_matrix[id, 0]
            y = low_dimension_data_matrix[id, 1]
            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
Example #9
def outlier_detect(data_frame):
    #pandas to numpy - digestible by scikit
    columns = ['blm_tag_count','protest_count','justice_count','riot_count','breathe_count']
    features = data_frame[list(columns)].values

    clf = OneClassSVM(nu=0.008, gamma=0.05)
    clf.fit(features)
    y_pred = clf.predict(features)

    mask = y_pred == -1
    oak_array = np.asarray(data_frame.hourly)
    protest_predict = oak_array[mask]
    protest_hours = list(protest_predict)
    
    return protest_hours
Example #10
File: svm.py Project: bondarchukYV/AD
def svm(data, fraction=0.05, kernel='poly', degree=3, gamma=0, coeff=0):
    svm = OneClassSVM(kernel=kernel, degree=degree, gamma=gamma, nu=fraction, coef0=coeff)
    svm.fit(data)

    score = svm.predict(data)
    numeration = [[i] for i in xrange(1, len(data)+1, 1)]
    numeration = np.array(numeration)
    # pair each 1-based row number with its prediction, then keep the anomalies (score == -1);
    # the original loop compared the whole array and never updated `anomalies`
    y = np.hstack((numeration, score.reshape(-1, 1)))
    anomalies = numeration[score == -1]

    return anomalies
Example #11
def select_best_outlier_fraction_cross_val(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50), folds_count=7):
    all_errors = []
    kf_iterator = KFold(len(data), n_folds=folds_count)
    for gamma in all_gammas:
        error = 0
        for train, test in kf_iterator:
            train_data = data[train,:]
            test_data = data[test,:]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            outlier_fraction = np.mean(prediction == -1)
            error += (nu - outlier_fraction) ** 2 + (float(clf.support_vectors_.shape[0]) / len(data) - nu) ** 2
        all_errors.append(error / folds_count)
    best_index = np.argmin(all_errors)
    return int(best_index), all_errors
Example #12
class OneClassSVMDetector(BaseOutlier):
    @staticmethod
    def get_attributes():
        return {
            "nu":0.1,
            "kernel":['rbf','linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
            "gamma":0.1,
        }
    def __init__(self,nu=0.1,kernel='rbf',gamma=0.1):
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma
    def fit(self,data=None):
        self.data = data
        self.check_finite(data)
        if(self._is_using_pandas(data)==True):
            self.data.interpolate(inplace=True)
        # self.datareshap = data.reshape(-1,1)
        self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel, gamma=self.gamma)
        self.clf.fit(data.reshape(-1,1))
        # print "done"
        return self
    def predict(self, X_test):
        y_pred_train = self.clf.predict(X_test.reshape(-1,1))

        outlier_idx = np.where(y_pred_train == -1)
        inlier_idx = np.where(y_pred_train == 1)
        d = {
            'timestamp': self.data.index[outlier_idx],
            'anoms': self.data.iloc[outlier_idx]
        }
        anoms = pd.DataFrame(d)
        self.anomaly_idx = anoms.index
        self.anom_val = anoms['anoms']
        return anoms
    def fit_predict(self, data=None):
        self.fit(data)
        return self.predict(data)
    def plot(self):
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(1, 1)
        ax.plot(self.data, 'b')
        ax.plot(self.anomaly_idx, self.anom_val, 'ro')
        ax.set_title('Detected Anomalies')
        ax.set_ylabel('Count')
        f.tight_layout()
        return f
Example #13
def cross_validate():
    #for tinkering with the model
    #read data
    all_df = pd.read_csv('./data/train.csv',index_col = 'ID')

    #split data
    zeros_df = all_df[all_df.TARGET == 0]
    ones_df = all_df[all_df.TARGET == 1]
    
    num_ones = ones_df.shape[0]
    msk = np.random.permutation(len(zeros_df)) < num_ones
    
    zeros_train_df = zeros_df[~msk]
    zeros_test_df = zeros_df[msk]


    ones_test_df = ones_df
    
    train_df = zeros_train_df
    test_df = pd.concat([zeros_test_df,ones_test_df])
    
    train_X = np.array(train_df.drop('TARGET', axis = 1))
    train_Y = np.array(train_df.TARGET)
    
    test_X = np.array(test_df.drop('TARGET',axis = 1))
    test_Y = np.array(test_df.TARGET) #true target values
    
    
    #init svm 
    print('training svm')
    my_svm = OneClassSVM(verbose = True)
    my_svm.fit(train_X)
    
    
    #predict (OneClassSVM returns -1/+1, so map outliers (-1) to 1 and inliers to 0 to match TARGET)
    print('predicting')
    predictions = (my_svm.predict(test_X) == -1).astype(int)
    
    

    conf_matrix = confusion_matrix(test_Y,predictions)
    print('confusion matrix:')
    print(pd.DataFrame(conf_matrix,columns = [0,1]))
    
    print('accuracy:')
    print(sum(test_Y.reshape(predictions.shape) == predictions)/len(test_Y))
Example #14
  def find_anomaly(label1, label2, winsize):
    print("Find anomaly in channel", label1 + '-' + label2 + '...', file=sys.stderr)
    print("-"*80)
    print("Channel [" + label1 + '-' + label2 + ']')
    print("-"*80)

    # find difference
    electrode1 = eeg.chan_lab.index(label1)
    electrode2 = eeg.chan_lab.index(label2)
    wave = eeg.X[electrode1] - eeg.X[electrode2]

    # # import random
    # wave = [random.uniform(-20,20) for _ in range(400*30)] + [random.uniform(-2000,2000) for _ in range(5*30)]
    # wave = np.array(wave)

    print("Splitting into windows...", file=sys.stderr)
    wave_windows = np.array_split(wave, len(wave)/eeg.sample_rate/winsize)
    # wave_windows = np.array_split(wave, len(wave)/winsize)

    print("Extracting features...", file=sys.stderr)
    def extract_features(wave_window): 
      max_val = max(wave_window)
      min_val = min(wave_window)
      stdev = np.std(wave_window)
      sum_val = sum(wave_window)
      sum_pos_val = sum([x for x in wave_window if x > 0])
      sum_abs_val = sum([abs(x) for x in wave_window])
      return [max_val, min_val, stdev, sum_val, sum_pos_val, sum_abs_val]

    Examples = np.array(list(map(extract_features, wave_windows)))

    print("Training model, assuming no more than", CONTAMINATION, "anomaly...", file=sys.stderr)
    od = OneClassSVM(nu=CONTAMINATION, kernel='poly', gamma=0.05, max_iter=100000)
    od.fit(Examples)

    decisions = od.decision_function(Examples)
    # print decisions
    # print max(decisions), min(decisions)

    print("Most likely windows with anomaly:")
    # find most likely windows, in desc order
    largest_indices = np.argsort((-np.absolute(decisions)).ravel())[:20]
    for large_index in largest_indices:
      print(large_index*winsize/60, "min (score:", decisions[large_index][0], ")")

    sys.stdout.flush()
Example #15
	def remove_outliers_SVM(self):
		## Remove outliers using a OneClassSVM method

		print "Running SVM to remove outliers..."

		svm = OneClassSVM(kernel='rbf', nu=0.1, degree=3, verbose=1)
		fit = svm.fit(self.DataArray)
		decision = svm.decision_function(self.DataArray)
		_indices = []

		# If a value is below the decision hyperplane, eliminate it
		for i in range(len(decision)):
			if decision[i] < 0:
				_indices.append(i)
		print self.DataArray.shape
		self.DataArray = np.delete(self.DataArray, _indices, axis=0)
		self.TargetArray = np.delete(self.TargetArray, _indices, axis=0)
		print self.DataArray.shape
Example #16
def plot_scatter(X_dict, y_dict, col1, col2, max_error, max_filled_gap, insens, 
        f_colors = ['yellow', 'red', 'blue'], nu=0.98, high=0.95):

    planes = sorted(X_dict.keys())
    planes_with_failures = sorted([key for key in X_dict.keys() if y_dict[key].sum()>0])

    ocsvm = OneClassSVM(kernel='linear', nu=0.98)
    X_train = pd.concat(dict([(plane, X_dict[plane][[col1, col2]].dropna()) 
                              for plane in planes_with_failures]))
    ocsvm.fit(X_train.values)

    qb = QuantileBinarizer(low=0.0, high=0.95, each_side=False)
    qb.fit(X_train)

    mask_pref = pd.concat(dict(
            [(plane, get_mask_pref(y_dict[plane], max_error)) for plane in planes]), axis=0)
    mask_norm = pd.concat(dict(
            [(plane, get_mask_norm(y_dict[plane], max_error, insens)) for plane in planes]), axis=0) 

    fig = plt.figure(figsize=(15,15), dpi=100)
    # plt.xlabel('Norm of res. phase: %s, group: %s' % (col1[0], str(col_groups[col1[0]][int(col1[1][-1])])))
    # plt.ylabel('Norm of res. phase: %s, group: %s' % (col2[0], str(col_groups[col2[0]][int(col2[1][-1])])))
    plt.xlabel(col1)
    plt.ylabel(col2)

    plot_norm = plt.scatter(pd.concat(X_dict)[col1].loc[mask_norm], 
                pd.concat(X_dict)[col2].loc[mask_norm], c='lightgrey', zorder=1, s=6)
    plot_pref = []
    for i, plane in enumerate(planes_with_failures):        
        plot_pref.append(plt.scatter(X_dict[plane][col1].loc[get_mask_pref(y_dict[plane], max_error)], 
                    X_dict[plane][col2].loc[get_mask_pref(y_dict[plane], max_error)], 
                        c=f_colors[i], zorder=2, s=30))
    x_min, x_max, y_min, y_max = plt.axis('tight')

    plt.axvline(qb._thresholds[col1]['high'], c='green')
    plt.axhline(qb._thresholds[col2]['high'], c='green')
    plot_line = plt.plot([x_min, x_max], 
                         [(ocsvm.intercept_ - ocsvm.coef_[0][0] * x_min) / ocsvm.coef_[0][1],
                          (ocsvm.intercept_ - ocsvm.coef_[0][0] * x_max) / ocsvm.coef_[0][1]],
                         c='red')

    # # plt.legend((plot_norm, plot_pref), ('No-failure', 'Pre-failure'),
    # #            scatterpoints=1, loc='upper right', ncol=1)
    # #plt.savefig('./scatter/pair_group_of_fours3.png')
Example #17
 def fit(self,data=None):
     self.data = data
     self.check_finite(data)
     if(self._is_using_pandas(data)==True):
         self.data.interpolate(inplace=True)
     # self.datareshap = data.reshape(-1,1)
     self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel, gamma=self.gamma)
     self.clf.fit(data.reshape(-1,1))
     # print "done"
     return self
Example #18
 def predict_header_features(self, pkt_featurizer):
     group_id = pkt_featurizer.pkt_type
     features = pkt_featurizer.features
     arrival_time = pkt_featurizer.arrival_time
     try:
         vectorizer = DictVectorizer()
         vectorizer.fit(self.training_data[group_id])
         training_data_vectorized = vectorizer.transform(self.training_data[group_id])
         features_vectorized = vectorizer.transform(features)
         scaler = preprocessing.StandardScaler(with_mean=False)
         training_data_vectorized = scaler.fit_transform(training_data_vectorized)
         features_vectorized = scaler.transform(features_vectorized)
         classifier = OneClassSVM()
         classifier.fit(training_data_vectorized)
         result = classifier.predict(features_vectorized)
         distance = classifier.decision_function(features_vectorized)
     except KeyError:
         result = 0
         distance = 0
     return result, distance
Example #19
class TwoStage(object):

    def __init__(self, *args, **kwargs):
        super(TwoStage, self).__init__(*args, **kwargs)
        self._oneCls = OneClassSVM(nu=NU, gamma=GAMMA)
        self._clf = RandomForestClassifier(n_estimators=30)
        self._scaler = StandardScaler()

    def fit(self, data, labels):
        sdata = self._scaler.fit_transform(data)
        self._oneCls.fit(sdata)
        self._clf.fit(sdata, labels)
        return self

    def predict(self, data):
        sdata = self._scaler.transform(data)
        is_known_cls = self._oneCls.predict(sdata)
        cls = self._clf.predict(sdata)
        cls[is_known_cls == -1] = "zother"        
        classes = list(self._clf.classes_) + ["zother"]
        return cls, classes
Example #20
class NoveltySeparator(BaseEstimator):

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # lets treat users spending something in the rest of the month as outliers
        inliers = y - X[:, 0]
        inliers = np.where(inliers < 0.1, True, False)

        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])
        # predicted
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        self.est_outliers.fit(X[outliers], y[outliers])

    def predict(self, X):

        y = np.zeros(X.shape[0])

        labels = self.detector.predict(X).reshape(X.shape[0])
        inliers = labels == 1
        outliers = labels == -1

        y[inliers] = self.est_inliers.predict(X[inliers])
        y[outliers] = self.est_outliers.predict(X[outliers])

        return y
Example #21
 def predict_pkt_length_features(self, pkt_featurizer):
     group_id = pkt_featurizer.pkt_type
     try:
         dbscan = DBSCAN()
         pkt_lengths = np.array(list(self.pkt_lengths[group_id])+[pkt_featurizer.len_bytes]).reshape(-1,1)
         labels = dbscan.fit_predict(pkt_lengths)
         dbscan_prediction = labels[-1] == -1
         if self.plot:
             self.plot_1d_dbscan(pkt_lengths, labels, range(len(pkt_lengths)), self.pkt_lengths_fig_dbscan, 
                                 "", "Pkt Length", "Pkt Length DBSCAN Clustering - Anomalous Pkts in Black")
         one_class_svm = OneClassSVM()
         scaler = preprocessing.StandardScaler()
         pkt_lengths_scaled = scaler.fit_transform(np.array(self.pkt_lengths[group_id]).reshape(-1,1))
         features_scaled = scaler.transform(np.array(pkt_featurizer.len_bytes).reshape(1,-1))
         one_class_svm.fit(pkt_lengths_scaled)
         svm_prediction = one_class_svm.predict(features_scaled)
         if self.plot and len(pkt_lengths_scaled) > 2:
             self.plot_1d_svm(self.pkt_lengths[group_id], one_class_svm, range(len(self.pkt_lengths[group_id])), scaler, self.pkt_lengths_fig_svm,  
                              "Pkt", "Pkt Length", "Pkt Length One Class SVM Classification")
     except (KeyError, IndexError) as e:
         print e
         dbscan_prediction = 0
     return dbscan_prediction
Example #22
def check_authors_vocabulary(category):
    """Use 80% of the authors as training set and the rest as test set. A good score validates the
    assumption that there is a defined vocabulary for a given category.
    """
    print category
    with open('%s_tweets.json' % category, 'r') as f:
        tweets = json.load(f)

    tweets_by_author = defaultdict(list)
    for tweet in tweets['tweets']:
        tweets_by_author[tweet['author_name']].append(tweet)

    authors = tweets_by_author.keys()
    training_set_count = len(authors) * 80 / 100

    training_authors = random.sample(authors, training_set_count)
    test_authors = list(set(authors) - set(training_authors))

    train_set = []
    test_set = []
    for author, tweets in tweets_by_author.items():
        if author in training_authors:
            train_set.extend([prepare_tweet(t['text']) for t in tweets])
        else:
            test_set.extend([prepare_tweet(t['text']) for t in tweets])

    vectorizer = CountVectorizer(
        max_features=10000,
        #stop_words='english',
        max_df=0.7)
    classifier = OneClassSVM()
    text_clf = Pipeline([('vect', vectorizer),
                         ('tfidf',
                          TfidfTransformer(sublinear_tf=True, norm='l2')),
                         ('clf', classifier)])

    text_clf = text_clf.fit(train_set)

    predicted = text_clf.predict(test_set)

    print np.mean(predicted == 1)
    print classification_report([1 for _ in range(len(test_set))], predicted)
Example #23
    def __init__(self,
                 embedder,
                 detector,
                 G2V_nhid=128,
                 G2V_wl_iter=2,
                 FGSD_hist_bins=200,
                 IF_n_trees=200,
                 IF_sample_ratio=0.5,
                 LOF_n_neighbors=20,
                 LOF_n_leaf=30,
                 normalize_embedding=False,
                 **kwargs):
        embedders = {
            'Graph2Vec':
            Graph2Vec(wl_iterations=G2V_wl_iter,
                      dimensions=G2V_nhid,
                      attributed=True,
                      epochs=50),
            'FGSD':
            FGSD(hist_bins=FGSD_hist_bins, hist_range=20)
        }
        detectors = {
            'IF':
            IsolationForest(n_estimators=IF_n_trees,
                            max_samples=IF_sample_ratio,
                            contamination=0.1),
            'LOF':
            LocalOutlierFactor(n_neighbors=LOF_n_neighbors,
                               leaf_size=LOF_n_leaf,
                               contamination=0.1),
            'OCSVM':
            OneClassSVM(gamma='scale', nu=0.1)
        }

        assert embedder in embedders.keys()
        assert detector in detectors.keys()

        self.embedder = embedders[embedder]
        self.detector = detectors[detector]
        self.embedder_name = embedder
        self.detector_name = detector
        self.normalize_embedding = normalize_embedding
Example #24
    def thread_monitoring_pre_train(self):
        ########################################################
        ## Normalize and apply PCA to the training data
        result = self.pca.fit_transform(
            self.scaler.fit_transform(self.anomaly_data))

        ## First element of the Tuple is True or False either if it is a neighbourhood-based method or not
        self.anomaly_algorithms[0][2] = EllipticEnvelope(
            support_fraction=1, contamination=self.contamination)
        self.anomaly_algorithms[1][2] = DBSCAN(eps=self.avg_dist(
            result[:, 0], result[:, 1]),
                                               metric='euclidean',
                                               min_samples=2)
        self.anomaly_algorithms[2][2] = OneClassSVM(kernel='rbf',
                                                    nu=self.contamination,
                                                    gamma=0.05)

        ########################################################
        ## Predict outliers - use DBSCAN (unsupervised technique) for first fitering of outliers
        ## get predictions for all training data
        DBSCAN_index = 1
        predictions_temp = self.anomaly_algorithms[DBSCAN_index][
            2].fit_predict(result)

        #########################################################
        ## Filter data - for each element of the training data
        filtered_anomaly = np.array([])
        for temp_i in np.arange(len(self.anomaly_data)):
            ## If sample is not outlier
            if predictions_temp[temp_i] != -1:
                if len(filtered_anomaly) == 0:
                    filtered_anomaly = self.anomaly_data[temp_i]
                else:
                    filtered_anomaly = np.vstack(
                        (filtered_anomaly, self.anomaly_data[temp_i]))

        ##########################################################
        ## Update data
        self.anomaly_data = filtered_anomaly

        ## Train algorithms
        self.thread_monitoring_train()
Example #25
def test():
    np.random.seed(42)
    dataset = datasets.load_iris()

    test_indices = np.random.choice(150, 10)
    test_set = dataset.data[test_indices, :]
    test_set = np.vstack((test_set, np.array([[0.0, 0.0, 0.0, 0.0]])))
    test_set = np.vstack((test_set, np.array([[10.0, 10.0, 10.0, 10.0]])))

    print "GMM"
    gmm = GMM(n_components=3, covariance_type='diag')
    bc_gmm = BackgroundCheck(estimator=gmm, mu=0.0, m=1.0)
    bc_gmm.fit(dataset.data)
    print bc_gmm.predict_proba(test_set)

    print "OneClassSVM"
    sv = OneClassSVM()
    bc_sv = BackgroundCheck(estimator=sv, mu=0.0, m=1.0)
    bc_sv.fit(dataset.data)
    print bc_sv.predict_proba(test_set)
Example #26
    def fit(self, X, y):
        # lets treat users spending something in the rest of the month as outliers
        inliers = y - X[:, 0]
        inliers = np.where(inliers < 0.1, True, False)

        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])
        # predicted
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        self.est_outliers.fit(X[outliers], y[outliers])
Example #27
def COSVM(training_data, testing_data, nu_list, kernel_list):
    # Build SVM model
    clf = OneClassSVM(nu=nu_list, kernel=kernel_list, gamma=0.1)
    clf.fit(training_data)
    y_pred_test = clf.predict(testing_data)
    n_error_test = y_pred_test[y_pred_test == -1].size
    testing_accuracy = 1 - 1.0 * n_error_test / testing_data.shape[0]
    #
    #    print(n_error_test)
    #    print('final accuracy on testing data: ', testing_accuracy, '\n')

    test_score = clf.decision_function(testing_data)
    test_score = test_score.reshape(-1)

    return test_score
Example #28
    def model(self, 
              nu = [0.001, 0.01, 0.001], 
              contamination = [0.001, 0.001, 0.001]):

        if self.verbose: print(datetime.now(), 'the model is being created ...')
            
        self.ocsvm_rbf = OneClassSVM(gamma = 'scale', kernel = 'rbf', nu = nu[0]) 
        self.ocsvm_sigmoid = OneClassSVM(gamma = 'auto', kernel = 'sigmoid', nu = nu[1]) 
        self.ocsvm_linear = OneClassSVM(kernel = 'linear', nu = nu[2]) 

        self.ifo = IsolationForest(contamination = contamination[0])  
        self.lof = LocalOutlierFactor(contamination = contamination[1], novelty = True)
        self.ee = EllipticEnvelope(contamination = contamination[2])
        
        if self.verbose: print(datetime.now(), 'the model is ready.')
Example #29
def data_learn(name="song_data"):
    """GET endpoint for training model

    Arguments:
    name -- the name of file to get the data from, default song_data
    """
    # load data
    df = pd.read_pickle("./" + url_prefix + "data/" + name + ".pkl").drop(
        columns=['analysis_url', 'track_href', 'type', 'uri'])

    # drop "useless" columns
    df = df.drop(columns=['duration_ms', 'key', 'mode', 'time_signature'])

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=['name', 'id']), df['id'], test_size=0.30)

    # fit estimator
    #clf = IsolationForest(n_estimators = 500, contamination = 0.11)

    clf = make_pipeline(StandardScaler(), OneClassSVM(nu=0.11, gamma=0.04))
    clf.fit(X_train, y_train)

    # Predict off test data and create array of outliers
    predictions = clf.predict(X_test)
    outliers = [id for id, predict in zip(y_test, predictions) if predict < 0]
    count = len(outliers)

    with open("./" + url_prefix + "model/" + "clf.pkl", 'wb') as fid:
        pickle.dump(clf, fid, 2)

    return jsonify({
        'success':
        True,
        "test_outliers":
        json.loads(df.loc[df['id'].isin(outliers)].to_json(orient='index')),
        "test_outliers_count":
        count,
        "test_outliers_%":
        count / len(y_test) * 100
    })
Example #30
def oneclass(c: Config):
    normal_traffic_array, traffic_scenario = load_normal_traffic_array(c)
    bridge_scenarios = [HealthyScenario()] + each_pier_scenarios(c)
    response_type = ResponseType.YTranslation
    points = [
        Point(x=x, y=0, z=z)
        for x, z in itertools.product(
            np.linspace(c.bridge.x_min, c.bridge.x_max / 2, 20),
            np.linspace(c.bridge.z_min, c.bridge.z_max / 2, 3),
        )
    ]
    results = []

    for b, bridge_scenario in enumerate(bridge_scenarios):
        print_i(f"One class: bridge scenario {bridge_scenario.name}")
        responses = responses_to_traffic_array(
            c=c,
            traffic_array=normal_traffic_array,
            response_type=response_type,
            bridge_scenario=bridge_scenario,
            points=points,
            fem_runner=OSRunner(c),
        ).T
        print(len(normal_traffic_array))
        print(responses.shape)

        # Fit on the healthy scenario.
        if b == 0:
            assert len(responses) == len(points)
            clfs = []
            for r, rs in enumerate(responses):
                print_i(f"Training classifier {r} / {len(responses)}")
                clfs.append(OneClassSVM().fit(rs.reshape(-1, 1)))

        scenario_results = []
        for p, _ in enumerate(points):
            print_i(f"Predicting points {p} / {len(points)}")
            prediction = clfs[p].predict(responses[p].reshape(-1, 1))
            print(prediction)
            print(len(prediction[prediction < 0]))
            print(len(prediction[prediction > 0]))
Example #31
def Estimators(num_estimators=100,
               max_samples=0.25,
               contamination=0.2,
               eps=0.2):
    ifsf = IsolationForest(max_samples=max_samples,
                           random_state=0,
                           contamination=contamination,
                           n_estimators=num_estimators,
                           n_jobs=-1)
    lofsf = LocalOutlierFactor(n_neighbors=15,
                               metric='euclidean',
                               algorithm='auto',
                               contamination=contamination,
                               n_jobs=-1)
    ocsvm = OneClassSVM(nu=contamination, kernel="rbf", gamma=0.1)
    dbscan = DBSCAN(eps=eps,
                    min_samples=10,
                    metric='euclidean',
                    algorithm='auto',
                    n_jobs=-1)
    return {"if": ifsf, "lof": lofsf, "dbs": dbscan, "svm": ocsvm}
Example #32
def tau_to_npy(a, domain='xs'):
    tau_a = np.array([])
    for i in range(a.shape[1]):
        # print(i)
        temp1 = a[:, i].reshape(-1, 1)
        y_pred = OneClassSVM(nu=0.1).fit(temp1).predict(temp1)
        index = np.where(y_pred == 1)[0].tolist()
        length = len(index)
        average = np.sum(temp1[index]) / length
        tau_a = np.append(tau_a, average)
    tau_a = tau_a.reshape(1, -1)
    if domain == 'xs':
        np.save("Xs.npy", tau_a)
    elif domain == 'xt':
        np.save("Xt.npy", tau_a)
    elif domain == 'xs_add':
        np.save("Xs_add.npy", tau_a)
    elif domain == 'xt_add':
        np.save("Xt_add.npy", tau_a)
    else:
        print("data save error")
Example #33
def main():
    print('------------01')
    iris = load_iris()
    pca = PCA(n_components=2)
    data = pca.fit_transform(iris.data)
    print(type(data))
    print(data)
    # nu sets the expected fraction of outliers; predict() returns 1 for inliers and -1 for outliers.
    ocsvm = OneClassSVM(nu=0.1, gamma="auto")
    ocsvm.fit(data)
    preds = ocsvm.predict(data)
    print(preds)
    plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu)
    plt.show()

    print('------------02A')
    x = np.linspace(-5, 5, 500)
    y = np.linspace(-1.5, 1.5, 250)
    X, Y = np.meshgrid(x, y)
    print('X.ravel():')
    print(X.ravel())
    print(X.shape)
    print(Y.shape)
    z1 = np.array([X.ravel(), Y.ravel()])
    print(z1.shape)
    z2 = ocsvm.decision_function(np.array([X.ravel(), Y.ravel()]).T)
    print(z2.shape)
    # (250, 500)
    # (250, 500)
    # (2, 125000)
    # (125000,)
    # (250, 500)
    print(z2.reshape(X.shape).shape)
    df = ocsvm.decision_function(np.array([X.ravel(),
                                           Y.ravel()]).T).reshape(X.shape)
    plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu, alpha=0.8)
    r = max([abs(df.min()), abs(df.max())])
    print('------------02B')
    print(df.min())
    print(max([abs(df.min()), abs(df.max())]))
    print(df)
    plt.contourf(X, Y, df, 10, vmin=-r, vmax=r, cmap=plt.cm.RdBu, alpha=.5)
    plt.show()
Example #34
 def get_best_params(self, param_grid, n_iter=5):
     """
     This function compute a GridSearchCV for different training sets
     inputs:
         n_iter: number of iterations of the GridSearchCV in different training sets
         param_grid: dictionary with the name and values of the parameter to change.
                     ex: {"nu": [.2, .5, .7]}
      return:
          self.train_score, self.test_score: DataFrames of per-fold train/test scores,
          plus a "best_nu" column counting how often each nu value was selected as best.
      """
     self.train_score, self.test_score = pd.DataFrame(), pd.DataFrame()
     self.train_score["best_nu"] = np.zeros(len(param_grid["nu"]))
     self.test_score["best_nu"] = np.zeros(len(param_grid["nu"]))
     #set index
     self.train_score = self.train_score.set_index(param_grid["nu"])
     self.test_score = self.test_score.set_index(param_grid["nu"])
     count_cv = 0
     for i in range(n_iter):
         #self.X = self.X.sample(self.X.shape[0]) #shuffle pandas dataframe is very slow
         np.random.shuffle(self.X)
         self.ocsvm = OneClassSVM(kernel="rbf", gamma="auto")
         self.gsCV = GridSearchCV(self.ocsvm,
                                  param_grid=param_grid,
                                  cv=self.k_folds,
                                  scoring=self.ocsvm_score,
                                  return_train_score=True,
                                  n_jobs=3)  #idd=False
         self.gsCV.fit(self.X, self.y)
         #self.cv_results["iter_"+str(i)] = self.gsCV.cv_results_
         for cv in range(self.k_folds):
             self.train_score["score_cv_" +
                              str(count_cv)] = self.gsCV.cv_results_[
                                  "split" + str(cv) + "_train_score"]
             self.test_score["score_cv_" +
                             str(count_cv)] = self.gsCV.cv_results_[
                                 "split" + str(cv) + "_test_score"]
             count_cv += 1
         self.train_score.loc[self.gsCV.best_params_["nu"], "best_nu"] += 1
         self.test_score.loc[self.gsCV.best_params_["nu"], "best_nu"] += 1
     return self.train_score, self.test_score
Example #35
    def update_event(self, input_called=-1):
        if input_called == 0:
            clf = OneClassSVM()
            if self.input(1) != None:
                clf.set_params(**self.input(1))
            
            try:
                X = self.input(2)

                clf.fit(X)
            except:
                pass
            
            self.set_output_val(1, clf)

            self.exec_output(0)
Example #36
    def __init__(self):
        rospy.init_node('svm_imu_test')
        self.is_training = True

        rospy.Subscriber('/base_state',
                         BaseState,
                         self.base_state_CB,
                         queue_size=1)

        self.pub = list()
        self.clf = list()

        for i in range(4):
            self.clf.append(OneClassSVM(nu=0.4, kernel="poly", gamma=0.4))
            self.pub.append(
                rospy.Publisher('/observer_' + str(i),
                                sensorFusionMsg,
                                queue_size=1))

        rospy.loginfo("Training period starting")
        rospy.Timer(rospy.Duration(30), self.timer_cb, oneshot=True)
        rospy.spin()
Example #37
def outlier_rejection(X=None,
                      y=None,
                      method='IsolationForest',
                      contamination=0.1):
    """This will be our function used to resample our dataset.
    """
    outlier_model = (
        IsolationForest(contamination=contamination),
        LocalOutlierFactor(contamination=contamination),
        OneClassSVM(nu=contamination),
        EllipticEnvelope(contamination=contamination),
    )

    outlier_model = {i.__class__.__name__: i for i in outlier_model}

    if X is None:
        return outlier_model.keys()
    model = outlier_model.get(method)
    if model is None:
        raise ValueError("method '{}' is invalid".format(method))
    y_pred = model.fit_predict(X)
    return X[y_pred == 1], y[y_pred == 1]
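A hypothetical call to the resampling helper above on synthetic data; the arrays and the method choice are assumptions for illustration (the function's signature is also the shape expected by a resampling wrapper such as imbalanced-learn's FunctionSampler, though only a direct call is shown here).

import numpy as np

rng = np.random.RandomState(42)
X = rng.randn(500, 3)        # hypothetical features
y = rng.randint(0, 2, 500)   # hypothetical labels
X_clean, y_clean = outlier_rejection(X, y, method='OneClassSVM', contamination=0.05)
print(X.shape, '->', X_clean.shape)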
Example #38
def runOCSVMGridSearch(data_folder, cfg):
    #Gather the dataset
    #train_x[:len(train_x)/2] = (training) 70% of reg samples
    #train_x[len(train_x)/2:] = (training) 70% of facet samples
    #test_x[:len(test_x)/2] = (testing) 30% of reg samples
    #test_x[len(test_x)/2:] = (testing) 30% of facet samples
    train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg)

    # PreProcess data
    train_x, test_x = preprocessData(train_x, test_x, "soft_scaling")

    parameters = {'nu': np.linspace(0.01, 1, 100)}

    # fit the model
    #clf = OneClassSVM(nu=0.95 * 0.5 + 0.05, kernel="rbf")
    clf = GridSearchCV(OneClassSVM(kernel="rbf"),
                       parameters,
                       cv=5,
                       scoring='recall')
    clf.fit(train_x, train_y[:len(train_y) // 2])  #fit to normal samples only

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print("Grid scores on training set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    print("Classification results on test set:")
    y_true, y_pred = test_y, clf.predict(test_x)
    print(y_true)
    print(y_pred)
    print(classification_report(y_true, y_pred))
    print(accuracy_score(y_true, y_pred))
    print("accuracy: ", accuracy_score(y_true, y_pred))
    print("precision: ", precision_score(y_true, y_pred))
    print("recall: ", recall_score(y_true, y_pred))
    print("area under curve (auc): ", roc_auc_score(y_true, y_pred))
Example #39
def train_CSD_SVM(args):
    '''
    Train a SVM outlier detector using real images
    :param real_img_dir: A directory contains real images
    :param svm_model_path: A path for saving trained model
    :return:
    '''
    train_paths = list(
        map(lambda x: args.real_img_dir + x, os.listdir(args.real_img_dir)))
    logging.info("Training file paths: {}".format(len(train_paths)))
    train_feat = get_color_feat(train_paths)
    train_feat = np.squeeze(train_feat, axis=1)
    y_true = [1] * np.shape(train_feat)[0]
    # train SVM
    parameters = {'gamma': [0.001, 0.0001, 1 / 588, 0.01, 0.1]}
    svm_model = OneClassSVM(nu=0.1, kernel="rbf")
    clf = GridSearchCV(svm_model, parameters, cv=5, scoring='accuracy')
    clf.fit(train_feat, y_true)
    logging.info(clf.best_estimator_.get_params())
    # save the model
    joblib.dump(clf.best_estimator_, args.svm_model_path)
    logging.info('model saved')
Example #40
def detect_outlier(data,
                   classifier="Robust Covariance",
                   outlier_fraction=0.005):
    classifiers = {
        "Empirical Covariance":
        EllipticEnvelope(support_fraction=1., contamination=outlier_fraction),
        "Robust Covariance":
        EllipticEnvelope(contamination=outlier_fraction),
        "OCSVM":
        OneClassSVM(nu=outlier_fraction, gamma=0.05)
    }
    # colors = ['m', 'g', 'b']
    legend = {}
    # Learn a frontier for outlier detection with several classifiers
    xx1, yy1 = np.meshgrid(np.linspace(5, 10, 500), np.linspace(10, 15, 500))
    plt.figure(1)
    clf = classifiers[classifier]
    clf.fit(data)
    scores = clf.decision_function(np.c_[xx1.ravel(),
                                         yy1.ravel()]).reshape(xx1.shape)
    legend[classifier] = plt.contour(xx1,
                                     yy1,
                                     scores,
                                     levels=[0],
                                     linewidths=2,
                                     colors='m',
                                     linestyles='dashed')
    legend_key = list(legend.keys())
    # Plot the results (= shape of the data points cloud)
    plt.figure(1)  # two clusters
    plt.title("Identify potential outliers")
    plt.xlabel('log: Above grade (ground) living area square feet')
    plt.ylabel('log: Sales Price')
    plt.scatter(data.iloc[:, 0], data.iloc[:, 1], color='black')
    plt.xlim((xx1.min(), xx1.max()))
    plt.ylim((yy1.min(), yy1.max()))
    plt.legend([legend_key[0]]).legendHandles[0].set_color('m')
    plt.show()
Example #41
    def __init__(self,
                 kernel,
                 detector,
                 labeled=True,
                 WL_iter=5,
                 PK_bin_width=1,
                 LOF_n_neighbors=20,
                 LOF_n_leaf=30,
                 **kwargs):
        kernels = {
            'WL':
            WeisfeilerLehman(n_iter=WL_iter,
                             normalize=True,
                             base_graph_kernel=VertexHistogram),
            'PK':
            Propagation(t_max=WL_iter, w=PK_bin_width, normalize=True)
            if labeled else PropagationAttr(
                t_max=WL_iter, w=PK_bin_width, normalize=True),
        }
        detectors = {
            'OCSVM':
            OneClassSVM(kernel='precomputed', nu=0.1),
            'LOF':
            LocalOutlierFactor(n_neighbors=LOF_n_neighbors,
                               leaf_size=LOF_n_leaf,
                               metric='precomputed',
                               contamination=0.1),
            # 'IF': current similarity forest also has problem
        }

        assert kernel in kernels.keys()
        assert detector in detectors.keys()

        self.kernel = kernels[kernel]
        self.detector = detectors[detector]
        self.kernel_name = kernel
        self.detector_name = detector
        self.labeled = labeled
Example #42
def trainLocalModel(r, key, datas):

    r_list = datas
    requests = pipeline.group_requests(
        r_list, lambda r: '{} {}'.format(r.method, r.url))

    for i, (k, v_list) in enumerate(sorted(requests.items())):

        d2 = pipeline.group_requests(v_list, lambda r: r.label_type)

        normal_request = d2.get('normal', [])
        anormal_request = d2.get('anormal', [])

        anormal_size = int(0.01 * len(normal_request))

        train_list, _ = train_test_split(normal_request +
                                         anormal_request[-anormal_size:-1],
                                         random_state=RANDOM_STATE,
                                         train_size=TRAIN_SIZE)

        clf_svm = make_pipeline(
            make_union(*[class_() for class_ in TF_LIST]),
            OneClassSVM(kernel='sigmoid', nu=NU, gamma='auto'))

        clf_isolation = make_pipeline(
            make_union(*[class_() for class_ in TF_LIST]),
            IsolationForest(n_estimators=128,
                            max_samples=400,
                            max_features=0.7,
                            random_state=rng))

        #clf_lof =  make_pipeline(
        #    make_union(*[class_() for class_ in TF_LIST]),
        #    LocalOutlierFactor(n_neighbors=20))

        models = [clf_svm.fit(train_list),
                  clf_isolation.fit(train_list)]  #, clf_lof.fit(train_list)]
        return models
Example #43
def fit_selected_classifier(X_data):
    """ Fits the selected classifier

        Parameters:
            X_data (np.ndarray) - input dataset

        Returns:
            None
    """
    global classifier

    if sel_classifier == Classifier.OCSVM:
        #classifier = OneClassSVM(kernel='sigmoid', gamma='scale', nu=0.1).fit(X_data)
        classifier = OneClassSVM(gamma='scale', verbose=True).fit(X_data)

    if sel_classifier == Classifier.IFOREST:
        classifier = IsolationForest(random_state=0, contamination=0.1).fit(X_data)

    if sel_classifier == Classifier.EllipticEnvelope:
        classifier = EllipticEnvelope(random_state=0, contamination=0.1).fit(X_data)

    if sel_classifier == Classifier.LocalOutlierFactor:
        classifier = LocalOutlierFactor(contamination=0.1, novelty=True).fit(X_data)
Example #44
 def fit(self, params):
     if len(self.data) <= 1:
         self.DEBUG_INFO = "No samples are there!"
         self.changed("alert_generated")
         return
     X = np.asarray(self.data)[:,0:2]
     y = np.asarray(self.data)[:,2]
     if np.unique(y).size == 1:
         # only one-class
         self.clf = OneClassSVM(nu=params['nu'],
                          gamma=params['gamma'],
                          degree=params['degree'],
                          coef0=params['coef0'],
                          kernel=params['kernel'])
         self.clf.fit(X)
     else:
         self.clf = SVC(C=params['C'],
                        gamma=params['gamma'],
                        degree=params['degree'],
                        coef0=params['coef0'],
                        kernel=params['kernel'])
         self.clf.fit(X, y)
     self.is_fitted = True
     self.changed("model_fitted")
Example #45
    def __init__(
        self,
        trainable_invertible_ica,
        predictor_model,
        novelty_detector=OneClassSVM(nu=0.1, gamma="auto"),
        aug_max_iter: Optional[int] = None,
        augmentation_size: Optional[int] = None,
    ):
        """Build CausalMechanismTransfer object.

        Parameters
        ----------
        trainable_invertible_ica : object
            Trainable invertible ICA model for estimating the mechanism function.
            Required to implement ``train()`` and ``inv()``.

        predictor_model : object
            Trainable predictor model to be trained on the augmented data. Needs to implement ``fit()`` and ``predict()``.

        aug_max_iter : int or None
            The maximum number of iterations for performing the augmentation.

        augmentation_size : int or None
            The size of the augmentation. Fully augmented if ``None``.

        Returns
        ----------
        None : None
        """
        self.trainable_invertible_ica = trainable_invertible_ica
        self.augmenter = ICATransferAugmenter(
            self.trainable_invertible_ica.get_invertible_ica_model(),
            novelty_detector=novelty_detector,
            max_iter=aug_max_iter)
        self.predictor_model = predictor_model
        self.augmentation_size = augmentation_size
Example #46
def single_eval_one_class(X_train, X_test, y_test, species_train, args,
                          class_list):

    if args.classifier == 'GaussianMixed':
        # define the gaussian mixed model, fit to training data and make predictions
        clf = GaussianMixed(class_list,
                            threshold=args.threshold,
                            mixmodel=True,
                            epsilon=1e-6)
        clf.fit(X_train, species_train)
        y_preds, y_score = clf.predict(X_test)

    else:
        # define the one class svm, and fit it to the training data
        clf = OneClassSVM(kernel=args.classifier, gamma='auto')
        clf.fit(X_train)
        #make predictions and calculate the roc curve
        y_preds = clf.predict(X_test)
        y_score = clf.decision_function(X_test)

    # calculate the false and true positive rate, followed by the AUROC
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    return y_preds, roc_auc, fpr, tpr
Example #47
    def _load_detection_models(self):
        self._detection_models = {}

        for ds_url in data_sets.DS_URL_LIST[TEST_CONFIG['DS_URL_SLICE']]:
            try:
                normal_list = self._get_from_data_server(ds_url, 'n')
            except ValueError as err:
                self.log_debug(
                    '_load_detection_models',
                    'could not get req_list "{}": {}'.format(ds_url, err))
                return

            train_list, _ = train_test_split(normal_list,
                                             random_state=RANDOM_STATE,
                                             train_size=TRAIN_SIZE)

            clf = make_pipeline(
                make_union(*[class_() for class_ in TF_LIST]),
                OneClassSVM(random_state=0, nu=NU, gamma=GAMMA))
            clf.fit(train_list)

            key = str(normal_list[0])
            self._detection_models[key] = clf
            self.log_debug('_load_detection_models', 'loaded "{}"'.format(key))
Example #48
def fit_sklearn_model(embeddings, model_name, output_filename, n_neighbors=4):
    logger.info('final size of the collected embeddings: {}'.format(
        len(embeddings)))
    embedding_array = np.concatenate(embeddings)

    if model_name == 'local_outlier_factor':
        logger.info('using local outlier factor with n_neighbour {}'.format(
            n_neighbors))
        clf = LocalOutlierFactor(n_neighbors=n_neighbors,
                                 novelty=True,
                                 contamination=0.1)
    elif model_name == 'isolation_forest':
        clf = IsolationForest(contamination=0.1)
    elif model_name == 'svm':
        clf = OneClassSVM(kernel='linear')
    else:
        raise ValueError('model {} not supported'.format(model_name))
    clf.fit(embedding_array)

    logger.info('Saving OOD model to {}'.format(output_filename))
    with open(output_filename, "wb") as out_stream:
        pickle.dump(clf, out_stream)

    return clf
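A sketch of reloading the pickled outlier model written by the function above and scoring a new embedding batch; the file path, the synthetic embeddings, and the module-level logger the function relies on are all assumptions for illustration.

import pickle
import numpy as np

embeddings = [np.random.RandomState(0).randn(50, 16)]   # hypothetical embedding batch
clf = fit_sklearn_model(embeddings, 'svm', '/tmp/ood_model.pkl')
with open('/tmp/ood_model.pkl', 'rb') as in_stream:
    restored = pickle.load(in_stream)
print(restored.predict(embeddings[0])[:10])              # +1 inlier / -1 outlier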
Example #49
    def __generate_probas(self, samples, resolution, affinity_matrix, number_of_questions):
        print(
            f"📞 Looks like there's a probability distribution ({self.name}) that wants to phone in an expert (that's "
            f"you)\n"
        )
        clf = OneClassSVM(kernel='precomputed')
        samples_and_weights = {0: 0.5}
        for nq in range(number_of_questions):
            indices = list(samples_and_weights.keys())
            if nq == 0:
                idx = np.random.choice(range(1, len(samples)))
            else:
                preds = clf.decision_function(affinity_matrix[:, indices])
                idx = [i for i, _ in sorted(enumerate(preds), key=lambda x: x[1]) if i not in samples_and_weights][
                    0]
            sample = samples[idx]

            print('Score the sample below with a number between 0 and 1 (higher is better)\n')
            if hasattr(sample, '_repr_html_'):
                print(sample)
            else:
                print(sample)
            weight = float(input('Score: '))
            assert 0 <= weight <= 1

            samples_and_weights[idx] = weight
            indices = list(samples_and_weights.keys())
            clf.fit(
                affinity_matrix[indices, :][:, indices],
                sample_weight=list(samples_and_weights.values())
            )

        indices = list(samples_and_weights.keys())
        preds = clf.decision_function(affinity_matrix[:, indices])
        scores = KernelDiscretizedMethod.discretized_scores(
            resolution,
            samples,
            affinity_matrix,
            lambda mask, _idx: preds[mask].mean())

        Z = logsumexp([s for s in scores.values()])

        return {idx: s - Z for idx, s in scores.items()}
Example #50
def main():

    print "loading data"

    train = read_data(0)
    print train.shape
    label = np.ones(train.shape[0], )
    print label.shape

    # train = train.tolist()
    # label = label.tolist()

    print "training the one-class SVM"

    model = OneClassSVM(kernel='rbf',
                        nu=0.2,
                        degree=3,
                        gamma=0.009,
                        shrinking=1)
    model.fit(train)

    print "predicting the test data"

    label_test = np.ones(200, )
    test = read_data(1)

    pred1 = model.predict(train)
    print pred1[np.where(pred1 > 0)].sum() / pred1.shape[0]
    pred2 = model.predict(test)
    print pred2[np.where(pred2 > 0)].sum() / pred2.shape[0]
    # pred = np.zeros(200,)
    pred = []

    for i in range(200):
        #print 'the iteration:', i, p_label[i]
        if pred2[i] == 1:
            pred.append('healthy')
        elif pred2[i] == -1:
            pred.append('dzs_1r+dzs_1l')

    print len(pred)

    np.save('corrcoef_predict_9.npy', pred)
Example #51
def outlier_detection_SVM():
    df_X, df_y = load_confirmed()
    df_X = remove_not_numeric(df_X)

    index = df_X[df_y == 1].index

    X = df_X.drop(index)
    y = df_y.drop(index)
    print(X)
    print(len(index), index)
    div = int(len(X) * 0.7)
    X_train = X[:div]
    X_test = X[div:]
    X_outliers = df_X.ix[index]
    print(X_outliers)

    pipe = Pipeline([("imputer", Imputer()), ("scaler", MinMaxScaler()),
                     ("decomposition", PCA(n_components=100))])
    X_train = pipe.fit_transform(X_train)
    clf = OneClassSVM(gamma=(1 / len(X_train)), nu=0.37)
    clf.fit(X_train)

    X_test = pipe.transform(X_test)
    X_outliers = pipe.transform(X_outliers)
    print(X_outliers)

    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    print("**********  " + str(len(X_test)) + "   " + str(len(y_pred_test)))
    y_pred_outliers = clf.predict(X_outliers)
    n_error_train = y_pred_train[y_pred_train == -1].size
    n_error_test = y_pred_test[y_pred_test == -1].size
    n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

    print("train error:", n_error_train / len(y_pred_train))
    print("test error:", n_error_test / len(y_pred_test))
    print("outliers error:", n_error_outliers / len(y_pred_outliers))
    return y_pred_train, y_pred_test, y_pred_outliers, y[div:]
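Imputer and DataFrame.ix used above have since been removed from scikit-learn and pandas; a rough, hedged modern equivalent of the same preprocessing pipeline (the array shape below is only a stand-in for the dataframe slice built above) might look like this:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM

X_train = np.random.rand(300, 120)                   # stand-in for the training slice above
pipe = Pipeline([("imputer", SimpleImputer()),       # SimpleImputer replaces the removed Imputer
                 ("scaler", MinMaxScaler()),
                 ("decomposition", PCA(n_components=100))])
X_train_t = pipe.fit_transform(X_train)
clf = OneClassSVM(gamma=1.0 / len(X_train_t), nu=0.37).fit(X_train_t)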
Example #52
0
def slice_probability_space_selection(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50),
                                      rho=0.05, outlier_distribution=np.random.rand, folds_count=7):
    kf_iterator = KFold(len(data), n_folds=folds_count)
    all_errors = []
    for gamma in all_gammas:
        error = 0.0
        for train, test in kf_iterator:
            train_data = data[train,:]
            test_data = data[test,:]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            inlier_metric_part = np.mean(prediction == -1)
            inlier_metric_part = inlier_metric_part / (1 + rho) / len(data)
            outliers = outlier_distribution(*data.shape) - 0.5
            outliers *= 8 * np.std(data)
            outlier_metric_part = np.mean(clf.predict(outliers) == 1) * rho / (1 + rho) / len(outliers)
            error += inlier_metric_part + outlier_metric_part
        all_errors.append(error / folds_count)
    index = np.argmin(all_errors)
    #best_index = pd.Series(all_errors).pct_change().argmax() - 1
    return int(index), all_errors
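Hypothetical usage of the selection routine above: pick the gamma with the smallest cross-validated slice metric, then refit on the full sample (the data here is synthetic, and this assumes the older KFold(n, n_folds=...) API the function itself uses is available).

import numpy as np
from sklearn.svm import OneClassSVM

data = np.random.randn(200, 2)                       # made-up 2-D sample
gammas = 2 ** np.linspace(-10, 10, 50)
best_idx, errors = slice_probability_space_selection(data, nu=0.05, all_gammas=gammas)
final_model = OneClassSVM(nu=0.05, gamma=gammas[best_idx]).fit(data)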
Example #53
0
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1',
	ntrials=10, verbose=True, seed=123456789):
	"""
	Run a single experiment, locally.
	
	@param pct_noise: The percentage of noise to add to the dataset.
	
	@param noverlap_bits: The number of bits the base class should overlap
	with the novelty class.
	
	@param exp_name: The name of the experiment.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param verbose: If True print the results.
	
	@param seed: The random seed to use.
	
	@return: A tuple containing the percentage errors for the SP's training
	and testing results and the SVM's training and testing results,
	respectively.
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
	clf_th = 0.5
	log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
		'novelty_experiments', exp_name)
	
	# Configure the SP
	config = {
		'ninputs': 100,
		'trim': 1e-4,
		'disable_boost': True,
		'seed': seed,
		'pct_active': None,
		'random_permanence': True,
		'pwindow': 0.5,
		
		'global_inhibition': True,
		
		'ncolumns': 200,
		'nactive': 50,
		
		
		'nsynapses': 75,
		'seg_th': 15,
		
		'syn_th': 0.5,
		
		'pinc': 0.001,
		'pdec': 0.001,
		
		'nepochs': 10,
		
		'log_dir': log_dir
	}
	
	# Seed numpy
	np.random.seed(seed)
	
	# Create the base dataset
	x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
	x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]
	
	# Create the outlier dataset
	base_indexes = set(np.where(x_ds.base_class == 1)[0])
	choices = [x for x in xrange(nbits) if x not in base_indexes]
	outlier_base = np.zeros(nbits, dtype='bool')
	outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
		False)] = 1
	outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
	y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
	y_te = y_ds.data
	
	if verbose:
		print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
			* x_ds.base_class.astype('i')).sum() / 40.)
		print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
			outlier_base.astype('i')).sum() / 40.)
		print 'Overlap between two classes: {0}'.format(np.dot(
			x_ds.base_class.astype('i'), outlier_base.astype('i')))
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	c_x_tr = 1 - metrics.compute_distance(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	c_x_te = 1 - metrics.compute_distance(x_te)
	u_y_te = metrics.compute_uniqueness(y_te)
	o_y_te = metrics.compute_overlap(y_te)
	c_y_te = 1 - metrics.compute_distance(y_te)
	
	# Initialize the overall results
	sp_x_results = np.zeros(ntrials)
	sp_y_results = np.zeros(ntrials)
	svm_x_results = np.zeros(ntrials)
	svm_y_results = np.zeros(ntrials)
	
	# Iterate across the trials:
	for i in xrange(ntrials):
		# Make a new seed
		seed2 = np.random.randint(1000000)
		config['seed'] = seed2
		config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)
		
		# Create the SP
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = sp.predict(y_te)
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
		u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
		o_sp_y_te = metrics.compute_overlap(sp_y_te)
		c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Train Correlation', c_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Base Class Test Correlation', c_x_te)
		sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
		sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
		sp._log_stats('Input Novelty Class Test Correlation', c_y_te)	
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
		sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
		sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
		sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)
		
		# Print the results
		fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
		if verbose:
			print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
			print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
				u_sp_x_te, u_sp_y_te)
			print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr, o_sp_x_te,
				o_sp_y_te)
			print fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te, c_sp_x_tr,
				c_sp_x_te, c_sp_y_te)
		
		# Get average representation of the base class
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		c_sp_base_to_x_te = 0.
		u_sp_base_to_y_te = 0.
		o_sp_base_to_y_te = 0.
		c_sp_base_to_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the sums
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
			u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
			o_sp_base_to_y_te += metrics.compute_overlap(yt)
			c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		c_sp_base_to_x_te /= ntest
		u_sp_base_to_y_te /= ntest
		o_sp_base_to_y_te /= ntest
		c_sp_base_to_y_te /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
		sp._log_stats('Base Train to Novelty Test Uniqueness',
			u_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Correlation',
			c_sp_base_to_y_te)
		
		# Print the results
		if verbose:
			print '\nDescription\tx_tr->x_te\tx_tr->y_te'
			print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
				u_sp_base_to_y_te)
			print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
				o_sp_base_to_y_te)
			print 'Correlation:\t{0:2.4f}\t{1:2.4f}'.format(c_sp_base_to_x_te,
				c_sp_base_to_y_te)
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
			100
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the accuracy
			xo = metrics.compute_overlap(xt)
			yo = metrics.compute_overlap(yt)
			if xo >= clf_th: clf_x_te += 1
			if yo < clf_th: clf_y_te += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		sp_x_results[i] = 100 - clf_x_te
		sp_y_results[i] = 100 - clf_y_te
		svm_x_results[i] = 100 - svm_x_te
		svm_y_results[i] = 100 - svm_y_te
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SP % Correct Novelty Class', clf_y_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		sp._log_stats('SVM % Correct Novelty Class', svm_y_te)
		
		# Print the results
		if verbose:
			print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
			print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
			print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
			print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)
	
	return sp_x_results, sp_y_results, svm_x_results, svm_y_results
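A possible driver for base_experiment, sweeping the class overlap and averaging the returned per-trial error rates; it assumes SPRegion, SPDataset, and their dependencies are importable, and the sweep values are illustrative.

import numpy as np

for overlap in (0, 10, 20):
    sp_x, sp_y, svm_x, svm_y = base_experiment(noverlap_bits=overlap,
        exp_name='sweep-{0}'.format(overlap), ntrials=5, verbose=False)
    print('overlap={0}: SP err {1:.2f}% / {2:.2f}%, SVM err {3:.2f}% / {4:.2f}%'.format(
        overlap, np.mean(sp_x), np.mean(sp_y), np.mean(svm_x), np.mean(svm_y)))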
Example #54
0
def main():
	
	usage="refine2d using simmx information "
	parser = EMArgumentParser(usage=usage,version=EMANVERSION)
	parser.add_argument("--ptcls", type=str,help="particle file", default=None)
	parser.add_argument("--simmx", type=str,help="simmx", default=None)
	parser.add_argument("--npca", type=int,help="number of pca factors", default=10)
	parser.add_argument("--niter", type=int,help="number of iterations", default=5)
	parser.add_argument("--outlier", type=float,help="outlier fraction", default=0.1)
	parser.add_argument("--ncls", type=int,help="number of centers", default=128)
	parser.add_argument("--nref", type=int,help="number of references", default=32)
	(options, args) = parser.parse_args()
	logid=E2init(sys.argv)
	
	simmxfile=options.simmx
	for itr in range(options.niter):
		### start from the simmx
		print "Pre-processing simmx"
		e=EMData(simmxfile)
		pts=e.numpy().T.copy()
		for i in range(len(pts)):
			pts[i]-=np.mean(pts[i])
			pts[i]/=np.std(pts[i])
		pts = pts.astype(np.float64).copy()
		#e=from_numpy(pts.T.copy())
		#e.write_image("simmx_tmp.hdf")
		#exit()
		
		print "Doing PCA"
		(nptcl, ncls) = pts.shape
		#nfac=options.npca
		pca=PCA(options.npca)
		pts_pca=pca.fit_transform(pts)
		bs=pts_pca
		bs/=np.std(bs)
		print bs.shape,pts.shape
		np.savetxt("test_pca_{:02d}".format(itr),pts_pca)
		
		print "Removing outliers"
		outliers_fraction=options.outlier
		svm=OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
		svm.fit(bs)
		y_pred = svm.decision_function(bs).ravel()
		nkeep=int(len(bs)*(1-outliers_fraction))
		st=np.argsort(y_pred)[::-1]
		st=st[:nkeep]
		
		print "Clustering"
		ncnt=options.ncls
		centroids,_ = kmeans(bs[st],ncnt)
		l,_ = vq(bs[st],centroids)
		
		labels=np.zeros(len(bs))-1
		labels[st]=l
		
		print "Class averaging"
		e=EMData(1,len(labels))
		for i in range(len(labels)):
			e.set_value_at(0,i,labels[i])
		clsmxfile="clsmx_{:02d}.hdf".format(itr)
		e.write_image(clsmxfile)
		
		clsout="classes_{:02d}.hdf".format(itr)
		run("e2classaverage.py --input={} --classmx={} --output={} --force --center xform.center --iter=5 --align=rotate_translate_flip:maxshift=32 --averager=mean --keep=.6 --cmp=ccc --aligncmp=ccc --normproc=normalize --parallel=thread:12".format(options.ptcls,clsmxfile,clsout))
		
		simmxfile="simmx_{:02d}.hdf".format(itr)
		run("e2simmx.py {} {} {} --align rotate_translate_flip --aligncmp ccc --cmp ccc --saveali --parallel thread:12".format(options.ptcls, clsout, simmxfile))
	

	E2end(logid)
Example #55
0
 def decision_function(self, data):
     return -OneClassSVM.decision_function(self, data)
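The one-liner above presumably lives inside a subclass; a hedged sketch of such a wrapper (the class name is hypothetical), which flips the sign so that larger scores mean "more anomalous" rather than "more normal":

from sklearn.svm import OneClassSVM

class InvertedOneClassSVM(OneClassSVM):  # hypothetical name
    """OneClassSVM whose decision_function is negated: higher = more anomalous."""
    def decision_function(self, data):
        return -OneClassSVM.decision_function(self, data)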
Example #56
0
File: bench_ocsvm.py Project: ngoix/OCRF
            # indices = np.arange(X.shape[0])
            # np.random.shuffle(indices)  # shuffle the dataset
            # X = X[indices]
            # y = y[indices]

            X_train = X[:n_samples_train, :]
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('OneClassSVM processing...')
            model = OneClassSVM(cache_size=500)
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower,the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if fit_time + predict_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.
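Outside the benchmark loop, the same scoring idea can be checked in isolation: negate decision_function so that higher means more anomalous, then measure ranking quality with roc_curve and auc. The data below is synthetic and the anomaly fraction is arbitrary.

import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(42)
X_train = rng.randn(500, 4)                        # mostly "normal" data
X_test = np.vstack([rng.randn(200, 4), rng.uniform(-6, 6, size=(20, 4))])
y_test = np.r_[np.zeros(200), np.ones(20)]         # 1 marks the injected anomalies

model = OneClassSVM(cache_size=500).fit(X_train)
scoring = -model.decision_function(X_test)         # lower OCSVM score = more normal
fpr, tpr, _ = roc_curve(y_test, scoring)
print('AUC: {0:.3f}'.format(auc(fpr, tpr)))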
Example #57
0
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    # training and testing only on normal data:
    X_train = X_train[y_train == 0]
    y_train = y_train[y_train == 0]
    X_test = X_test[y_test == 0]
    y_test = y_test[y_test == 0]

    # define models:
    iforest = IsolationForest()
    lof = LocalOutlierFactor(n_neighbors=20)
    ocsvm = OneClassSVM()

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_test)
    print('LocalOutlierFactor processing...')
Example #58
0
ax.set_xlabel( 'Ratio' )
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )
ax.set_zlim( [ 0., 1. ] )
ax.set_xlim( [ 0., 500. ] )
ax.set_ylim( [ 0., 1. ] )

fig.show()

angles = np.linspace(0, 360, 41)[:-1]  # Take 40 angles between 0 and 360
rotanimate(ax, angles,'movie.gif',delay=20, width = 6., height = 5.) 

# do outlier search using one-class SVM
data[ 0, : ] = preprocessing.scale( data[ 0, : ] )

model = OneClassSVM( gamma = .001, nu = .1 )
fit = model.fit( data )
preds = model.predict( data )

inlier = np.where( preds == 1. )[ 0 ]
outlier = np.where( preds == -1. )[ 0 ]

fig = plt.figure()
ax = fig.add_subplot( 111, projection = '3d' )
ax.scatter( data[ inlier, 0 ], data[ inlier, 1 ], data[ inlier, 2 ], c = 'b' )
ax.scatter( data[ outlier, 0 ], data[ outlier, 1 ], data[ outlier, 2 ], c = 'k' )
ax.set_xlabel( '$P^2/A$' )
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )

ax.set_ylim( [0., 1 ] )
Example #59
0
def classifier(data):
    from sklearn.covariance import EllipticEnvelope
    from sklearn.svm import OneClassSVM
    from sklearn.datasets import load_boston
    from sklearn import preprocessing
    # Get data

    # Define "classifiers" to be used
    legend1 = {}
    legend2 = {}
    evaluation = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data] 
    X = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data]  
    X = preprocessing.scale(X)
    evaluation = preprocessing.scale(evaluation)
    # Learn a frontier for outlier detection with several classifiers
    sample = random.sample(X, 20000)
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(evaluation, 2000)
    print >> sys.stderr, "fitting data"    
    clf.fit(sample)
    print >> sys.stderr, "predicting data"
    Y = clf.predict(test)
    print >> sys.stderr, "plotting data"
    fig, axes = subplots()
    
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][2], test[i][1], c=color)
    #ylim([50,2000]) #num exons
    ylabel("distance")
    #xlim([3,10])
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")

    fig, axes = subplots()
    """
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][1], test[i][0], c=color)
    #xlim([0,10]) #num exons
    xlabel("number of exons")
    #ylim([3,15])
    ylabel("coverage")
    savefig("ExonsvsCoverage.pdf")
    """
    full_test = clf.predict(evaluation)
    novel, regular = [],[]
    for i in range(len(full_test)):
        result = full_test[i]
        if result == -1:
            print data[i]["id"]
            novel.append(data[i]["num_exons"])
        else:
            regular.append(data[i]["num_exons"])
    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    print >> sys.stderr, "novel, regular"
    print >> sys.stderr, len(novel), len(regular)
    print >> sys.stderr, mean(multi_exon_novel), mean(multi_exon_regular), len(multi_exon_novel), len(multi_exon_regular)