示例#1
0
class svm_model():
    """Thin wrapper around a one-class SVM used for novelty detection."""

    def train(self, X, ker):
        """Fit a fresh one-class SVM on ``X``.

        X: training samples, passed straight to ``OneClassSVM.fit``.
        ker: kernel specification forwarded as ``kernel=``.
        """
        # ``random_state`` is deliberately not passed: the one-class solver is
        # deterministic, the argument was always ignored, and current
        # scikit-learn releases reject it with a TypeError.
        self.model = OneClassSVM(kernel=ker, shrinking=True)
        self.model.fit(X)

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        ``train`` must have been called first; otherwise ``self.model`` does
        not exist and an AttributeError is raised.
        """
        return self.model.predict(X)
示例#2
0
def _five_number_summary(samples):
	"""Return the box-plot statistics [max, Q3, median, Q1, min] of a 1-D sample array."""
	return [np.max(samples), np.percentile(samples, 75), np.median(samples),
		np.percentile(samples, 25), np.min(samples)]


def main():
	"""Fit a one-class SVM on box-plot summaries of random series and score a test set.

	Every variable-length series is reduced to five numbers (max, upper
	quartile, median, lower quartile, min) so that all samples share the same
	dimensionality before they reach the SVM.
	"""
	n = 1000
	# The series have different lengths, so they stay in a plain list: wrapping
	# ragged arrays in np.array() raises ValueError on current NumPy releases.
	data = [np.random.randint(0, 5000, size=np.random.randint(20, 150)) for _ in range(n)]

	# making all the data into 5 dimensions
	# howto : boxplot
	x = np.array([_five_number_summary(series) for series in data])
	y = [0] * len(data)

	# An alternative featurisation (pairwise DTW distance matrix) was prototyped
	# here but is disabled.

	# build model with x
	model = OneClassSVM()
	model.fit(x)

	# create test dataset: longer series drawn from a wider value range
	test = [np.random.randint(0, 10000, size=np.random.randint(20000, 30000)) for _ in range(10)]

	# transform test dataset with the same summary used for training
	x = np.array([_five_number_summary(series) for series in test])
	y = [0] * len(test)

	# predict test dataset
	pred = model.predict(x)

	'''
示例#3
0
文件: learn.py 项目: cmcneil/openepoc
class Cluster(object):
    """A named one-class model built from binned feature vectors.

    Workflow: collect feature vectors (``add_data`` / ``extract_features``),
    project them with ``reduce_dim``, fit the SVM with ``train``, then query
    new bins with ``is_novel``.
    """

    def __init__(self, name):
        self.name = name
        self.raw_dataset = []   # raw sessions, consumed by extract_features()
        self.dataset = []       # extracted feature vectors
        self.dataset_red = []   # PCA-reduced feature vectors (ndarray after reduce_dim)

    def get_featurevec(self, data):
        """Take data in the form of an array of EmoPackets and return a
        list of feature vectors, one per (overlapping) bin."""
        stride = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        # Floor division keeps the bin count an integer on Python 3 as well;
        # plain ``/`` would hand a float to range().
        num_bins = (len(data) // stride -
                    int(dsp.BIN_SIZE / dsp.STAGGER) + 1)
        return [dsp.get_features(data[i * stride:i * stride + size])
                for i in range(num_bins)]

    def add_data(self, raw):
        """Append the feature vectors extracted from ``raw`` to the dataset.

        Expects a list of EmoPackets. Does not retrain; call ``reduce_dim``
        and ``train`` afterwards.
        """
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        """Rebuild ``self.dataset`` from every session in ``self.raw_dataset``."""
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        """Fit a PCA on the extracted feature vectors and store the
        ``NDIM``-dimensional projection in ``self.dataset_red``."""
        X = np.array(self.dataset)
        # NOTE(review): RandomizedPCA was removed from recent scikit-learn
        # releases in favour of PCA(svd_solver='randomized') - confirm against
        # the installed version.
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)

    def train(self):
        """Fit the one-class SVM on the reduced dataset."""
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        """Say whether the first bin of ``pt`` is novel, then absorb it.

        Expects an array of EmoPackets. Returns the SVM prediction for the
        bin; the projected bin is appended to the reduced dataset and the SVM
        is retrained.
        """
        # The original read an undefined name ``data`` here; the argument is ``pt``.
        features = np.atleast_2d(np.array(self.get_featurevec(pt)[0]))
        X = self.pca.transform(features)
        ans = self.svm.predict(X)
        # dataset_red is an ndarray after reduce_dim(), so it has no .append().
        self.dataset_red = np.vstack([self.dataset_red, X])
        self.train()
        return ans

    def save(self):
        """Pickle this object to ``data/<name>.pkl`` next to this module."""
        this_dir, this_filename = os.path.split(__file__)
        DATA_PATH = os.path.join(this_dir, "data", self.name+'.pkl')
        # The context manager closes the file even if pickling fails.
        with open(DATA_PATH, "wb") as dumpfile:
            pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
def select_best_support_vectors(data, nu=0.01, all_gammas=2.0 ** np.arange(-10, 10, 1)):
    """Pick the RBF gamma whose outlier and support-vector fractions best match ``nu``.

    For every candidate gamma a ``OneClassSVM`` is fitted on ``data`` and
    scored by the squared deviation of (a) the fraction of training points
    predicted as outliers and (b) the fraction of support vectors from ``nu``.

    Returns ``(best_gamma, all_errors)`` where ``all_errors`` holds one score
    per candidate, in the order of ``all_gammas``.
    """
    # The default grid uses a float base: an integer 2 raised to the negative
    # integers in the range makes NumPy raise ValueError at definition time.
    all_errors = []
    n_samples = float(len(data))
    for gamma in all_gammas:
        clf = OneClassSVM(nu=nu, gamma=gamma)
        clf.fit(data)
        outlier_fraction = np.sum(clf.predict(data) == -1) / n_samples
        support_fraction = len(clf.support_vectors_) / n_samples
        all_errors.append((outlier_fraction - nu) ** 2 + (support_fraction - nu) ** 2)
    index = np.argmin(all_errors)
    return all_gammas[index], all_errors
示例#5
0
class NoveltySeparator(BaseEstimator):
    """Route samples through a one-class SVM and regress inliers and outliers separately."""

    def get_params(self, deep=True):
        """Return an empty dict: the estimator exposes no tunable parameters."""
        return {}

    def fit(self, X, y):
        """Fit the novelty detector, then one Ridge regressor per predicted group.

        X: 2-D array; column 0 is compared against ``y`` to define inliers.
        y: 1-D target array aligned with ``X``.
        Returns ``self``.
        """
        # lets treat users spending something in the rest of the month as outliers
        inliers = (y - X[:, 0]) < 0.1

        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])
        # from here on the split is the detector's prediction, not the heuristic
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        # The original fitted est_inliers a second time here, which left
        # est_outliers unfitted and overwrote the inlier model.
        self.est_outliers.fit(X[outliers], y[outliers])
        return self

    def predict(self, X):
        """Predict with the regressor matching each sample's detector label."""
        y = np.zeros(X.shape[0])

        labels = self.detector.predict(X).reshape(X.shape[0])
        inliers = labels == 1
        outliers = labels == -1

        # Skip empty groups: the regressors cannot predict on zero samples.
        if inliers.any():
            y[inliers] = self.est_inliers.predict(X[inliers])
        if outliers.any():
            y[outliers] = self.est_outliers.predict(X[outliers])

        return y
def slice_probability_space_selection(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50),
    rho=0.05, outlier_distribution = np.random.rand, folds_count=7):
    """Score every gamma by cross-validated inlier rejection and synthetic-outlier acceptance.

    For each fold a ``OneClassSVM(nu, gamma)`` is fitted on the training part.
    The error adds the fraction of held-out points rejected as outliers and
    the fraction of synthetic outliers accepted as inliers, weighted by
    ``rho``. Synthetic outliers are drawn from ``outlier_distribution`` with
    the shape of ``data``, centred and scaled by ``8 * std(data)``.

    Returns ``(best_index, all_errors)``: the index into ``all_gammas`` with
    the lowest mean error, and the mean error of every candidate.
    """
    # NOTE(review): KFold(n, n_folds=...) is the legacy cross_validation
    # signature; confirm which KFold this file imports.
    kf_iterator = KFold(len(data), n_folds=folds_count)
    all_errors = []
    for gamma in all_gammas:
        error = 0.0
        # (a throw-away estimator used to be built here; only the per-fold one is used)
        for train, test in kf_iterator:
            train_data = data[train,:]
            test_data = data[test,:]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            inlier_metric_part = np.mean(prediction == -1)
            inlier_metric_part = inlier_metric_part / (1 + rho) / len(data)
            outliers = outlier_distribution(*data.shape) - 0.5
            outliers *= 8 * np.std(data)
            outlier_metric_part = np.mean(clf.predict(outliers) == 1) * rho / (1 + rho) / len(outliers)
            error += inlier_metric_part + outlier_metric_part
        all_errors.append(error / folds_count)
    index = np.argmin(all_errors)
    return int(index), all_errors
示例#7
0
def outlier_detect(data_frame):
    """Return the ``hourly`` values of rows that a one-class SVM flags as outliers.

    The SVM is fitted on the five tag-count columns of ``data_frame`` and the
    same rows are then scored; rows predicted as -1 are reported.
    """
    # pandas to numpy - digestible by scikit
    columns = ['blm_tag_count','protest_count','justice_count','riot_count','breathe_count']
    features = data_frame[columns].values

    clf = OneClassSVM(nu=0.008, gamma=0.05)
    clf.fit(features)
    y_pred = clf.predict(features)

    # Plain boolean mask: wrapping it in a list makes NumPy treat it as a
    # 2-D index, which fails on current releases.
    mask = y_pred == -1
    oak_array = np.asarray(data_frame.hourly)
    protest_hours = list(oak_array[mask])

    return protest_hours
示例#8
0
文件: svm.py 项目: bondarchukYV/AD
def svm(data, fraction=0.05, kernel='poly', degree=3, gamma=0, coeff=0):
    """Return the 1-based row numbers of ``data`` that a one-class SVM marks anomalous.

    fraction: forwarded as ``nu``.
    kernel, degree, gamma, coeff: forwarded to ``OneClassSVM``; ``coeff`` maps
        to its ``coef0`` argument.
    Returns a column vector of shape (n_anomalies, 1) with the row numbers
    whose prediction is -1.
    """
    # NOTE(review): the default gamma=0 is passed through unchanged and may be
    # rejected depending on the installed scikit-learn; confirm.
    # The original passed a misspelled ``coeff0`` keyword, which the
    # constructor does not accept.
    model = OneClassSVM(kernel=kernel, degree=degree, gamma=gamma, nu=fraction, coef0=coeff)
    model.fit(data)

    score = model.predict(data)
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)

    # Keep only the rows predicted as outliers. The original loop compared and
    # overwrote the wrong variable, so it returned every row number.
    anomalies = numeration[score == -1]

    return anomalies
def select_best_outlier_fraction_cross_val(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50), folds_count=7):
    """Cross-validate every gamma on how well outlier and support-vector fractions match ``nu``.

    For each fold a ``OneClassSVM(nu, gamma)`` is fitted on the training part.
    The fold error is the squared deviation of the held-out outlier fraction
    from ``nu`` plus the squared deviation of the support-vector fraction
    (relative to ``len(data)``) from ``nu``.

    Returns ``(best_index, all_errors)``: the index into ``all_gammas`` with
    the lowest mean error, and the mean error of every candidate.
    """
    # The default grid is 50 log-spaced gammas. The previous
    # ``2 ** np.arange(-10, 10, 50)`` produced a single integer exponent and
    # raised ValueError (integer to a negative integer power) at definition time.
    # NOTE(review): KFold(n, n_folds=...) is the legacy cross_validation
    # signature; confirm which KFold this file imports.
    all_errors = []
    kf_iterator = KFold(len(data), n_folds=folds_count)
    for gamma in all_gammas:
        error = 0
        for train, test in kf_iterator:
            train_data = data[train,:]
            test_data = data[test,:]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            outlier_fraction = np.mean(prediction == -1)
            error += (nu - outlier_fraction) ** 2 + (float(clf.support_vectors_.shape[0]) / len(data) - nu) ** 2
        all_errors.append(error / folds_count)
    # Choose across all candidates; the original took argmin of the last
    # gamma's scalar error, which is always 0.
    best_index = np.argmin(all_errors)
    return int(best_index), all_errors
示例#10
0
class OneClassSVMDetector(BaseOutlier):
    """Univariate anomaly detector backed by a one-class SVM.

    ``predict`` and ``plot`` read ``self.data.index`` / ``self.data.iloc``, so
    the fitted data is expected to be a pandas object.
    """
    @staticmethod
    def get_attributes():
        """Return the configurable attributes with their default or allowed values."""
        return {
            "nu":0.1,
            "kernel":['rbf','linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
            "gamma":0.1,
        }
    def __init__(self,nu=0.1,kernel='rbf',gamma=0.1):
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma
    def fit(self,data=None):
        """Fit the SVM on ``data`` flattened to a single feature column.

        Note: pandas input is interpolated in place, so the caller's object is
        modified. Returns ``self``.
        """
        self.data = data
        self.check_finite(data)
        if(self._is_using_pandas(data)==True):
            self.data.interpolate(inplace=True)
        self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel, gamma=self.gamma)
        # np.asarray first: pandas objects no longer expose .reshape().
        self.clf.fit(np.asarray(data).reshape(-1,1))
        return self
    def predict(self, X_test):
        """Return a DataFrame of the fitted data points whose prediction is -1.

        The positions of outliers in ``X_test`` are used to index the data
        stored by ``fit``, so ``X_test`` must be aligned with it. Also stores
        ``anomaly_idx`` and ``anom_val`` for ``plot``.
        """
        y_pred_train = self.clf.predict(np.asarray(X_test).reshape(-1,1))

        outlier_idx = np.where(y_pred_train == -1)
        d = {
            'timestamp': self.data.index[outlier_idx],
            'anoms': self.data.iloc[outlier_idx]
        }
        anoms = pd.DataFrame(d)
        self.anomaly_idx = anoms.index
        self.anom_val = anoms['anoms']
        return anoms
    def fit_predict(self, data=None):
        """Fit on ``data`` and return its detected anomalies."""
        self.fit(data)
        return self.predict(data)
    def plot(self):
        """Plot the fitted series with detected anomalies in red; return the figure."""
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(1, 1)
        ax.plot(self.data, 'b')
        ax.plot(self.anomaly_idx, self.anom_val, 'ro')
        ax.set_title('Detected Anomalies')
        ax.set_ylabel('Count')
        f.tight_layout()
        return f
def cross_validate():
    """Train a one-class SVM on TARGET==0 rows and report how well it separates TARGET==1.

    Reads ``./data/train.csv``. A random subset of zeros, the same size as the
    ones, is held out. The test set is that subset plus every TARGET==1 row.
    Prints the confusion matrix and the accuracy.
    """
    #for tinkering with the model
    #read data
    all_df = pd.read_csv('./data/train.csv',index_col = 'ID')

    #split data
    zeros_df = all_df[all_df.TARGET == 0]
    ones_df = all_df[all_df.TARGET == 1]

    num_ones = ones_df.shape[0]
    # random boolean mask selecting exactly num_ones of the zero rows
    msk = np.random.permutation(len(zeros_df)) < num_ones

    zeros_train_df = zeros_df[~msk]
    zeros_test_df = zeros_df[msk]

    ones_test_df = ones_df

    train_df = zeros_train_df
    test_df = pd.concat([zeros_test_df,ones_test_df])

    train_X = np.array(train_df.drop('TARGET', axis = 1))

    test_X = np.array(test_df.drop('TARGET',axis = 1))
    test_Y = np.array(test_df.TARGET) #true target values

    #init svm
    print('training svm')
    my_svm = OneClassSVM(verbose = True)
    my_svm.fit(train_X)

    #predict
    print('predicting')
    raw_predictions = my_svm.predict(test_X)
    # The SVM answers +1 (inlier) / -1 (outlier) while TARGET is 0/1. The
    # model was trained on zeros only, so inlier -> 0 and outlier -> 1.
    # Comparing the raw +/-1 values with TARGET gave a 3-class confusion
    # matrix and a wrong accuracy.
    predictions = np.where(raw_predictions == -1, 1, 0)

    conf_matrix = confusion_matrix(test_Y,predictions)
    print('confusion matrix:')
    print(pd.DataFrame(conf_matrix,columns = [0,1]))

    print('accuracy:')
    print(np.mean(test_Y.reshape(predictions.shape) == predictions))
示例#12
0
 def predict_header_features(self, pkt_featurizer):
     """Score one packet's header features against the history of its packet type.

     The stored feature dicts for the packet's type are vectorised and scaled
     (without centring), a one-class SVM is fitted on them, and the packet is
     scored. Returns ``(result, distance)``: the SVM prediction and its
     decision-function value, or ``(0, 0)`` when a KeyError is raised, e.g. no
     history exists for this packet type.
     """
     group_id = pkt_featurizer.pkt_type
     features = pkt_featurizer.features
     arrival_time = pkt_featurizer.arrival_time  # read but not used below
     try:
         vectorizer = DictVectorizer()
         train_matrix = vectorizer.fit_transform(self.training_data[group_id])
         sample_matrix = vectorizer.transform(features)

         scaler = preprocessing.StandardScaler(with_mean=False)
         train_matrix = scaler.fit_transform(train_matrix)
         sample_matrix = scaler.transform(sample_matrix)

         classifier = OneClassSVM()
         classifier.fit(train_matrix)
         result = classifier.predict(sample_matrix)
         distance = classifier.decision_function(sample_matrix)
     except KeyError:
         return 0, 0
     return result, distance
示例#13
0
class TwoStage(object):
    """Two-stage classifier: a one-class SVM gates a random-forest classifier.

    Samples the SVM rejects are relabelled ``"zother"``; all others keep the
    random forest's class.
    """

    def __init__(self, *args, **kwargs):
        super(TwoStage, self).__init__(*args, **kwargs)
        self._oneCls = OneClassSVM(nu=NU, gamma=GAMMA)
        self._clf = RandomForestClassifier(n_estimators=30)
        self._scaler = StandardScaler()

    def fit(self, data, labels):
        """Fit the scaler, the novelty gate and the classifier; return ``self``."""
        sdata = self._scaler.fit_transform(data)
        self._oneCls.fit(sdata)
        self._clf.fit(sdata, labels)
        return self

    def predict(self, data):
        """Return ``(cls, classes)``.

        cls: per-sample predicted class, with ``"zother"`` where the one-class
            SVM rejects the sample.
        classes: the classifier's known classes followed by ``"zother"``.
        """
        sdata = self._scaler.transform(data)
        is_known_cls = self._oneCls.predict(sdata)
        # Object dtype so "zother" is stored intact: a fixed-width string array
        # would silently truncate it and a numeric label array would raise.
        cls = self._clf.predict(sdata).astype(object)
        cls[is_known_cls == -1] = "zother"
        classes = list(self._clf.classes_) + ["zother"]
        return cls, classes
示例#14
0
 def predict_pkt_length_features(self, pkt_featurizer):
     """Judge whether a packet's length is anomalous for its packet type.

     DBSCAN is run on the stored lengths plus the new packet; the packet is
     anomalous when it ends up in the noise cluster (label -1). A one-class
     SVM is also fitted on the scaled stored lengths, but its prediction is
     only used for the optional plot and is not returned.

     Returns the DBSCAN verdict, or 0 when a KeyError/IndexError is raised,
     e.g. no lengths are stored for this packet type.
     """
     group_id = pkt_featurizer.pkt_type
     try:
         dbscan = DBSCAN()
         pkt_lengths = np.array(list(self.pkt_lengths[group_id])+[pkt_featurizer.len_bytes]).reshape(-1,1)
         labels = dbscan.fit_predict(pkt_lengths)
         # the new packet is the last row; DBSCAN labels noise points -1
         dbscan_prediction = labels[-1] == -1
         if self.plot:
             self.plot_1d_dbscan(pkt_lengths, labels, range(len(pkt_lengths)), self.pkt_lengths_fig_dbscan, 
                                 "", "Pkt Length", "Pkt Length DBSCAN Clustering - Anomalous Pkts in Black")
         one_class_svm = OneClassSVM()
         scaler = preprocessing.StandardScaler()
         pkt_lengths_scaled = scaler.fit_transform(np.array(self.pkt_lengths[group_id]).reshape(-1,1))
         features_scaled = scaler.transform(np.array(pkt_featurizer.len_bytes).reshape(1,-1))
         one_class_svm.fit(pkt_lengths_scaled)
         svm_prediction = one_class_svm.predict(features_scaled)
         if self.plot and len(pkt_lengths_scaled) > 2:
             self.plot_1d_svm(self.pkt_lengths[group_id], one_class_svm, range(len(self.pkt_lengths[group_id])), scaler, self.pkt_lengths_fig_svm,  
                              "Pkt", "Pkt Length", "Pkt Length One Class SVM Classification")
     except (KeyError, IndexError) as e:
         # function-call form works on both Python 2 and Python 3
         print(e)
         dbscan_prediction = 0
     return dbscan_prediction
if __name__ == '__main__':

	############### OUTLIER DETECTION ###############
	if outlier_detection:
		# Alternative input: "D:/Kaggle/HumanVRobot/train_humans_ef_38f.csv"
		humans_data = load_data_from_csv("D:/Kaggle/HumanVRobot/train_humans_ef_21f_selrlr.csv", train = True)

		# Discard category information because it is a sparse matrix and only consider top 28 features
		bidder_ids, features = extract_features_for_anomaly_det(humans_data)

		# Previously tried: nu = 0.0025, gamma = 0.0001
		clf = OneClassSVM(nu = 0.0005, gamma = 0.0033)
		clf.fit(features)
		pred = np.array(clf.predict(features))

		if manual_handcode == False:
			# rows the SVM labels -1 are treated as anomalous bidders
			flagged = [row for row, label in enumerate(pred) if label == -1]
			num_outliers = len(flagged)
			outlier_idx = [[row] for row in flagged]
			anomaly_bidders = [bidder_ids[row] for row in flagged]
		else:
			print ("WARNING: Handcoding anomaly indices!")
			num_outliers = 0
			# These look bot-ish by manual inspection
			outlier_idx = [1079, 1807, 184, 564, 1228, 1497]
			anomaly_bidders = [bidder_ids[row] for row in outlier_idx]
from sklearn import preprocessing

# Standardise all sets with statistics learned from the training inliers only.
scaler = preprocessing.StandardScaler().fit(X1_train)
X1_train_n=scaler.transform(X1_train)  
X1_test_n=scaler.transform(X1_test)  
X0_outliers_n=scaler.transform(X0)


# One-class SVM fitted on the scaled training set.
clf=OneClassSVM(gamma='auto', nu=0.1)

clf.fit(X1_train_n)



# Predictions for the training set, the test set and the outlier set.
Y1_pred_train=clf.predict(X1_train_n)
Y1_pred_test=clf.predict(X1_test_n)
Y0_pred_outliers=clf.predict(X0_outliers_n)


# EVALUATION

# TRAIN SET

# confusion matrix

confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)

# Draw the confusion matrix as a small shaded grid.
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
示例#17
0
from sklearn.svm import OneClassSVM

# Fit two linear one-class SVMs, one on the positive (evicted) set and one on
# the negative set, then score both on the full training set.
if __name__ == '__main__':

    dataset_pos = data.load_pos_eviction()
    dataset_neg = data.load_neg_eviction()
    dataset_all = data.load_eviction()

    # nu: upper bound on the fraction of training points treated as outliers
    # (roughly the proportion of outliers we expect in our data).
    model_pos = OneClassSVM(kernel='linear', nu=0.9)
    model_pos.fit(dataset_pos.X_train)

    model_neg = OneClassSVM(kernel='linear', nu=0.1)
    model_neg.fit(dataset_neg.X_train)

    predictions_pos = model_pos.predict(dataset_all.X_train)
    predictions_neg = model_neg.predict(dataset_all.X_train)

    # +1 is inlier, -1 is outlier. We want those who are evicted to be 1
    # and those who are not evicted to be 0.

    # Negative model: its outliers are the evicted cases, so map -1 to 1.
    predictions_neg = (predictions_neg == -1).astype(int)

    # Positive model: its inliers are the evicted cases, so map +1 to 1.
    predictions_pos = (predictions_pos == 1).astype(int)

    # Print results and mean squared error.
    utils.evaluate(dataset_all.y_train, predictions_pos,
                   model_pos.__class__.__name__)
    utils.evaluate(dataset_all.y_train, predictions_neg,
# Implement k-fold cross validation: train on normal data only, test on the
# held-out normal fold plus all anomaly data.
kf = KFold(n_splits=kFold, shuffle=True)
for trainIndex, testIndex in kf.split(xNormalData):
    #Training data (normal) for every k
    xTrain = xNormalData[trainIndex]
    #Test data (normal and all anomaly data) for every k
    xTest = np.concatenate((xNormalData[testIndex], xAnomalyData), axis=0)
    # Labels follow the one-class SVM convention: +1 for normal, -1 for anomaly.
    yTest = np.concatenate(
        (np.zeros(np.size(xNormalData[testIndex], axis=0)) + 1,
         -1 * np.ones(np.size(xAnomalyData, axis=0))),
        axis=0)
    #Create Support Vector Machines model
    svm = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    svm.fit(xTrain)
    #Make predictions
    predictions = svm.predict(xTest)
    #Calculate metrics for every k
    # NOTE(review): recall/precision/f1 are called without pos_label, so they
    # presumably score the +1 (normal) class - confirm that is intended.
    accuracy = metrics.accuracy_score(yTest, predictions)
    recall = metrics.recall_score(yTest, predictions)
    precision = metrics.precision_score(yTest, predictions)
    f1Score = metrics.f1_score(yTest, predictions)
    # Running sums for the overall metrics
    accuracies += accuracy
    recalls += recall
    precisions += precision
    f1Scores += f1Score
    kIndex += 1
    #Print metrics for every k
    print(str(kIndex) + " Fold Iteration:")
    print("Accuracy: " + str(accuracy * 100) + "%")
    print("Recall: " + str(recall * 100) + "%")
示例#19
0
# Drop columns that are not used as features from the training data.
del train_data['exercise']
del train_data['minute']
del train_data['second']
# print(train_data)

## print(test_data)
# Remove the same non-feature columns from the testing data
# (bp_systolic / bp_diastolic are deliberately kept):
# del test_data['bp_systolic']
# del test_data['bp_diastolic']
del test_data['drink_coffee']
del test_data['eating']
del test_data['sleeping']
del test_data['exercise']
del test_data['minute']
del test_data['second']
print(test_data)

# Fit a default one-class SVM on the training frame and score the test frame.
clf = OneClassSVM()
output_training = clf.fit(train_data)

y_pred = clf.predict(test_data)
# print(y_pred)

# Count and print the test rows with a positive prediction (+1, i.e. inliers).
i = 0
for idx, data in enumerate(y_pred):
    if data > 0:
        print(idx, data)
        # NOTE(review): idx is a position in test_data's predictions but the
        # row is read from train_data - presumably test_data was intended; confirm.
        print(train_data.iloc[[idx]])
        i += 1

print(i)
示例#20
0
# Fit the grid search on the training data and report its score.
gscv.fit(X_train, y_train)
print_gscv_score(gscv)

# Regression score on the training data itself.
y_pred = gscv.predict(X_train)
print('train data: ', end="")
print_score_rgr(y_train, y_pred)
# visualize
fig = yyplot(y_train, y_pred)

#%%
# Novelty detection by One Class SVM, reusing the kernel and gamma that the
# grid search selected for the regression model
clf = OneClassSVM(nu=0.003,
                  kernel=gscv.best_params_['model__kernel'],
                  gamma=gscv.best_params_['model__gamma'])
clf.fit(X_train)
reliability1 = clf.predict(X_test)  # outliers = -1

# Novelty detection by One Class SVM with gamma chosen by optimize_gamma
# (same kernel as above)
optgamma = optimize_gamma(X_train, range_g)
clf = OneClassSVM(nu=0.003,
                  kernel=gscv.best_params_['model__kernel'],
                  gamma=optgamma)
clf.fit(X_train)
reliability2 = clf.predict(X_test)  # outliers = -1

print("gamma1, 2 = ", gscv.best_params_['model__gamma'], optgamma)

y_pred = gscv.predict(X_test)  # predicted y

# Collected below, one entry per test sample.
data = []
for i in range(len(X_test)):
示例#21
0
def remove_outliers(features, max_fraction=0.1, min_fraction=0.25, verbose=False):
	"""
	Drop subjects that a one-class SVM repeatedly marks as outliers.

	The approach is unsupervised, so a grid of nu/gamma settings is swept. A
	setting only gets a vote if it marks fewer than ``max_fraction`` of all
	subjects as outliers; settings that reject, say, 90% of the data are not
	realistic. Among the voting settings, each subject's outlier verdicts are
	counted, and a subject is removed when it was marked in at least
	``min_fraction`` of them.
	:param features: Feature table indexed by subject; modified in place.
	:param max_fraction: Upper bound on the outlier fraction of a usable setting
	:param min_fraction: Lower bound on the share of usable settings that must flag a subject
	:param verbose: Verbosity.
	:return: Filtered feature set
	"""
	X, y = util.get_xy(
		features,
		target_column='diagnosis',
		exclude_columns=['age', 'gender', 'diagnosis'])

	votes = {}
	usable_settings = 0
	gamma_grid = [2**x for x in range(-15, 4, 2)]

	for nu in np.linspace(0.01, 1.0, num=20):
		for gamma in gamma_grid:
			# Fit and score on the same data
			detector = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
			detector.fit(X)
			y_pred = detector.predict(X)

			flagged = [row for row, label in enumerate(y_pred) if label == -1]
			fraction = float(len(flagged)) / len(y_pred)

			# Settings that reject too much of the data get no vote
			if not fraction < max_fraction:
				continue

			usable_settings += 1
			for row in flagged:
				subject = features.index[row]
				votes[subject] = votes.get(subject, 0) + 1

	# Subjects flagged often enough across the usable settings
	outliers = [
		subject for subject in votes
		if votes[subject] / float(usable_settings) >= min_fraction
	]

	# Remove outlying subjects
	if verbose:
		print('Removing {} outliers...'.format(len(outliers)))
	features.drop(outliers, axis=0, inplace=True)

	return features
# Apply the three previously fitted transformers to x.
standard_x = standard_scaler.transform(x)
minmax_x = minmax_scaler.transform(x)
pca_x = pca_scaler.transform(x)


# ## One-class SVM - label each standardized sample as inlier (+1) or outlier (-1)

# In[31]:


from sklearn.svm import OneClassSVM


# Fit on the standardized data and predict on the same data.
svm_clf = OneClassSVM(gamma='auto',nu = 0.25).fit(standard_x)
y_pred =svm_clf.predict(standard_x)


# In[32]:


# Check the result using PCA: project the standardized data to 2 components
from mpl_toolkits.mplot3d import Axes3D

pca = PCA(n_components=2)
pca.fit(standard_x)  
x_pca = pca.transform(standard_x)

# Second 2-component PCA; not fitted in the visible code.
pca_cluster_center = PCA(n_components=2)

#2D plot
# Imports
from sklearn.svm import OneClassSVM  # one-class SVM estimator
import numpy as np  # numerical arrays
import matplotlib.pyplot as plt  # plotting
from mpl_toolkits.mplot3d import Axes3D  # 3D axes support

# Data preparation
raw_data = np.loadtxt('outlier.txt', delimiter=' ')  # read the data
train_set = raw_data[:900, :]  # training set: first 900 rows
test_set = raw_data[900:, :]  # test set: remaining rows

# Outlier detection
# random_state is not passed: the one-class solver is deterministic and current
# scikit-learn releases no longer accept the argument.
model_onecalsssvm = OneClassSVM(nu=0.1, kernel="rbf")  # create the detector
model_onecalsssvm.fit(train_set)  # train the model
pre_test_outliers = model_onecalsssvm.predict(test_set)  # +1 inlier / -1 outlier

# Outlier statistics
toal_test_data = np.hstack(
    (test_set, pre_test_outliers.reshape(test_set.shape[0], 1)))  # test set with the prediction as last column
normal_test_data = toal_test_data[toal_test_data[:, -1] == 1]  # rows predicted as normal
outlier_test_data = toal_test_data[toal_test_data[:, -1] == -1]  # rows predicted as outliers
n_test_outliers = outlier_test_data.shape[0]  # number of detected outliers
total_count_test = toal_test_data.shape[0]  # number of test samples
print('outliers: {0}/{1}'.format(n_test_outliers,
                                 total_count_test))  # report the outlier count
print('{:*^60}'.format(' all result data (limit 5) '))  # print a title
print(toal_test_data[:5])  # print the first 5 merged rows

# Visualisation of the detection results
示例#24
0
 def run(self, x, knownFeatures):
     """Fit a one-class SVM on the known feature columns and score every column.

     Each column of ``x`` is treated as one sample. The per-column prediction
     is stored in ``self.selected_features``.
     """
     known_columns = x[:, knownFeatures]
     trainSet = known_columns.T
     print(trainSet.shape)
     detector = OneClassSVM()
     detector.fit(trainSet)
     all_columns = x.T
     self.selected_features = detector.predict(all_columns)
示例#25
0
def eval(cfg, model, train_dataset, test_dataset, criterion, publisher="test"):
    """Evaluate an auto-encoder as a one-class classifier on its global features.

    The encoder's global features of ``train_dataset`` are used to fit a
    one-class SVM, which then labels the global features of ``test_dataset``.
    The label of the first training item is taken as the "true" class; every
    other label counts as an anomaly.

    cfg: object providing ``batch_size``, ``nworkers`` and ``device``.
    model: network exposing ``eval()``, ``encoder`` and ``decoder``.
    criterion: mapping with a ``"chamfer_distance"`` callable.
    publisher: unused.

    Returns ``(acc, dist_loss)``: the accuracy in percent of the SVM labels
    against the relabelled ground truth, and the mean reconstruction loss over
    the true-class test samples.
    """
    model.eval()

    # get global features using a training dataset
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.batch_size,
                              num_workers=cfg.nworkers,
                              pin_memory=True)
    train_loader = tqdm(train_loader, ncols=100, desc="get train GF")
    train_global_features = []
    with torch.no_grad():
        for inputs, _ in train_loader:
            inputs = inputs.to(cfg.device, non_blocking=True)
            inputs = torch.transpose(
                inputs, 1,
                2)[:, :3]  # inputs.shape: Batch_size, num_channels, num_points

            # model encoder processing
            outputs, _, _ = model.encoder(inputs)

            # add a global feature to a list
            train_global_features.append(PytorchTools.t2n(outputs))

        train_global_features = np.concatenate(
            train_global_features, axis=0)  # shape (num_train_data, 1024)

    # get global features using a validation dataset
    test_loader = DataLoader(test_dataset,
                             batch_size=cfg.batch_size,
                             num_workers=cfg.nworkers,
                             pin_memory=True)
    test_loader = tqdm(test_loader, ncols=100, desc="get eval GF")
    test_global_features = []
    eval_labels = []
    loss_list = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(cfg.device, non_blocking=True)
            inputs = torch.transpose(
                inputs, 1,
                2)[:, :3]  # inputs.shape: Batch_size, num_channels, num_points

            # model encoder processing
            outputs, _, _ = model.encoder(inputs)

            # get reconstructions for loss of true data
            reconstructions = model.decoder(outputs)

            # compute loss
            inputs = torch.transpose(inputs, 1, 2)
            dist1, dist2 = criterion["chamfer_distance"](inputs,
                                                         reconstructions)
            dist1 = np.mean(PytorchTools.t2n(dist1), axis=1)
            dist2 = np.mean(PytorchTools.t2n(dist2), axis=1)
            dist_loss = dist1 + dist2

            # add dist_losses to a list
            loss_list.append(dist_loss)

            # add a global feature to a list
            test_global_features.append(PytorchTools.t2n(outputs))

            # get eval labels
            eval_labels.append(targets)

        test_global_features = np.concatenate(
            test_global_features, axis=0)  # shape (num_eval_data, 1024)
        eval_labels = np.squeeze(np.concatenate(eval_labels, axis=0),
                                 axis=-1)  # shape (num_data)
        loss_list = np.concatenate(loss_list, axis=0)

    # use one class classification
    classifier = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
    classifier.fit(train_global_features)
    pred_labels = classifier.predict(test_global_features)

    # get training data label
    _, true_label = train_dataset[0]
    # convert eval labels other than true labels to -1
    eval_labels[eval_labels != true_label] = -1
    # convert true labels to 1
    eval_labels[eval_labels == true_label] = 1

    # get loss of true data: select with a boolean mask. Indexing with the
    # +/-1 label array itself would be integer fancy indexing and would only
    # ever read elements 1 and -1 of loss_list.
    dist_loss = np.mean(loss_list[eval_labels == 1]).item()
    # get a accuracy
    acc = np.mean(pred_labels == eval_labels).item() * 100

    return acc, dist_loss
def remove_outliers(data):
    """Return the rows of ``data`` that a one-class SVM fitted on ``data`` keeps as inliers.

    The number of removed rows is logged at INFO level.
    """
    clf = OneClassSVM(nu=0.2, kernel="rbf", gamma=0.00001)
    clf.fit(data)
    # Predict once and reuse the labels for both the log line and the filter.
    labels = clf.predict(data)
    logging.info("%s outliers removed from %s elements", (labels == -1).sum(), len(data))
    return data[labels == 1]
示例#27
0
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import OneClassSVM
from sklearn.svm import NuSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# Load the iris data and split off 10 test rows.
df = pd.read_csv('iris.csv', sep=',')
features = list(df.columns[:4])
X = df.drop('variety', axis=1)
y = df['variety']
classifier = OneClassSVM(gamma=1.1, kernel='linear')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10)
# The one-class SVM is unsupervised; y_train is accepted by fit() but ignored.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# NOTE(review): y_pred holds +1/-1 while y_test holds the 'variety' labels, so
# this accuracy does not measure classification quality - confirm the intent.
accuracy_line = "accuracy_score= " + str(accuracy_score(y_test, y_pred))
print(accuracy_line, flush=True)
# The result file is opened only once the score exists, and is closed even if
# the write fails.
with open("generated/result8.txt", "w") as text_file:
    text_file.write(accuracy_line)
示例#28
0
文件: detector.py 项目: mesquita/mafa
class OSVM(TransformerMixin):
    """
    One-class SVM used for outlier and novelty detection. Wrapper for
    sklearn implementation of Scholkopf2000.
    """

    def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 tol=0.001, nu=0.5, shrinking=True, cache_size=200,
                 verbose=False, max_iter=-1, random_state=None):
        """
        Inits OSVM.

        @param kernel Kernel type. String ['linear', 'poly', 'rbf', 'sigmoid'].
        @param degree Polynomial kernel degree. Integer.
        @param gamma Kernel coefficient.
        @param coef0 Independent term in kernel function. Scalar.
        @param tol Tolerance for stopping criterion. Scalar float.
        @param nu Error upper bound and SV upper bound. Scalar [0,1].
        @param shrinking Whether to use the shrinking heuristic. Boolean.
        @param cache_size Specify the size of the kernel cache (in MB).
        @param verbose Enable verbose output. Boolean.
        @param max_iter Hard limit on iterations within solver. -1 for no limit.
        @param random_state Random seed. Only forwarded when not None.
        """

        # Pass everything by keyword: newer scikit-learn releases reject
        # positional constructor arguments.
        svm_params = dict(kernel=kernel, degree=degree, gamma=gamma,
                          coef0=coef0, tol=tol, nu=nu, shrinking=shrinking,
                          cache_size=cache_size, verbose=verbose,
                          max_iter=max_iter)

        # Forward the seed only when the caller set one, so the default
        # construction also works on scikit-learn releases whose OneClassSVM
        # no longer accepts random_state.
        if random_state is not None:
            svm_params['random_state'] = random_state

        # Setting parameters for classifier
        self.__mdl = OneClassSVM(**svm_params)

    def fit(self, X, y=None, w=None):
        """
        Detects the soft boundary of the set of samples X.

        @param X Input matrix [n_samples, n_features].
        @param y Labels vector [n_samples].
        @param w Per-sample weights [n_samples].

        @return self
        """

        # Fit classifier
        self.__mdl = self.__mdl.fit(X, y=y, sample_weight=w)

        # Return self for sklearn API
        return self

    def predict(self, X):
        """
        Estimates input data class (normal, novelty or outlier)

        @param X Input matrix [n_samples, n_features].

        @return Data labels (+1 or -1).
        """

        return self.__mdl.predict(X)

    def transform(self, X):
        """
        Returns the data class given the detection model.

        @param X Input matrix [n_samples, n_features].

        @return X with the detection labels appended as one extra column
        [n_samples, n_features + 1].
        """

        # Computing labels
        labels = self.predict(X)

        # predict() yields a 1-D vector; column_stack turns it into a column
        # so it can be appended to the 2-D input (hstack would raise on the
        # dimension mismatch).
        return np.column_stack((X, labels))

    def save(self, path):
        """
        Saves current model.

        @param path File path.
        """
        # Pickle streams are bytes, so the file must be opened in binary mode.
        with open(path, 'wb') as fp:

            # Saving on disk
            pickle.dump(self.__dict__, fp, 2)

    def load(self, path):
        """
        Loads a saved model.

        @param path File path. Must point to a trusted file: unpickling can
        execute arbitrary code.
        """
        # Binary mode to match save().
        with open(path, 'rb') as fp:

            # Loading from disk
            tmp_dict = pickle.load(fp)
            self.__dict__.update(tmp_dict)
示例#29
0
	def oneClass(self):
		"""Fit a default OneClassSVM on ``self.arr`` and predict on the same data.

		The predicted labels are neither stored nor returned.
		"""
		OneClassSVM().fit(self.arr).predict(self.arr)
示例#30
0
# Finish the axis setup of the 3-D figure created earlier in the script.
ax.set_zlabel( 'Similarity of Neighboring Districts' )
ax.set_zlim( [ 0., 1. ] )
ax.set_xlim( [ 0., 500. ] )
ax.set_ylim( [ 0., 1. ] )

fig.show()

angles = np.linspace(0,360,41)[:-1] # 40 evenly spaced angles in [0, 360)
rotanimate(ax, angles,'movie.gif',delay=20, width = 6., height = 5.) 

# do outlier search using one-class SVM
# NOTE(review): this standardises row 0 only; the indexing below treats rows
# as samples and columns as features, so data[:, 0] may have been intended --
# confirm.
data[ 0, : ] = preprocessing.scale( data[ 0, : ] )

model = OneClassSVM( gamma = .001, nu = .1 )
# `fit` is not read again in this snippet.
fit = model.fit( data )
preds = model.predict( data )

# Row indices of the samples labelled +1 and -1 respectively.
inlier = np.where( preds == 1. )[ 0 ]
outlier = np.where( preds == -1. )[ 0 ]

# New 3-D scatter: +1 samples in blue, -1 samples in black.
fig = plt.figure()
ax = fig.add_subplot( 111, projection = '3d' )
ax.scatter( data[ inlier, 0 ], data[ inlier, 1 ], data[ inlier, 2 ], c = 'b' )
ax.scatter( data[ outlier, 0 ], data[ outlier, 1 ], data[ outlier, 2 ], c = 'k' )
ax.set_xlabel( '$P^2/A$' )
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )

ax.set_ylim( [0., 1 ] )
ax.set_zlim( [ 0., 1. ] )
示例#31
0
# Transform the raw sequences X0 with a FirstSlicer
# (presumably keeps the first two features of each sequence -- TODO confirm).
slicer = featurizer.FirstSlicer(2)
X = slicer.transform(X0)

# Stack all sequences into one array, then keep every 50th row for fitting.
Xf0 = np.concatenate(X)
Xf = Xf0[::50]



# Log-scaled hexbin of the first two columns of the full (unsampled) data.
hexbin(Xf0[:, 0], Xf0[:, 1], bins='log')

# Fit the one-class SVM on the subsample only.
svm = OneClassSVM(nu=0.15)
svm.fit(Xf)


y = svm.predict(Xf)

# Overlay the subsample: rows labelled +1 as black crosses, -1 as white.
plot(Xf[y==1][:, 0], Xf[y==1][:, 1], 'kx')
plot(Xf[y==-1][:, 0], Xf[y==-1][:, 1], 'wx')

# `clusterer` is created but not used in this snippet.
clusterer = cluster.GMM(n_components=3)


# Per-sequence predictions; on Python 3 `map` is lazy, so nothing is computed
# until `yi` is iterated.
yi = map(lambda x: svm.predict(x), X)


from msmbuilder.cluster import MultiSequenceClusterMixin, BaseEstimator
from sklearn.svm import OneClassSVM

class OneClassSVMTrimmer(MultiSequenceClusterMixin, OneClassSVM, BaseEstimator):
    def partial_transform(self, traj):
def classifier(data):
    from sklearn.covariance import EllipticEnvelope
    from sklearn.svm import OneClassSVM
    from sklearn.datasets import load_boston
    from sklearn import preprocessing
    # Get data

    # Define "classifiers" to be used
    legend1 = {}
    legend2 = {}
    evaluation = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data] 
    X = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data]  
    X = preprocessing.scale(X)
    evaluation = preprocessing.scale(evaluation)
    # Learn a frontier for outlier detection with several classifiers
    sample = random.sample(X, 20000)
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(evaluation, 2000)
    print >> sys.stderr, "fitting data"    
    clf.fit(sample)
    print >> sys.stderr, "predicting data"
    Y = clf.predict(test)
    print >> sys.stderr, "plotting data"
    fig, axes = subplots()
    
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][2], test[i][1], c=color)
    #ylim([50,2000]) #num exons
    ylabel("distance")
    #xlim([3,10])
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")

    fig, axes = subplots()
    """
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][1], test[i][0], c=color)
    #xlim([0,10]) #num exons
    xlabel("number of exons")
    #ylim([3,15])
    ylabel("coverage")
    savefig("ExonsvsCoverage.pdf")
    """
    full_test = clf.predict(evaluation)
    novel, regular = [],[]
    for i in range(len(full_test)):
        result = full_test[i]
        if result == -1:
            print data[i]["id"]
            novel.append(data[i]["num_exons"])
        else:
            regular.append(data[i]["num_exons"])
    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    print >> sys.stderr, "novel, regular"
    print >> sys.stderr, len(novel), len(regular)
    print >> sys.stderr, mean(multi_exon_novel), mean(multi_exon_regular), len(multi_exon_novel), len(multi_exon_regular)
def main():
    """Tune and evaluate a one-class SVM with self-adaptive-shifting data.

    Steps:
      1. Parse arguments, seed ``random`` and load ``X, y`` (optionally
         min-max scaled).
      2. Treat the first unique label as the target class and the second as
         the outlier class; hold out one third of the target samples.
      3. Generate pseudo outliers / pseudo targets from the target training
         split and grid-search ``gamma`` and ``nu`` by the mean error on
         those pseudo sets.
      4. Print f1 / MCC / accuracy for the tuned model and for a model with
         default hyper-parameters, then optionally visualise.
    """
    args = parse_arguments()
    random.seed(args.seed)
    X, y = load_data(args)
    if args.scale:
        X = MinMaxScaler().fit_transform(X)

    y_value = np.unique(y)

    f_index = np.where(y == y_value[0])[0]
    s_index = np.where(y == y_value[1])[0]

    target_X, target_y = X[f_index], np.ones(len(f_index))
    outlier_X, outlier_y = X[s_index], -np.ones(len(s_index))
    target_X_train, target_X_test, target_y_train, target_y_test = train_test_split(
        target_X, target_y, shuffle=True, random_state=args.seed, test_size=1/3)

    self_adaptive_shifting = SelfAdaptiveShifting(target_X_train)
    self_adaptive_shifting.edge_pattern_detection(args.threshold)
    pseudo_outlier_X = self_adaptive_shifting.generate_pseudo_outliers()
    pseudo_target_X = self_adaptive_shifting.generate_pseudo_targets()
    pseudo_outlier_y = -np.ones(len(pseudo_outlier_X))
    pseudo_target_y = np.ones(len(pseudo_target_X))

    def evaluate(model):
        """Return (f1, mcc, acc) on held-out targets plus all true outliers."""
        y_pred = np.concatenate((model.predict(target_X_test),
                                 model.predict(outlier_X)))
        y_true = np.concatenate((target_y_test, outlier_y))
        return (f1_score(y_true, y_pred, average="binary"),
                matthews_corrcoef(y_true, y_pred),
                accuracy_score(y_true, y_pred))

    # 1 / n_features doubles as a gamma candidate and as the fallback value.
    default_gamma = 1/np.size(target_X, -1)
    gamma_candidates = [1e-4, 1e-3, 1e-2, 1e-1, 1e-0, 1e+1, 1e+2, 1e+3, default_gamma]
    nu_candidates = [0.005, 0.01, 0.05, 0.1, 0.5]

    best_err = 1.0
    best_gamma, best_nu = default_gamma, 0.5
    for gamma in tqdm(gamma_candidates):
        for nu in tqdm(nu_candidates):
            # Search with the same kernel the final model uses; otherwise the
            # selected gamma/nu are tuned for a different model.
            model = OneClassSVM(kernel=args.kernel, gamma=gamma, nu=nu).fit(target_X_train)
            err_o = 1 - np.mean(model.predict(pseudo_outlier_X) == pseudo_outlier_y)
            err_t = 1 - np.mean(model.predict(pseudo_target_X) == pseudo_target_y)
            err = float((err_o + err_t) / 2)
            if err < best_err:
                best_err = err
                best_gamma = gamma
                best_nu = nu

    best_model = OneClassSVM(kernel=args.kernel, gamma=best_gamma, nu=best_nu).fit(target_X_train)
    f1, mcc, acc = evaluate(best_model)
    print("\n[%s] (gamma: %.4f, nu: %.4f, err: %.4f) \nf1-score: %.4f, mcc: %.4f, acc: %.4f" % (args.data, best_gamma, best_nu, best_err, f1, mcc, acc))

    # Baseline: same kernel, library-default gamma and nu.
    model = OneClassSVM(kernel=args.kernel).fit(target_X_train)
    f1, mcc, acc = evaluate(model)
    print("\n[%s] (default setting) \nf1-score: %.4f, mcc: %.4f, acc: %.4f" % (args.data, f1, mcc, acc))

    if args.visualize:
        self_adaptive_shifting.visualize()
示例#34
0
# Hard-coded local path of the CSV; the script reads its 'Date' and 'Value'
# columns.
url = "C:/Users/Βασίλης/IdeaProjects/MyThesisApp/Data sets/Total_Vehicle_Sales.csv"
dataset = pd.read_csv(url)

# Passed as `nu` to the one-class SVM below.
outliers_fraction = 0.05

# Standardise the single 'Value' column and wrap it back into a DataFrame.
data = dataset[['Value']]
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# Train the one-class SVM on the standardised values.
model = OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma=0.01)

model.fit(data)

# Store the predicted labels next to the original rows. The Series carries a
# fresh 0..n-1 index, so this assignment aligns on `dataset`'s index.
dataset['anomaly'] = pd.Series(model.predict(data))

print(dataset)

a = dataset.loc[dataset['anomaly'] == -1, ['Date', 'Value']]  # rows labelled -1

# Plot the whole series in blue and overlay the rows labelled -1 in red.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(dataset['Date'], dataset['Value'], color='blue')
ax.scatter(a['Date'],
           a['Value'],
           color='red',
           label='Anomaly Detection OneClassSVM')
plt.show()

# original = []
# anomalies = []
示例#35
0
class OutlierRemover():
    """Remove outlying rows from a feature frame ``X`` and its target ``y``.

    strategy:
        z_score
        inter_quartile_range
        isolation_forest
        elliptic_envelope
        local_outlier_factor
        one_class_svm
    params:
        Isolation Forest:
            n_estimators
        EllipticEnvelope:
            contamination
        LocalOutlierFactor:
            n_neighbors
        OneClassSVM
            kernel
            degree
            gamma

    ``X`` is expected to be a pandas DataFrame and ``y`` a pandas Series with
    the same number of rows; rows are selected by position, so any index
    (not only a default ``RangeIndex``) is supported.
    """
    def __init__(self, strategy, **params):
        """Validate ``strategy`` and build the detector it needs, if any.

        Raises ValueError for an unknown strategy name.
        """
        self.all_strategies = [
            'z_score', 'inter_quartile_range', 'isolation_forest',
            'elliptic_envelope', 'local_outlier_factor', 'one_class_svm'
        ]

        if strategy not in self.all_strategies:
            raise ValueError(
                'Invalid Strategy... strategy can be one of the following:\n'
                + ', '.join(self.all_strategies))

        self.strategy = strategy
        self.params = params

        if strategy == 'isolation_forest':
            self.outlier_remover = IsolationForest(
                n_estimators=params.get('n_estimators', 100),
                bootstrap=True,
                random_state=19)
        elif strategy == 'elliptic_envelope':
            self.outlier_remover = EllipticEnvelope(
                contamination=params.get('contamination', 0.1),
                random_state=19)
        elif strategy == 'local_outlier_factor':
            # n_neighbors goes to n_neighbors (not contamination), and
            # novelty=True is what makes predict() available, which
            # transform() relies on. LocalOutlierFactor takes no random_state.
            self.outlier_remover = LocalOutlierFactor(
                n_neighbors=params.get('n_neighbors', 20),
                novelty=True)
        elif strategy == 'one_class_svm':
            self.outlier_remover = OneClassSVM(
                kernel=params.get('kernel', 'rbf'),
                degree=params.get('degree', 3),
                gamma=params.get('gamma', 'scale'))

    def fit(self, X, y):
        """Fit the underlying detector on ``X`` (no-op for the statistical
        strategies).

        Returns the fitted detector for model-based strategies and ``self``
        for ``z_score`` / ``inter_quartile_range``.
        """
        if self.strategy not in ['z_score', 'inter_quartile_range']:
            return self.outlier_remover.fit(X)
        return self

    def transform(self, X, y):
        """Return ``(X, y)`` restricted to the rows that are not outliers.

        z_score: keeps rows whose absolute z-score is <= 3 in every column.
        inter_quartile_range: keeps rows lying within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` in every column.
        model strategies: keeps rows the detector does not label ``-1``.
        """
        if self.strategy == 'z_score':
            # Work on a plain array so the mask is positional and cannot be
            # misaligned against X's index or column labels.
            z = np.abs(np.asarray(stats.zscore(X)))
            keep = (z <= 3).all(axis=1)
        elif self.strategy == 'inter_quartile_range':
            Q1 = X.quantile(0.25)
            Q3 = X.quantile(0.75)
            IQR = Q3 - Q1
            inside = (X >= (Q1 - 1.5 * IQR)) & (X <= (Q3 + 1.5 * IQR))
            keep = inside.all(axis=1).values
        else:
            keep = self.outlier_remover.predict(X) != -1
        # Boolean numpy masks select by position for both frame and series.
        return X.iloc[keep], y.iloc[keep]

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the filtered ``(X, y)``."""
        self.fit(X, y)
        return self.transform(X, y)
示例#36
0
def decision_tree_classify(
    data_dir: str,
    train_attack_name: str = "FGSM",
    train_transform_name: str = "noop",
    test_attack_name: str = "FGSM",
    test_transform_name: str = "noop",
):
    """Train one classifier per relation key and collect their predictions.

    For every key in ``keys`` a classifier is obtained from
    ``train_model_by_key`` and run on the original training data loaded by
    ``load_data_and_label``. The function then returns ``None``.

    NOTE(review): the bare ``return`` after the training loop makes the whole
    one-class stage below it unreachable; confirm whether that stage was
    disabled on purpose.

    :param data_dir: root directory containing the ``train`` and ``test``
        sub-directories.
    :param train_attack_name: attack name used to build the training
        adversarial directory name.
    :param train_transform_name: transform name used to build the training
        directory names.
    :param test_attack_name: attack name used to build the test adversarial
        directory name.
    :param test_transform_name: transform name used to build the test
        directory names.
    """

    train_dir = os.path.join(data_dir, f"train")
    test_dir = os.path.join(data_dir, f"test")
    train_original_dir = os.path.join(train_dir,
                                      f"original_{train_transform_name}")
    # Built but not used anywhere else in this function.
    train_adversarial_dir = os.path.join(
        train_dir, f"{train_attack_name}_{train_transform_name}")
    test_original_dir = os.path.join(test_dir,
                                     f"original_{test_transform_name}")
    test_adversarial_dir = os.path.join(
        test_dir, f"{test_attack_name}_{test_transform_name}")

    # Pre-bind the three directories; the remaining argument is the key.
    train_model_by_key_fn = partial(
        train_model_by_key,
        train_original_dir,
        test_original_dir,
        test_adversarial_dir,
    )

    keys = [
        "channel_relation",
        "channel_birelation",
        "spatial_relation",
        "weight_relation",
        "hieght_width_relation",
        "channel_weight",
    ]
    clf_list = []
    train_pred_list = []
    for key in keys:
        clf = train_model_by_key_fn(key)
        clf_list.append(clf)

        train_original_data, train_original_label, \
        train_original_model_pred = load_data_and_label(
            train_original_dir,
            training_images_per_class,
            key = key,
        )
        # The first column is the loaded model prediction, added once.
        if len(train_pred_list) == 0:
            train_pred_list.append(np.expand_dims(train_original_model_pred,
                                                  1))

        # One extra column per key: that key's classifier output.
        train_pred_list.append(
            np.expand_dims(clf.predict(train_original_data), 1))
    # Early exit: nothing below this line is executed.
    return

    # One class classification based on predictions of each classifier
    pred = np.concatenate(train_pred_list, axis=1)
    one_class_clf = OneClassSVM(
        # max_depth = 10,
        # min_samples_leaf = 20,
    )
    one_class_clf = one_class_clf.fit(pred)

    test_original_preds = []
    test_adversarial_preds = []
    for clf, key in zip(clf_list, keys):
        test_original_data, test_original_label, _ = load_data_and_label(
            test_original_dir,
            test_images_per_class,
            key=key,
        )
        test_adversarial_data, test_adversarial_label, test_adversarial_pred = load_data_and_label(
            test_adversarial_dir,
            test_images_per_class,
            key=key,
        )
        # NOTE(review): the first column here is the label, whereas the
        # training and adversarial matrices start with the model prediction
        # -- confirm this is intended.
        if len(test_original_preds) == 0:
            test_original_preds.append(np.expand_dims(test_original_label, 1))
        original_pred = clf.predict(test_original_data)
        original_pred = np.expand_dims(original_pred, 1)
        test_original_preds.append(original_pred)

        if len(test_adversarial_preds) == 0:
            test_adversarial_preds.append(
                np.expand_dims(test_adversarial_pred, 1))
        adversarial_pred = clf.predict(test_adversarial_data)
        adversarial_pred = np.expand_dims(adversarial_pred, 1)
        test_adversarial_preds.append(adversarial_pred)

    original_input = np.concatenate(test_original_preds, axis=1)
    adversarial_input = np.concatenate(test_adversarial_preds, axis=1)

    original_pred = one_class_clf.predict(original_input)
    adversarial_pred = one_class_clf.predict(adversarial_input)

    # Originals labelled +1 count as correct; adversarials labelled -1 count
    # as detected.
    acc = (original_pred == 1).sum() / len(original_pred)
    fpr = 1 - acc
    tpr = (adversarial_pred == -1).sum() / len(adversarial_pred)
    # Three-point ROC curve through (0, 0), (fpr, tpr) and (1, 1).
    roc_tpr = [0, tpr, 1]
    roc_fpr = [0, fpr, 1]
    auc = metrics.auc(roc_fpr, roc_tpr)
    print(f"One class pred tpr: {tpr:.3f}, fpr: {fpr:.3f}, auc: {auc:.3f}")
def base_experiment(config,
                    pct_noise=0.15,
                    noverlap_bits=0,
                    ntrials=10,
                    verbose=False,
                    seed=123456789):
    """Run a single experiment, locally.

    Builds a base-class dataset and a novelty-class dataset, then for each
    trial fits an SPRegion and a linear one-class SVM on the base training
    data and records their percentage errors on the base and novelty test
    sets. The four per-trial error arrays are pickled to ``results.pkl``
    inside ``config['log_dir']``.

    @param config: The configuration parameters. ``config['log_dir']`` is
    read, and ``config['seed']`` is overwritten on every trial.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.
    """
    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5

    # Build the directory, if needed
    base_dir = config['log_dir']
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in range(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        # The divisor 40 equals nbits * pct_active for the values set above.
        bctn = 1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.
        ocn = 1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.
        overlap = (np.dot(x_ds.base_class.astype('i'),
                          outlier_base.astype('i')))

        print(f"\nBase class' test noise: {bctn:2.2f}")
        print(f"Outlier's class noise: {ocn:2.2f}")
        print(f'Overlap between two classes: {overlap}')

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i, seed2 in enumerate(generate_seeds(ntrials, seed)):
        # Create the SP
        config['seed'] = seed2
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print('\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te')
            print((fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te,
                                u_sp_x_tr, u_sp_x_te, u_sp_y_te)))
            print((fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                                o_sp_x_te, o_sp_y_te)))

        # Get average representation of the base class
        # (bits active in at least half of the training outputs become 1,
        # everything else 0)
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Pair the base representation with one test sample
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness',
                      u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)

        # Print the results
        if verbose:
            print('\nDescription\tx_tr->x_te\tx_tr->y_te')
            # Both rows are f-strings built by implicit concatenation, so the
            # values are interpolated and no source indentation leaks into
            # the output.
            print(f'Uniqueness:\t{u_sp_base_to_x_te:2.4f}\t'
                  f'{u_sp_base_to_y_te:2.4f}')
            print(f'Overlap:\t{o_sp_base_to_x_te:2.4f}\t'
                  f'{o_sp_base_to_y_te:2.4f}')
        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        # (base samples should be labelled +1, novelty samples -1)
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
            100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
            100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Pair the base representation with one test sample
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print(f'\nSP Base Class Detection   : {clf_x_te:2.2f}%')
            print(f'SP Novelty Class Detection  : {clf_y_te:2.2f}%')
            print(f'SVM Base Class Detection    : {svm_x_te:2.2f}%')
            print(f'SVM Novelty Class Detection : {svm_y_te:2.2f}%')

    # Save the results
    with open(os.path.join(base_dir, 'results.pkl'), 'wb') as f:
        pickle.dump((sp_x_results, sp_y_results, svm_x_results, svm_y_results),
                    f, pickle.HIGHEST_PROTOCOL)
示例#38
0

# Train a one-class SVM with default hyper-parameters on dataTrain.
SVMModel = OneClassSVM()


# The log1p transform of the labels was dropped: it produced too many
# distinct labels to cover when checking precision, recall and f-score.
# yLabelsLog = np.log(yLabels+3)


SVMModel.fit(dataTrain)


# Predict on the held-out data with the trained one-class SVM.
preds = SVMModel.predict(X=dataTest)
# testLog = np.log(testYLabels+3)
# testLog = testLog.values.ravel()


# Accuracy and confusion matrix of the rounded predictions against the
# test labels.
aScore = accuracy_score(testYLabels, preds.round())
cMatrix = confusion_matrix(testYLabels, preds.round())

# Drop row 2 and column 2 of the confusion matrix for display
# (per the original note, the entry for the '0' value -- TODO confirm index).
cMatrixDP = np.delete(cMatrix, 2, 0)
cMatrixDP = np.delete(cMatrixDP, 2, 1)

# Per-class precision/recall from the full matrix; entry 2 is then removed
# from the precision list only.
precisionList = precision(cMatrix)
recallList = recall(cMatrix)
precisionList = np.delete(precisionList, 2)
示例#39
0
def base_experiment(pct_noise=0.15,
                    noverlap_bits=0,
                    exp_name='1-1',
                    ntrials=10,
                    verbose=True,
                    seed=123456789):
    """Run a single experiment, locally.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param exp_name: The name of the experiment.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.

    @return: A tuple containing the percentage errors for the SP's training
    and testing results and the SVM's training and testing results,
    respectively.
    """
    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    clf_th = 0.5
    log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
                           'novelty_experiments', exp_name)

    # Configure the SP
    config = {
        'ninputs': 100,
        'trim': 1e-4,
        'disable_boost': True,
        'seed': seed,
        'pct_active': None,
        'random_permanence': True,
        'pwindow': 0.5,
        'global_inhibition': True,
        'ncolumns': 200,
        'nactive': 50,
        'nsynapses': 75,
        'seg_th': 15,
        'syn_th': 0.5,
        'pinc': 0.001,
        'pdec': 0.001,
        'nepochs': 10,
        'log_dir': log_dir
    }

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in range(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:  # copied from novelty_detection_slurm.py
        bctn = 1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.
        ocn = 1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.
        overlap = (np.dot(x_ds.base_class.astype('i'),
                          outlier_base.astype('i')))

        print(f"\nBase class' test noise: {bctn:2.2f}")
        print(f"Outlier's class noise: {ocn:2.2f}")
        print(f'Overlap between two classes: {overlap}')

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    c_x_tr = 1 - metrics.compute_distance(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    c_x_te = 1 - metrics.compute_distance(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)
    c_y_te = 1 - metrics.compute_distance(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i in range(ntrials):
        # Make a new seed
        seed2 = np.random.randint(1000000)
        config['seed'] = seed2
        config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)

        # Create the SP
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)
        c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Train Correlation', c_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Base Class Test Correlation', c_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('Input Novelty Class Test Correlation', c_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
        sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{5:2.4f}'
        if verbose:
            print('\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te')
            print((fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te,
                                u_sp_x_tr, u_sp_x_te, u_sp_y_te)))
            print((fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                                o_sp_x_te, o_sp_y_te)))
            print((fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te,
                                c_sp_x_tr, c_sp_x_te, c_sp_y_te)))

        # Get average representation of the base class
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        c_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        c_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
            c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        c_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest
        c_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness',
                      u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Correlation',
                      c_sp_base_to_y_te)

        # Print the results
        if verbose:
            print('\nDescription\tx_tr->x_te\tx_tr->y_te')
            print(f'Uniqueness:\t\
                {u_sp_base_to_x_te:2.4f}\t{u_sp_base_to_y_te:2.4f}')
            print('Overlap:\t\
                {o_sp_base_to_x_te:2.4f}\t{o_sp_base_to_y_te:2.4f}')
            print('Correlation:\t\
                {c_sp_base_to_x_te:2.4f}\t{c_sp_base_to_y_te:2.4f}')

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
            100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
            100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th:
                clf_x_te += 1
            if yo < clf_th:
                clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print(f'\nSP Base Class Detection   : {clf_x_te:2.2f}%')
            print(f'SP Novelty Class Detection  : {clf_y_te:2.2f}%')
            print(f'SVM Base Class Detection    : {svm_x_te:2.2f}%')
            print(f'SVM Novelty Class Detection : {svm_y_te:2.2f}%')

    return sp_x_results, sp_y_results, svm_x_results, svm_y_results
Users_26JS_pca = pca.fit_transform(Users_26JS)
Users_26JS_pca_nor = Normalizer().fit_transform(Users_26JS_pca)
print 'PCA与归一化完成...\n'
X_train_lst = []
X_test_lst = []
for index_2 in X_train:
    X_train_lst.append(Users_26JS_pca_nor[index_2])
for index_3 in Users_not_All_Jobs_index:
    X_test_lst.append(Users_26JS_pca_nor[index_3])
X_train_array= np.array(X_train_lst)
X_test_array = np.array(X_test_lst)

print 'OCSVM开始训练...\n'
clf = OneClassSVM(kernel='rbf', tol=0.01, nu=0.5, gamma='auto')
clf.fit(X_train_array)
pred = clf.predict(X_test_array)

print '开始输出分类结果...\n'
# 考虑标题行在内
# ACM2278:line 2841;
# CMP2946:line 2331;
# PLJ1771:line 1283;
# CDE1846:line 656;
# MBG3183:line 1495;
# print 'ACM2278 is ', clf.predict(Users_26JS_pca_nor[2839]), '\t', clf.decision_function(Users_26JS_pca_nor[2839]), '\n'
# print 'CMP2946 is ', clf.predict(Users_26JS_pca_nor[2329]), '\n', clf.decision_function(Users_26JS_pca_nor[2329]), '\n'
# print 'PLJ1771 is ', clf.predict(Users_26JS_pca_nor[1281]), '\t', clf.decision_function(Users_26JS_pca_nor[1281]), '\n'
# print 'CDE1846 is ', clf.predict(Users_26JS_pca_nor[654]), '\n',  clf.decision_function(Users_26JS_pca_nor[654]), '\n'
# print 'MBG3183 is ', clf.predict(Users_26JS_pca_nor[1493]), '\n', clf.decision_function(Users_26JS_pca_nor[1493]), '\n'

# Learn the standardisation from the training set only, then apply the same
# transform to the training, cross-validation and bot sets (in that order).
scaler = preprocessing.StandardScaler()
scaler.fit(tr_data)

tr_data, cv_data, bot_data = (
    scaler.transform(part) for part in (tr_data, cv_data, bot_data))

# Hard-coded for testing; will change.
gt_data = [1] * 23

# Outlier detection code using multi variate gaussian

# mu, sigma = estimateGaussian(tr_data)
# p = multivariateGaussian(tr_data,mu,sigma)
# p_cv = multivariateGaussian(cv_data,mu,sigma)
# fscore, ep = selectThresholdByCV(p_cv,gt_data)
# mu, sigma = estimateGaussian(bot_data)
# p_bot = multivariateGaussian(bot_data,mu,sigma)

# outliers = p_bot < ep

# print (outliers)
# <codecell>

# Novelty detection using a one-class SVM fitted on the cross-validation set.
# NOTE(review): the labels are passed to fit(); scikit-learn documents the y
# argument of OneClassSVM.fit as ignored -- confirm this is intended.
outlierDetector = OneClassSVM().fit(cv_data, gt_data)

bot_preds = outlierDetector.predict(bot_data)

print(bot_preds)
示例#42
0
def base_experiment(config, pct_noise=0.15, noverlap_bits=0, ntrials=10,
	verbose=False, seed=123456789):
	"""
	Run a single novelty-detection experiment, locally.
	
	A base-class dataset and a novelty-class dataset are built with SPDataset.
	For each trial an SPRegion is fit on the base-class training split and
	compared against a linear OneClassSVM fit on the same data. The per-trial
	error percentages of both detectors are pickled to 'results.pkl' inside
	config['log_dir']; the function itself returns nothing.
	
	Side effects: creates config['log_dir'] if it does not exist, seeds
	numpy's global random state, and overwrites config['seed'] on every
	trial.
	
	@param config: The configuration parameters. Must contain 'log_dir'; the
	whole dictionary is unpacked into SPRegion as keyword arguments.
	
	@param pct_noise: The percentage of noise to add to the dataset.
	
	@param noverlap_bits: The number of bits the base class should overlap
	with the novelty class.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param verbose: If True print the results.
	
	@param seed: The random seed to use.
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
	clf_th = 0.5
	
	# Build the directory, if needed
	base_dir = config['log_dir']
	if not os.path.exists(base_dir): os.makedirs(base_dir)
	
	# Seed numpy
	np.random.seed(seed)
	
	# Create the base dataset
	x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
	x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]
	
	# Create the outlier dataset
	# The outlier's base pattern takes (nactive - noverlap_bits) bits that are
	# not set in the base class, plus noverlap_bits bits that are.
	base_indexes = set(np.where(x_ds.base_class == 1)[0])
	choices = [x for x in xrange(nbits) if x not in base_indexes]
	outlier_base = np.zeros(nbits, dtype='bool')
	outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
		False)] = 1
	outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
	y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
	y_te = y_ds.data
	
	# NOTE(review): the divisor 40. below presumably equals nbits * pct_active
	# (100 * 0.4), i.e. the number of active bits -- confirm before changing
	# either base parameter.
	if verbose:
		print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
			* x_ds.base_class.astype('i')).sum() / 40.)
		print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
			outlier_base.astype('i')).sum() / 40.)
		print 'Overlap between two classes: {0}'.format(np.dot(
			x_ds.base_class.astype('i'), outlier_base.astype('i')))
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	u_y_te = metrics.compute_uniqueness(y_te)
	o_y_te = metrics.compute_overlap(y_te)
	
	# Initialize the overall results
	# One error percentage per trial for each detector / class combination
	sp_x_results = np.zeros(ntrials)
	sp_y_results = np.zeros(ntrials)
	svm_x_results = np.zeros(ntrials)
	svm_y_results = np.zeros(ntrials)
	
	# Iterate across the trials:
	for i, seed2 in enumerate(generate_seeds(ntrials, seed)):
		# Create the SP
		# (this overwrites the 'seed' entry of the caller's dictionary)
		config['seed'] = seed2
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = sp.predict(y_te)
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
		o_sp_y_te = metrics.compute_overlap(sp_y_te)
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
		sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
		sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
		
		# Print the results
		fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
		if verbose:
			print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
			print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
				u_sp_x_te, u_sp_y_te)
			print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
				o_sp_x_te, o_sp_y_te)
		
		# Get average representation of the base class
		# Binarize the column means: entries >= 0.5 become 1, the rest 0
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		u_sp_base_to_y_te = 0.
		o_sp_base_to_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			# Pair the base representation with one test sample so the
			# metrics are computed on a two-row matrix
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the sums
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
			o_sp_base_to_y_te += metrics.compute_overlap(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		u_sp_base_to_y_te /= ntest
		o_sp_base_to_y_te /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Novelty Test Uniqueness',
			u_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
		
		# Print the results
		if verbose:
			print '\nDescription\tx_tr->x_te\tx_tr->y_te'
			print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
				u_sp_base_to_y_te)
			print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
				o_sp_base_to_y_te)
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		# svm_x_te: % of base-class test samples predicted +1
		# svm_y_te: % of novelty test samples predicted -1
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
			100
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the accuracy
			# A base sample counts as correct when its overlap reaches
			# clf_th; a novelty sample when its overlap stays below it
			xo = metrics.compute_overlap(xt)
			yo = metrics.compute_overlap(yt)
			if xo >= clf_th: clf_x_te += 1
			if yo < clf_th: clf_y_te += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		sp_x_results[i] = 100 - clf_x_te
		sp_y_results[i] = 100 - clf_y_te
		svm_x_results[i] = 100 - svm_x_te
		svm_y_results[i] = 100 - svm_y_te
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SP % Correct Novelty Class', clf_y_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		sp._log_stats('SVM % Correct Novelty Class', svm_y_te)
		
		# Print the results
		if verbose:
			print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
			print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
			print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
			print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)
	
	# Save the results
	# Tuple order: SP base errors, SP novelty errors, SVM base errors,
	# SVM novelty errors
	with open(os.path.join(base_dir, 'results.pkl'), 'wb') as f:
		cPickle.dump((sp_x_results, sp_y_results, svm_x_results,
			svm_y_results), f, cPickle.HIGHEST_PROTOCOL)
示例#43
0
X_test = X_test.toarray()

n_samples, n_features = X.shape
test_samples, test_features = X_test.shape
print "done in %fs" % (time() - t0)
print "Train set - n_samples: %d, n_features: %d" % (n_samples, n_features)
print "Test set  - n_samples: %d, n_features: %d" % (test_samples, test_features)
print


# fit the model
# when nu=0.01, gamma=0.0034607 is the smallest to generate >0 result
clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.05) 
clf.fit(X)
# predit on X_test
y_pred = clf.predict(X_test)

# Count number of selected items given different gamma and nu
# This change is interesting
# Could further study systematically usign grid search
# 
count = 0
for i, pred in enumerate(y_pred):
    if pred != -1:
        count += 1
print count


csvWriter = csv.writer(open("detected.csv","wb"))
for i, pred in enumerate(y_pred):
    if pred != -1:
示例#44
0
import numpy as np
from rop_dataextract import *
from sklearn.svm import OneClassSVM
import sys

MAX_EVENT_COUNTERS = 4
TIME_DELTA = 10000
CLUSTER_POINTS = 32
TRAIN_POINTS = 100000
TEST_POINTS = -1

svm = OneClassSVM()

train_set, test_set = getSetNames(sys.argv)

print "aggregating data..."
obs = aggrTimeseries(train_set, TRAIN_POINTS, CLUSTER_POINTS, MAX_EVENT_COUNTERS, TIME_DELTA)
print len(obs)

print "fitting model..."
svm.fit(obs)

print "aggregating test..."
test = aggrTimeseries(test_set, TEST_POINTS, CLUSTER_POINTS, MAX_EVENT_COUNTERS, TIME_DELTA)

print "testing..."
prediction = svm.predict(test)
print sum(prediction)
print len(prediction)
def compute_scores(normal_users, queue, Ks=None):
    '''
        Calculates the novelty scores (noise and strangeness) of the newest
        entry, queue[-1], for the 4 algorithms (GMM, K-means, one-class SVM
        and LSAnomaly).

        Noise scores use the queue as the base of knowledge: every entry but
        the last is fitted. Strangeness scores use the normal users as the
        base of knowledge.

        The one-class SVM and LSAnomaly return a label directly, recorded as
        1 (anomaly) or 0 (normal). GMM and K-means return a fitting score,
        which get_score_last_item turns into the novelty score of the last
        item using the matching curiosity factor.

        normal_users: list of entries considered normal (must be a list, it
            is concatenated with the new entry).
        queue: all entries; the last one is the entry being scored.
        Ks: the 4 curiosity factors, in the order
            (K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s) -- noise factors first,
            then strangeness factors.

        Updates the global variables GMM_n, one_n, lsa_n, K_n, GMM_s, one_s,
        lsa_s, K_s with the results and also returns them in that order.

        Raises ValueError when Ks does not hold exactly 4 values.
    '''

    # ''_n are the noise scores, ''_s the strangeness scores
    global GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s

    GMM_n, one_n, lsa_n, K_n = [], [], [], []
    GMM_s, one_s, lsa_s, K_s = [], [], [], []

    # The old default (an empty list) could never be unpacked into the four
    # factors; fail with an explicit message instead.
    factors = tuple(Ks) if Ks is not None else ()
    if len(factors) != 4:
        raise ValueError(
            'Ks must hold 4 curiosity factors (K_GMM_n, K_KMeans_n, '
            'K_GMM_s, K_KMeans_s), got %d' % len(factors))
    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = factors

    new_entry = queue[-1]

    # Noise: knowledge base is the queue without the new entry, and the
    # fitting scores are compared across the whole queue.
    gmm_score, kmeans_score, svm_flags, lsa_flags = _score_new_entry(
        queue[0:-1], queue, new_entry, K_GMM_n, K_KMeans_n)
    GMM_n.append(gmm_score)
    K_n.append(kmeans_score)
    one_n.extend(svm_flags)
    lsa_n.extend(lsa_flags)

    # Strangeness: knowledge base is the normal users, and the fitting scores
    # are compared across the normal users plus the new entry.
    gmm_score, kmeans_score, svm_flags, lsa_flags = _score_new_entry(
        normal_users, normal_users + [new_entry], new_entry,
        K_GMM_s, K_KMeans_s)
    GMM_s.append(gmm_score)
    K_s.append(kmeans_score)
    one_s.extend(svm_flags)
    lsa_s.extend(lsa_flags)

    return GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s


def _score_new_entry(train, scored, entry, k_gmm, k_kmeans):
    '''
        Fits the 4 algorithms on train and scores entry with each of them.

        scored is the list whose last item is entry; it is what the GMM and
        K-means fitting scores are compared across.

        Returns (gmm_score, kmeans_score, svm_flags, lsa_flags); the two flag
        lists hold 1 for an anomaly, 0 for a normal entry, and stay empty if
        the model returned neither label.
    '''
    gmm = GMM(covariance_type='full', n_components=1)
    gmm.fit(train)
    gmm_score = get_score_last_item(
        [gmm.score([item]).mean() for item in scored], k_gmm)

    kmeans = KMeans(n_clusters=1)
    kmeans.fit(train)
    kmeans_score = get_score_last_item(
        [kmeans.score([item]) for item in scored], k_kmeans)

    # One-class SVM: -1 (anomaly) or 1 (normal)
    svm = OneClassSVM(nu=0.1)
    svm.fit(train)
    svm_label = svm.predict(np.array([entry]))
    svm_flags = []
    if svm_label == -1:
        svm_flags.append(1)
    if svm_label == 1:
        svm_flags.append(0)

    # LSAnomaly: 'anomaly' or 0 (normal)
    lsa = lsanomaly.LSAnomaly()
    lsa.fit(np.array(train))
    lsa_label = lsa.predict(np.array([entry]))
    lsa_flags = []
    if lsa_label == ['anomaly']:
        lsa_flags.append(1)
    if lsa_label == [0]:
        lsa_flags.append(0)

    return gmm_score, kmeans_score, svm_flags, lsa_flags
示例#46
0
def base_experiment(config, ntrials=1, seed=123456789):
	"""
	Run a single MNIST novelty-detection experiment, locally.
	
	Digit 0 is the base class; digits 1-9 are the novelty classes. For each
	trial an SPRegion is fit on the base-class training split and compared
	against a linear OneClassSVM fit on the same data.
	
	Side effects: seeds numpy's global random state and overwrites
	config['seed'] on every trial.
	
	@param config: The configuration parameters to use for the SP.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param seed: The random seed to use.
	
	@return: A tuple (sp_x_results, sp_y_results, svm_x_results,
	svm_y_results) of percentage errors. The *_x_results are arrays with one
	base-class error per trial. The *_y_results are lists with one array per
	novelty class (digits 1-9, in order), each holding one error per trial.
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	clf_th = 0.5
	
	# Seed numpy
	np.random.seed(seed)
	
	# Get the data (only the training portion of MNIST is used)
	(tr_x, tr_y), _ = load_mnist()
	tr_x_0 = np.random.permutation(tr_x[tr_y == 0])
	x_tr = tr_x_0[:ntrain]
	x_te = tr_x_0[ntrain:ntrain + ntest]
	outliers = [np.random.permutation(tr_x[tr_y == digit])[:ntest]
		for digit in range(1, 10)]
	nclasses = len(outliers)
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	c_x_tr = 1 - metrics.compute_distance(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	c_x_te = 1 - metrics.compute_distance(x_te)
	u_y_te, o_y_te, c_y_te = [], [], []
	for outlier in outliers:
		u_y_te.append(metrics.compute_uniqueness(outlier))
		o_y_te.append(metrics.compute_overlap(outlier))
		c_y_te.append(1 - metrics.compute_distance(outlier))
	
	# Initialize the overall results
	# -- The novelty results are indexed [class][trial]
	sp_x_results = np.zeros(ntrials)
	sp_y_results = [np.zeros(ntrials) for _ in range(nclasses)]
	svm_x_results = np.zeros(ntrials)
	svm_y_results = [np.zeros(ntrials) for _ in range(nclasses)]
	
	# Iterate across the trials:
	for nt in range(ntrials):
		# Make a new seed
		seed2 = np.random.randint(1000000)
		config['seed'] = seed2
		
		# Create the SP
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = [sp.predict(outlier) for outlier in outliers]
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
		u_sp_y_te, o_sp_y_te, c_sp_y_te = [], [], []
		for y in sp_y_te:
			u_sp_y_te.append(metrics.compute_uniqueness(y))
			o_sp_y_te.append(metrics.compute_overlap(y))
			c_sp_y_te.append(1 - metrics.compute_distance(y))
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Train Correlation', c_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Base Class Test Correlation', c_x_te)
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
		for i, (a, b, c, d, e, f) in enumerate(zip(u_y_te, o_y_te, c_y_te,
			u_sp_y_te, o_sp_y_te, c_sp_y_te), 1):
			sp._log_stats('Input Novelty Class {0} Uniqueness'.format(i), a)
			sp._log_stats('Input Novelty Class {0} Overlap'.format(i), b)
			sp._log_stats('Input Novelty Class {0} Correlation'.format(i), c)
			sp._log_stats('SP Novelty Class {0} Uniqueness'.format(i), d)
			sp._log_stats('SP Novelty Class {0} Overlap'.format(i), e)
			sp._log_stats('SP Novelty Class {0} Correlation'.format(i), f)
		
		# Get average representation of the base class
		# -- Binarize the column means: >= 0.5 becomes 1, the rest 0
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		c_sp_base_to_x_te = 0.
		u_sp = np.zeros(nclasses)
		o_sp = np.zeros(nclasses)
		c_sp = np.zeros(nclasses)
		for i, x in enumerate(sp_x_te):
			# Pair the base representation with one test sample
			xt = np.vstack((sp_base_result, x))
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
			
			for j, yi in enumerate(sp_y_te):
				yt = np.vstack((sp_base_result, yi[i]))
				u_sp[j] += metrics.compute_uniqueness(yt)
				o_sp[j] += metrics.compute_overlap(yt)
				c_sp[j] += 1 - metrics.compute_distance(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		c_sp_base_to_x_te /= ntest
		u_sp /= ntest
		o_sp /= ntest
		c_sp /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
		for i in range(nclasses):
			sp._log_stats('Base Train to Novelty {0} Uniqueness'.format(i + 1),
				u_sp[i])
			sp._log_stats('Base Train to Novelty {0} Overlap'.format(i + 1),
				o_sp[i])
			sp._log_stats('Base Train to Novelty {0} Correlation'.format(
				i + 1), c_sp[i])
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		# -- Base samples should be labelled +1, novelty samples -1
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = np.array([len(np.where(clf.predict(outlier) == -1)[0]) /
			float(ntest) * 100 for outlier in outliers])
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = np.zeros(nclasses)
		for i, x in enumerate(sp_x_te):
			xt = np.vstack((sp_base_result, x))
			xo = metrics.compute_overlap(xt)
			if xo >= clf_th:
				clf_x_te += 1
			
			for j, yi in enumerate(sp_y_te):
				yt = np.vstack((sp_base_result, yi[i]))
				yo = metrics.compute_overlap(yt)
				if yo < clf_th:
					clf_y_te[j] += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		# -- The novelty results are filed per class under this trial's slot.
		#    Indexing the per-class list with the trial number, as was done
		#    before, overwrote whole class entries and failed for more than
		#    nine trials.
		sp_x_results[nt] = 100 - clf_x_te
		svm_x_results[nt] = 100 - svm_x_te
		for j in range(nclasses):
			sp_y_results[j][nt] = 100 - clf_y_te[j]
			svm_y_results[j][nt] = 100 - svm_y_te[j]
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		for i in range(nclasses):
			sp._log_stats('SP % Correct Novelty Class {0}'.format(i + 1),
				clf_y_te[i])
			sp._log_stats('SVM % Correct Novelty Class {0}'.format(i + 1),
				svm_y_te[i])
		sp._log_stats('SP % Mean Correct Novelty Class', np.mean(clf_y_te))
		sp._log_stats('SVM % Mean Correct Novelty Class', np.mean(svm_y_te))
		sp._log_stats('SP % Adjusted Score', (np.mean(clf_y_te) * clf_x_te) /
			100)
		sp._log_stats('SVM % Adjusted Score', (np.mean(svm_y_te) * svm_x_te) /
			100)
	
	return sp_x_results, sp_y_results, svm_x_results, svm_y_results
示例#47
0
from sklearn.datasets import load_boston
from sklearn.svm import OneClassSVM
from scipy import stats

# Load the Boston data and keep only columns 5 and 12 (banana-shaped cloud).
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases -- confirm the pinned version.
dataset = load_boston()
data = dataset["data"][:, [5, 12]]
contamination = 0.261
gamma = 0.1

# Fit the one-class SVM, using the contamination rate as nu.
clf = OneClassSVM(nu=contamination, gamma=gamma).fit(data)

# Outlier detection on the training data itself: +1 inlier, -1 outlier.
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = len(inlier_predicted_data)
num_outliers_predicted = len(outlier_predicted_data)

# Evaluate the decision function on a 500 x 500 grid for plotting.
xr = np.linspace(3, 10, 500)
yr = np.linspace(-5, 45, 500)
xx, yy = np.meshgrid(xr, yr)
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
zz = clf.decision_function(grid_points).reshape(xx.shape)

# Decision value at the contamination percentile of the training scores.
scores = clf.decision_function(data)
threshold = stats.scoreatpercentile(scores, 100 * contamination)
plt.contourf(xx,
             yy,
示例#48
0
    def Predict(self):
        """Check the typed password and report how five models judge the typing.

        If nobody is logged in (self.ID < 0) or the typed string differs from
        the logged-in account's password, a warning dialog is shown and
        nothing else happens.

        Otherwise five models (one-class SVM, K-means, Birch, isolation
        forest, elliptic envelope) are trained on 70% of the account's own
        timing features. Each is scored with F1 on the held-out 30% plus, when
        other accounts share the same password, those accounts' samples as
        negatives. The current attempt (self.Dwell + self.Flight) is then
        classified by each model, the summary is written to self.TrainText and
        self.Reset() is called.
        """
        if self.ID < 0:
            self.ErrorMessage.setIcon(QMessageBox.Information)
            self.ErrorMessage.setText("Your are not logged in")
            self.ErrorMessage.setWindowTitle("Warning!")
            self.ErrorMessage.exec_()
            return

        if self.String != self.Accounts[self.ID].AccountPassword:
            self.ErrorMessage.setIcon(QMessageBox.Information)
            self.ErrorMessage.setText("Your password is wrong")
            self.ErrorMessage.setWindowTitle("Warning!")
            self.ErrorMessage.exec_()
            return

        # Number of distinct account names that share this password and have
        # at least one training sample.
        password = self.Accounts[self.ID].AccountPassword
        sts = len({
            account.AccountName
            for account in self.Accounts
            if account.AccountPassword == password
            and len(account.TrainData) > 0
        })

        self.ProcessData()

        # The first sz entries of every training row are skipped; only the
        # remainder is used as features.
        # NOTE(review): sz is twice the password length, presumably the key
        # columns preceding the timing features -- confirm against ProcessData.
        sz = len(self.Accounts[self.ID].AccountPassword) * 2

        own_rows = array(self.Accounts[self.ID].TrainData)
        Xset = array([row[sz:] for row in own_rows])
        Yset = array([1] * len(own_rows))

        trainx, testx, trainy, testy = train_test_split(Xset,
                                                        Yset,
                                                        test_size=0.3,
                                                        random_state=2)
        trainx = array(trainx)

        if sts > 1:
            # Samples of the other accounts with the same password serve as
            # impostor data. They are sliced with sz, like the genuine rows
            # (a fixed offset of 16 only matched 8-character passwords).
            X = []
            for i, account in enumerate(self.Accounts):
                if (account.AccountPassword
                        == self.Accounts[self.ID].AccountPassword
                        and self.ID != i):
                    X = X + [row[sz:] for row in account.TrainData]
            X = array(X)
            # Impostors are labelled -1 for the models that predict +1/-1 and
            # 0 for the clustering models.
            multiy = array([-1] * len(X))
            multi2y = array([0] * len(X))

            testx = np.concatenate((testx, X))
            testymone = np.concatenate((testy, multiy))
            testymzero = np.concatenate((testy, multi2y))

        if sts == 1:
            testymone = testy
            testymzero = testy

        Osvm = OneClassSVM(kernel='rbf', gamma="auto").fit(trainx)
        score = f1_score(testymone, Osvm.predict(testx), pos_label=1)

        kmeans = KMeans(n_clusters=2, random_state=0).fit(trainx)
        score1 = f1_score(testymzero, kmeans.predict(testx), pos_label=1)

        brc = Birch(n_clusters=2, threshold=0.01).fit(trainx)
        score2 = f1_score(testymzero, brc.predict(testx), pos_label=1)

        IsF = IsolationForest(contamination=0.01)
        IsF.fit(trainx)
        score3 = f1_score(testymone, IsF.predict(testx), pos_label=1)

        ev = EllipticEnvelope(contamination=0.01)
        ev.fit(trainx)
        score4 = f1_score(testymone, ev.predict(testx), pos_label=1)

        # The attempt being judged: dwell times followed by flight times.
        sample = [self.Dwell + self.Flight]

        def verdict(model):
            # NOTE(review): for KMeans and Birch the value 1 is just a cluster
            # index, so 'pass' depends on how clusters happen to be numbered.
            return 'pass' if model.predict(sample) == 1 else 'fail'

        OsvmResult = verdict(Osvm)
        kmResult = verdict(kmeans)
        brcResult = verdict(brc)
        IsFResult = verdict(IsF)
        evResult = verdict(ev)

        self.TrainText.setText("Score/Model" + " \n" +
                               str(round(score, 2)) + " Osvm: " +
                               OsvmResult + " \n" + str(round(score1, 2)) +
                               " Km: " + kmResult + " \n" +
                               str(round(score2, 2)) + " Brc: " +
                               brcResult + " \n " + str(round(score3, 2)) +
                               " ISF: " + IsFResult + " \n" +
                               str(round(score4, 2)) + " Ev: " + evResult)

        self.Reset()
示例#49
0
import librosa

# Split each frame into the two model features and the ground-truth label.
trainX, trainY = train_df[['mean','zc']], train_df['label']
testX, testY = test_df[['mean','zc']], test_df['label']

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
sc = StandardScaler()
sc.fit(trainX)

trainX = sc.transform(trainX)
testX = sc.transform(testX)


# The one-class model is fitted on the features alone (no labels).
model = OneClassSVM()
model.fit(trainX)

y_pred = model.predict(testX)
# Map the -1 predictions to 1 and everything else to 0.
# The original read the not-yet-defined name `pred` here instead of `y_pred`.
pred = np.where(y_pred == -1, 1, 0)

def create_power_spectral(data, duration=10):
    """Return the one-sided amplitude spectrum of each row of *data*.

    Args:
        data: 2-D array; the FFT is taken along the last axis, so each row
            is treated as one signal of ``data.shape[1]`` samples.
        duration: Length of each signal in seconds. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(F, fq)``: ``F`` holds the FFT magnitudes divided by
        ``N / 2`` for the first ``N // 2 + 1`` bins of every row, and ``fq``
        holds the frequency of each of those bins.
    """
    N = data.shape[1]
    half = N // 2 + 1
    F = np.abs(np.fft.fft(data) / (N / 2.0))
    # DFT bin k sits at k / duration. The previous
    # np.linspace(0, 1/dt, N) included the endpoint and therefore labelled
    # bin k as k * fs / (N - 1), slightly stretching the frequency axis.
    fq = np.arange(half) / float(duration)
    return F[:, :half], fq

# Spectrum magnitudes and matching frequency axis for every row of `train`.
F, freq = create_power_spectral(train)

# Plot the spectrum of the first row only.
plt.plot(freq, F[0])

# Mel spectrogram of the first entry of `train`, using librosa's defaults.
melspec = librosa.feature.melspectrogram(train[0])
示例#50
0
 def predict_rate_features(self, pkt_featurizer):
     """Flag the arrival timing of a packet using its group's history.

     The first, second and third differences of the arrival time (td1, td2,
     td3) are computed against the last stored values for the packet's
     group. Each difference is appended to its stored history and clustered
     with DBSCAN; the new value is flagged when it receives label -1.
     A StandardScaler + OneClassSVM pair is also fitted on each history and
     evaluated on the new value, but those predictions are only used for
     plotting and are not part of the return value.

     :param pkt_featurizer: object exposing ``pkt_type``, ``features`` and
         ``arrival_time``.
     :return: tuple ``(db_predict1, db_predict2, db_predict3)``; each entry
         is the result of ``label == -1`` for the new sample, or ``0`` for
         all three when a KeyError, IndexError or ValueError occurred
         (including the case of fewer than two stored third differences).
     """
     group_id = pkt_featurizer.pkt_type
     # NOTE(review): `features` is read but never used in this method.
     features = pkt_featurizer.features
     arrival_time = pkt_featurizer.arrival_time
     try:
         # Not enough history yet: bail out through the except handler below.
         if len(self.time_delta3[group_id]) <= 1:
             raise ValueError
         # Successive differences against the most recent stored values.
         td1 = arrival_time - self.time_data[group_id][-1]
         td2 = td1 - self.time_delta1[group_id][-1]
         td3 = td2 - self.time_delta2[group_id][-1]
         """
         if self.plot:
             self.t_fig.cla()
             self.prep_figure(self.t_fig, "Time", "Pkt", grid=True)
             self.t_fig.scatter(self.time_data[group_id], range(len(self.time_data[group_id])))
             """
         # One DBSCAN per difference order; the new value is appended as the
         # last row so its cluster label can be read back with [-1].
         dbscan1 = DBSCAN()
         dbscan2 = DBSCAN()
         dbscan3 = DBSCAN()
         td1_training = np.array(list(self.time_delta1[group_id]) + [td1]).reshape(-1,1)
         td2_training = np.array(list(self.time_delta2[group_id]) + [td2]).reshape(-1,1)
         td3_training = np.array(list(self.time_delta3[group_id]) + [td3]).reshape(-1,1)
         labels1 = dbscan1.fit_predict(td1_training)
         labels2 = dbscan2.fit_predict(td2_training)
         labels3 = dbscan3.fit_predict(td3_training)
         # True when the new sample was not assigned to any cluster.
         db_predict1 = labels1[-1] == -1
         db_predict2 = labels2[-1] == -1
         db_predict3 = labels3[-1] == -1
         if self.plot:
             # The time axis is the tail of time_data that lines up with each
             # delta history, plus the new arrival time.
             self.plot_1d_dbscan(td1_training, labels1, 
                                 list(self.time_data[group_id])[(len(self.time_data[group_id])-len(self.time_delta1[group_id])) :]+[arrival_time],
                                 self.td1_fig_dbscan, "", "Pkt/Time", "Pkt Rate DBSCAN Clustering - Anomalous Pkts in Black")
             self.plot_1d_dbscan(td2_training, labels2, 
                                 list(self.time_data[group_id])[(len(self.time_data[group_id])-len(self.time_delta2[group_id])) :]+[arrival_time],
                                 self.td2_fig_dbscan, "", "Pkt/Time^2")
             self.plot_1d_dbscan(td3_training, labels3, 
                                 list(self.time_data[group_id])[(len(self.time_data[group_id])-len(self.time_delta3[group_id])) :]+[arrival_time],
                                 self.td3_fig_dbscan, "Time", "Pkt/Time^3")
     
         # Scalers are fitted on the stored history only (without the new
         # value) and then applied to the new value.
         scaler1 = preprocessing.StandardScaler()
         scaler2 = preprocessing.StandardScaler()
         scaler3 = preprocessing.StandardScaler()
         time_training1 = scaler1.fit_transform(np.array(self.time_delta1[group_id]).reshape(-1,1))
         time_features1 = scaler1.transform(np.array(td1).reshape(1,-1))
         time_training2 = scaler2.fit_transform(np.array(self.time_delta2[group_id]).reshape(-1,1))
         time_features2 = scaler2.transform(np.array(td2).reshape(1,-1))
         time_training3 = scaler3.fit_transform(np.array(self.time_delta3[group_id]).reshape(-1,1))
         time_features3 = scaler3.transform(np.array(td3).reshape(1,-1))
         # NOTE(review): time_prediction1..3 are computed but never returned
         # or stored; only the fitted classifiers are used (for plotting).
         time_classifier1 = OneClassSVM().fit(time_training1)
         time_prediction1 = time_classifier1.predict(time_features1)
         time_classifier2 = OneClassSVM().fit(time_training2)
         time_prediction2 = time_classifier2.predict(time_features2)
         time_classifier3 = OneClassSVM().fit(time_training3)
         time_prediction3 = time_classifier3.predict(time_features3)                        
         if self.plot:
             self.plot_1d_svm(self.time_delta1[group_id], time_classifier1, 
                              list(self.time_data[group_id])[(len(self.time_data[group_id])-len(self.time_delta1[group_id])) :],
                              scaler1, self.td1_fig_svm, "", "Pkt/Time", "Pkt Rate One Class SVM Classification")
             self.plot_1d_svm(self.time_delta2[group_id], time_classifier2, 
                              list(self.time_data[group_id])[(len(self.time_data[group_id])-len(self.time_delta2[group_id])) :],
                              scaler2, self.td2_fig_svm, "", "Pkt/Time^2")
             self.plot_1d_svm(self.time_delta3[group_id], time_classifier3, 
                              list(self.time_data[group_id])[(len(self.time_data[group_id])-len(self.time_delta3[group_id])) :],
                              scaler3, self.td3_fig_svm, "Time", "Pkt/Time^3")
     except (KeyError, IndexError, ValueError) as e:
         # Unknown group, empty history or too little data: report the
         # exception and fall back to "not flagged" for all three orders.
         print e
         db_predict1, db_predict2, db_predict3 = 0,0,0
     return db_predict1, db_predict2, db_predict3
示例#51
0
# Read the data
data_path1 = './data/13A17ProAll.xlsx'
data = []
read_xsls(data_path1)

# Fit the one-class SVM on the loaded data
clf = OneClassSVM(gamma='auto', nu=0.001).fit(data)

# Predict rows 30000..59999 of the first sheet one at a time; the feature
# vector of a row is made of its cells in columns 1..9
data_xsls = xlrd.open_workbook("./data/13A17ProAll.xlsx")
sheet_name = data_xsls.sheets()[0]
#count_nrows = sheet_name.nrows
Y = []
for row in range(30000, 60000):
    features = [sheet_name.cell_value(row, col) for col in range(1, 10)]
    Y.append(clf.predict([features]))

# Smooth the predictions: the first 19 entries are copied unchanged, every
# later entry is the sum of the 20 most recent predictions, each divided by 20
Z = []
for idx, label in enumerate(Y):
    if idx < 19:
        Z.append(label)
    else:
        Z.append(sum((Y[idx - back] / 20 for back in range(20)), 0))

# The x axis carries the original sheet row numbers
X = [idx + 30000 for idx in range(len(Y))]
plt.plot(X, Z)
plt.show()
示例#52
0
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1',
	ntrials=10, verbose=True, seed=123456789):
	"""
	Run a single experiment, locally.
	
	@param pct_noise: The percentage of noise to add to the dataset.
	
	@param noverlap_bits: The number of bits the base class should overlap
	with the novelty class.
	
	@param exp_name: The name of the experiment.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param verbose: If True print the results.
	
	@param seed: The random seed to use.
	
	@return: A tuple containing the percentage errors for the SP's training
	and testing results and the SVM's training and testing results,
	respectively.
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
	clf_th = 0.5
	log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
		'novelty_experiments', exp_name)
	
	# Configure the SP
	config = {
		'ninputs': 100,
		'trim': 1e-4,
		'disable_boost': True,
		'seed': seed,
		'pct_active': None,
		'random_permanence': True,
		'pwindow': 0.5,
		
		'global_inhibition': True,
		
		'ncolumns': 200,
		'nactive': 50,
		
		
		'nsynapses': 75,
		'seg_th': 15,
		
		'syn_th': 0.5,
		
		'pinc': 0.001,
		'pdec': 0.001,
		
		'nepochs': 10,
		
		'log_dir': log_dir
	}
	
	# Seed numpy
	np.random.seed(seed)
	
	# Create the base dataset
	x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
	x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]
	
	# Create the outlier dataset
	base_indexes = set(np.where(x_ds.base_class == 1)[0])
	choices = [x for x in xrange(nbits) if x not in base_indexes]
	outlier_base = np.zeros(nbits, dtype='bool')
	outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
		False)] = 1
	outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
	y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
	y_te = y_ds.data
	
	if verbose:
		print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
			* x_ds.base_class.astype('i')).sum() / 40.)
		print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
			outlier_base.astype('i')).sum() / 40.)
		print 'Overlap between two classes: {0}'.format(np.dot(
			x_ds.base_class.astype('i'), outlier_base.astype('i')))
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	c_x_tr = 1 - metrics.compute_distance(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	c_x_te = 1 - metrics.compute_distance(x_te)
	u_y_te = metrics.compute_uniqueness(y_te)
	o_y_te = metrics.compute_overlap(y_te)
	c_y_te = 1 - metrics.compute_distance(y_te)
	
	# Initialize the overall results
	sp_x_results = np.zeros(ntrials)
	sp_y_results = np.zeros(ntrials)
	svm_x_results = np.zeros(ntrials)
	svm_y_results = np.zeros(ntrials)
	
	# Iterate across the trials:
	for i in xrange(ntrials):
		# Make a new seed
		seed2 = np.random.randint(1000000)
		config['seed'] = seed2
		config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)
		
		# Create the SP
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = sp.predict(y_te)
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
		u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
		o_sp_y_te = metrics.compute_overlap(sp_y_te)
		c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Train Correlation', c_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Base Class Test Correlation', c_x_te)
		sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
		sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
		sp._log_stats('Input Novelty Class Test Correlation', c_y_te)	
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
		sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
		sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
		sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)
		
		# Print the results
		fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{5:2.4f}'
		if verbose:
			print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
			print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
				u_sp_x_te, u_sp_y_te)
			print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr, o_sp_x_te,
				o_sp_y_te)
			print fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te, c_sp_x_tr,
				c_sp_x_te, c_sp_y_te)
		
		# Get average representation of the base class
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		c_sp_base_to_x_te = 0.
		u_sp_base_to_y_te = 0.
		o_sp_base_to_y_te = 0.
		c_sp_base_to_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the sums
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
			u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
			o_sp_base_to_y_te += metrics.compute_overlap(yt)
			c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		c_sp_base_to_x_te /= ntest
		u_sp_base_to_y_te /= ntest
		o_sp_base_to_y_te /= ntest
		c_sp_base_to_y_te /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
		sp._log_stats('Base Train to Novelty Test Uniqueness',
			u_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Correlation',
			c_sp_base_to_y_te)
		
		# Print the results
		if verbose:
			print '\nDescription\tx_tr->x_te\tx_tr->y_te'
			print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
				u_sp_base_to_y_te)
			print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
				o_sp_base_to_y_te)
			print 'Correlation:\t{0:2.4f}\t{1:2.4f}'.format(c_sp_base_to_x_te,
				c_sp_base_to_y_te)
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
			100
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the accuracy
			xo = metrics.compute_overlap(xt)
			yo = metrics.compute_overlap(yt)
			if xo >= clf_th: clf_x_te += 1
			if yo < clf_th: clf_y_te += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		sp_x_results[i] = 100 - clf_x_te
		sp_y_results[i] = 100 - clf_y_te
		svm_x_results[i] = 100 - svm_x_te
		svm_y_results[i] = 100 - svm_y_te
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SP % Correct Novelty Class', clf_y_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		sp._log_stats('SVM % Correct Novelty Class', svm_y_te)
		
		# Print the results
		if verbose:
			print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
			print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
			print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
			print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)
	
	return sp_x_results, sp_y_results, svm_x_results, svm_y_results
示例#53
0
# Join X1_test_n and X0_outliers_n into X_TEST_n
X_TEST_n = np.concatenate((X1_test_n, X0_outliers_n))

# Join Y1_test and Y0
Y_TEST = np.concatenate((Y1_test, Y0))

# Reduce dimensionality with a PCA fitted on the training data only
pca = PCA(n_components=0.95)
reducer = pca.fit(X1_train_n)
X1_train_n_reduced = reducer.transform(X1_train_n)
X_TEST_n_reduced = reducer.transform(X_TEST_n)

# Fit the one-class SVM on the reduced training data
clf = OneClassSVM(gamma='auto', nu=0.5)
clf.fit(X1_train_n_reduced)

Y1_pred_train = clf.predict(X1_train_n_reduced)
Y_pred_TEST = clf.predict(X_TEST_n_reduced)

# EVALUATION

# TRAIN SET

# Confusion matrix
confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.5)
# Write every cell count at its (column, row) position on the heat map
for (i, j), cell_count in np.ndenumerate(confmat):
    ax.text(x=j, y=i, s=cell_count, va='center', ha='center')
# Join X1_test_n and X0_outliers_n into X_TEST_n
X_TEST_n=np.concatenate((X1_test_n, X0_outliers_n))

# Join Y1_test and Y0
Y_TEST=np.concatenate((Y1_test, Y0))   


                  
clf=OneClassSVM(gamma='auto', nu=0.1)

clf.fit(X1_train_n)



Y1_pred_train=clf.predict(X1_train_n)
Y_pred_TEST=clf.predict(X_TEST_n)


# EVALUATION

# TRAIN SET

# Confusion matrix

confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
示例#55
0
# Remove every row the Isolation Forest predicts as an outlier (negative
# prediction) from both frames.
# All outliers are selected up front with a keep-mask. The previous loop
# dropped rows one at a time while `count` kept advancing over the original
# prediction index, so after the first drop it removed the wrong rows.
IF_predictions = IF.predict(df)
IFlen = [x for x in IF_predictions if x < 0]
IF_keep_mask = [not (x < 0) for x in IF_predictions]
# NOTE(review): assumes dfOnlyClasses has one row per row of df, in the same
# order -- the positional drops in the previous loop relied on this as well.
df = df.loc[IF_keep_mask]
dfOnlyClasses = dfOnlyClasses.loc[IF_keep_mask]
# Kept for backward compatibility: number of predictions examined.
count = len(IF_predictions)
#print(df)
#print(dfOnlyClasses)

# Outlier detection using a one-class SVM, fitted on every column except "class"
OCSVM = OneClassSVM(gamma='auto')
OCSVM.fit(dataFrame.drop(columns="class"))

# Negative predictions are outliers
outliersWithOCSVM = []
for prediction in OCSVM.predict(dataFrame.drop(columns="class")):
    if prediction < 0:
        outliersWithOCSVM.append(prediction)

print('amount of outliers using Isolation Forest:')
print(len(IFlen))
print('amount of outliers using one-class SVM:')
print(len(outliersWithOCSVM))

# In[2]:

#import what's needed to build ann
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
示例#56
0
    1 if i > 1 else 0 for i in lof.negative_outlier_factor_ * -1
]

#-------------------------------------------------------------------------------------------------#
#------------------------------------------One-Class SVM------------------------------------------#
#-------------------------------------------------------------------------------------------------#

# nu is set to the sum of `diagnosis` divided by the row count, rounded to
# one decimal.
expected_perc_outliers = round(sum(d.diagnosis) / len(d), 1)
# gamma is the reciprocal of the number of feature columns (positions 3..11).
boundary_smoothness = 1 / len(d.columns[3:12])
ocsvm = OneClassSVM(kernel='rbf',
                    nu=expected_perc_outliers,
                    gamma=boundary_smoothness,
                    random_state=14)
ocsvm.fit(d.iloc[:, 3:12])

# Predict once and reuse the result; the model was previously evaluated twice
# on exactly the same data.
ocsvm_predictions = ocsvm.predict(d.iloc[:, 3:12])
ocsvm_outliers = np.where(ocsvm_predictions == -1)[0].tolist()
print("Indices of outliers found by One-Class SVM: \n", ocsvm_outliers)
# 1 marks a row predicted as -1, 0 marks every other row.
d['ocsvm_outlier'] = [1 if i == -1 else 0 for i in ocsvm_predictions]

#-------------------------------------------------------------------------------------------------#
#---------------------------------------Isolation Forest------------------------------------------#
#-------------------------------------------------------------------------------------------------#

# Contamination is the sum of `diagnosis` divided by the row count, rounded
# to one decimal.
expected_perc_outliers = round(sum(d.diagnosis) / len(d), 1)
isoforest = IsolationForest(n_estimators=99,
                            contamination=expected_perc_outliers,
                            max_features=1.0,
                            random_state=14)
# Fit on the feature columns at positions 3..11.
isoforest.fit(d.iloc[:, 3:12])
示例#57
0
# plt.imshow(X_train[1].reshape(resize_size))


# In[120]:

clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.01)


# In[121]:

clf.fit(X_train)


# In[122]:

# Report how many samples of each set are predicted as -1.
# The second line used to be labelled "Error training" as well, although it
# reports the validation set.
print "Error training: %d/%d" % (X_train[clf.predict(X_train)==-1].shape[0], X_train.shape[0])
print "Error validation: %d/%d" % (X_val[clf.predict(X_val)==-1].shape[0], X_val.shape[0])


# In[108]:

import cPickle as pickle


# In[123]:

# Persist the fitted classifier. The context manager closes the file handle
# even if dumping fails; the bare open() call never closed it explicitly.
with open('full_retinal_img_clf.pkl', 'wb') as clf_file:
    pickle.dump(clf, clf_file)


# In[124]: