def _predict_self(self):
    clf = IsolationForest(contamination=self.frac)
    clf.fit(self.num_X)
    return clf.predict(self.num_X)
def test_iforest_subsampled_features():
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100, contamination=0.4, random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
def outlier_rejection(X, y):
    """This will be our function used to resample our dataset."""
    model = IsolationForest(max_samples=100, contamination=0.4, random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
def IsolationForest_calulate(train_data_one, test_data):
    # Use the anomaly detection method
    clf = IsolationForest()
    # Train the anomaly detection model
    clf.fit(train_data_one)
    # Predict with the model
    Pre_result = clf.predict(test_data)
    # Compute the fraction of samples predicted as normal (+1)
    prob = len([x for x in Pre_result if x == 1]) / len(Pre_result)
    return prob
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
def test_iforest_works(contamination):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_array_equal(pred, 6 * [1] + 2 * [-1])
def isolationForest(self, settings, mname, data):
    '''
    :param settings: -> settings dictionary
    :param mname: -> name of serialized cluster
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                           max_features:1.0, n_jobs:1, random_state:None, verbose:0}
    '''
    # rng = np.random.RandomState(42)
    if settings['random_state'] == 'None':
        settings['random_state'] = None
    if isinstance(settings['bootstrap'], str):
        settings['bootstrap'] = str2Bool(settings['bootstrap'])
    if isinstance(settings['verbose'], str):
        settings['verbose'] = str2Bool(settings['verbose'])
    if settings['max_samples'] != 'auto':
        settings['max_samples'] = int(settings['max_samples'])
    # print(type(settings['max_samples']))
    for k, v in settings.items():
        logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
        print("IsolationForest %s set to %s" % (k, v))
    try:
        clf = IsolationForest(n_estimators=int(settings['n_estimators']),
                              max_samples=settings['max_samples'],
                              contamination=float(settings['contamination']),
                              bootstrap=settings['bootstrap'],
                              max_features=float(settings['max_features']),
                              n_jobs=int(settings['n_jobs']),
                              random_state=settings['random_state'],
                              verbose=settings['verbose'])
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        print("Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args))
        sys.exit(1)
    # clf = IsolationForest(max_samples=100, random_state=rng)
    # print("*&*&*&& %s" % type(data))
    try:
        clf.fit(data)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        sys.exit(1)
    predict = clf.predict(data)
    print("Anomaly Array:")
    print(predict)
    self.__serializemodel(clf, 'isoforest', mname)
    return clf
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest """
    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    clf = IsolationForest(n_estimators=10, max_samples=20,
                          random_state=rng, warm_start=True)
    clf.fit(X)
    # remember the 1st tree
    tree_1 = clf.estimators_[0]
    # fit another 10 trees
    clf.set_params(n_estimators=20)
    clf.fit(X)
    # expecting 20 fitted trees and no overwritten trees
    assert len(clf.estimators_) == 20
    assert clf.estimators_[0] is tree_1
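Outside the test suite, the same warm_start pattern can be used in application code to grow an already fitted forest instead of refitting it from scratch. A minimal sketch, assuming a scikit-learn version (>= 0.21) in which IsolationForest accepts warm_start:

import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.RandomState(0).randn(1000, 4)

forest = IsolationForest(n_estimators=50, warm_start=True, random_state=0)
forest.fit(X)                        # builds the first 50 trees

forest.set_params(n_estimators=100)
forest.fit(X)                        # adds 50 more trees to the existing ensemble
print(len(forest.estimators_))       # 100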
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        method = None
    do_outlier_removal.fit(np.array(df[col]))
    if method == 'Isolation Forest':
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
def predict(self, X, window=DEFAULT_WINDOW):
    """
    Predict if a particular sample is an outlier or not.

    :param X: the time series to detect of
    :param type X: pandas.Series
    :param window: the length of window
    :param type window: int
    :return: 1 denotes normal, 0 denotes abnormal.
    """
    x_train = list(range(0, 2 * window + 1)) + list(range(0, 2 * window + 1)) + list(range(0, window + 1))
    # materialize the pairs so they can be used for both fit and predict
    sample_features = list(zip(x_train, X))
    clf = IsolationForest(self.n_estimators, self.max_samples, self.contamination, self.max_feature,
                          self.bootstrap, self.n_jobs, self.random_state, self.verbose)
    clf.fit(sample_features)
    predict_res = clf.predict(sample_features)
    if predict_res[-1] == -1:
        return 0
    return 1
y = (y != b'normal.').astype(int)
print_outlier_ratio(y)

n_samples, n_features = X.shape
n_samples_train = n_samples // 2

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('--- Fitting the IsolationForest estimator...')
model = IsolationForest(n_jobs=-1, random_state=random_state)
tstart = time()
model.fit(X_train)
fit_time = time() - tstart
tstart = time()

scoring = -model.decision_function(X_test)  # the lower, the more normal

print("--- Preparing the plot elements...")
if with_decision_function_histograms:
    fig, ax = plt.subplots(3, sharex=True, sharey=True)
    bins = np.linspace(-0.5, 0.5, 200)
    ax[0].hist(scoring, bins, color='black')
    ax[0].set_title('Decision function for %s dataset' % dat)
    ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
    ax[1].legend(loc="lower right")
    ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
    ax[2].legend(loc="lower right")
class RamachandranFeature(Feature): '''Analyze the phi/psi torsion distributions of proteins.''' def __init__(self): super().__init__() self.clf = None self.de = None def extract(self, input_path, total_num_threads=1, my_id=0): '''Extract phi, psi angles from structures in the input path.''' for f in self.list_my_jobs(input_path, total_num_threads, my_id): if f.endswith('.pdb'): self.extract_from_one_file(os.path.join(input_path, f)) def extract_from_one_file(self, pdb_file): '''Extract phi, psi angles from a pdb_file.''' structure = data_loading.structure_from_pdb_file(pdb_file) for model in structure: for chain in model: for residue in chain: try: feature_dict = {'phi' : geometry.get_phi(chain, residue), 'psi' : geometry.get_psi(chain, residue)} self.feature_list.append(feature_dict) except: pass def visualize(self, transform_features=True): '''Visualize the feature statistics.''' phis = [ d['phi'] for d in self.feature_list ] psis = [ d['psi'] for d in self.feature_list ] # Prepare grid points xx, yy = np.meshgrid(np.linspace(-np.pi, np.pi, 200), np.linspace(-np.pi, np.pi, 200)) transformed_data = np.c_[xx.ravel(), yy.ravel()] if transform_features: transformed_data = self.transform_features(transformed_data) # Draw the decision boundary from the machine learning classifier if self.clf: Z = self.clf.decision_function(transformed_data) Z = Z.reshape(xx.shape) Z_pred = self.clf.predict(transformed_data) Z_pred = Z_pred.reshape(xx.shape) #plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), Z.max(), 20), cmap=plt.cm.Blues_r) #plt.contourf(xx, yy, Z, levels=np.linspace(0, Z.max()), colors='orange') plt.contourf(xx, yy, Z_pred, levels=[0.9, 1.1], colors='orange') # Draw the density estimation if self.de: Z = self.de.score_samples(transformed_data) Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), Z.max(), 7), cmap=plt.cm.Blues_r) # Draw the data plt.scatter(phis, psis, c='green', s=5) # Plot the support vectors if the classifier is SVM if isinstance(self.clf, svm.OneClassSVM): if transform_features: s_phis = [ machine_learning.cos_sin_to_angle(v[0], v[1]) for v in self.clf.support_vectors_ ] s_psis = [ machine_learning.cos_sin_to_angle(v[2], v[3]) for v in self.clf.support_vectors_ ] plt.scatter(s_phis, s_psis, c='red') else: plt.scatter(self.clf.support_vectors_[:][0], self.clf.support_vectors_[:][1], c='red') plt.axis([- np.pi, np.pi, - np.pi, np.pi]) plt.show() def save(self, data_path): '''Save the data into a csv file.''' data = [ (d['phi'], d['psi']) for d in self.feature_list ] df = pd.DataFrame(data=data, columns=['phi', 'psi']) self.append_to_csv(df, os.path.join(data_path, 'rama_features.csv')) def load(self, data_path): '''Load data from a csv file.''' df = pd.read_csv(os.path.join(data_path, 'rama_features.csv'), header=None) for index, row in df.iterrows(): self.feature_list.append({'phi':row[0], 'psi':row[1]}) def transform_features(self, feature_list): '''Transform feature representations. The arguement feature_list could be a list of dictionary or a list of list. 
''' if isinstance(feature_list[0], dict): return [machine_learning.angle_to_cos_sin(d['phi']) + machine_learning.angle_to_cos_sin(d['psi']) for d in feature_list] else: return [machine_learning.angle_to_cos_sin(d[0]) + machine_learning.angle_to_cos_sin(d[1]) for d in feature_list] def learn(self, clf_type="OneClassSVM", transform_features=True): '''Learn the distribution with a machine learning classifier''' # Prepare the training data all_data = [(d['phi'], d['psi']) for d in self.feature_list] if transform_features: all_data = self.transform_features(all_data) n_data = len(all_data) training_data = all_data[0:int(0.6 * n_data)] test_data = all_data[int(0.6 * n_data):int(0.8 * n_data)] cv_data = all_data[int(0.8 * n_data):n_data] # Train the classifier if clf_type == "OneClassSVM": nus = [0.05, 0.02, 0.01, 0.005, 0.002, 0.001] least_error = len(test_data) for i in range(len(nus)): print("nu = {0}".format(nus[i])) clf = svm.OneClassSVM(nu=nus[i], kernel="rbf", gamma='auto') clf.fit(training_data) predictions = clf.predict(training_data) print("{0}/{1} training error.".format(len(predictions[-1 == predictions]), len(training_data))) predictions = clf.predict(test_data) print("{0}/{1} test error.\n".format(len(predictions[-1 == predictions]), len(test_data))) if len(predictions[-1 == predictions]) < least_error: least_error = len(predictions[-1 == predictions]) self.clf = clf elif clf_type == "IsolationForest": self.clf = IsolationForest(max_samples=20000, contamination=0.01, random_state=np.random.RandomState(42)) self.clf.fit(training_data) # Print Training results predictions = self.clf.predict(cv_data) print("{0}/{1} cross validation error.".format(len(predictions[-1 == predictions]), len(cv_data))) if clf_type == "OneClassSVM": print("{0} support vectors found.".format(len(self.clf.support_))) def predict(self, input_data, transform_features=True): '''Make a prediction for the input data with the machine learning classifier. input_data is a list of phi, psi angles. 
''' transformed_data = input_data if transform_features: transformed_data = self.transform_features(transformed_data) return self.clf.predict(transformed_data) def calculate_space_reduction(self, transform_features=True): '''Calculate the space reduction power of the machine learning model.''' phis = np.random.uniform(-np.pi, np.pi, 10000) psis = np.random.uniform(-np.pi, np.pi, 10000) predictions = self.predict(list(zip(phis, psis)), transform_features=transform_features) print("The space is reduced by {0}.".format(len(predictions[1 == predictions]) / len(predictions))) def density_estimate(self, de_type="GaussianMixture", transform_features=True): '''Get a density estimation of the data.''' all_data = [(d['phi'], d['psi']) for d in self.feature_list] if transform_features: all_data = self.transform_features(all_data) n_data = len(all_data) training_data = all_data[0:int(0.7 * n_data)] test_data = all_data[int(0.7 * n_data):n_data] # Make some random data phis = np.random.uniform(-np.pi, np.pi, 10000) psis = np.random.uniform(-np.pi, np.pi, 10000) random_data = list(zip(phis, psis)) if transform_features: random_data = self.transform_features(random_data) if de_type == "GaussianMixture": self.de = mixture.BayesianGaussianMixture(n_components=100, covariance_type='full').fit(training_data) # Evalute the cumulative distribution functions of scores of test data test_scores = self.de.score_samples(test_data) values, base = np.histogram(test_scores, bins=40) cumulative = np.cumsum(values) for i in range(40): # Evaluate the space compression random_scores = self.de.score_samples(random_data) compress_coe = len(random_scores[random_scores > base[i]]) / len(random_scores) print('{0:.3f}\t{1}\t{2:.5f}\t{3:.5f}'.format(base[i], cumulative[i], cumulative[i] / len(test_data), compress_coe)) elif de_type == "KernelDensity": params = {'bandwidth': np.logspace(-1, 1, 5)} grid = GridSearchCV(KernelDensity(), params) grid.fit(training_data) self.de = grid.best_estimator_
filtered_data = df.loc[keys_filter]
filtered_data = filtered_data.sort_values(by=['Months Since Start'], ascending=False)
if len(filtered_data['Full Work Day']) > 12:
    # sorted_data = filtered_data.sort_values('Full Work Day')
    X_full = filtered_data[['Months Since Start']].values
    y_full = filtered_data['Full Work Day'].values

    # isolation forest to detect anomalies
    y_full_reshape = y_full.reshape(-1, 1)
    clf = IsolationForest(random_state=rng)
    clf.fit(y_full_reshape)
    clf_predicted_amount = clf.predict(y_full_reshape)
    anomaly_test_false = clf_predicted_amount == 1
    anomaly_test_true = clf_predicted_amount == -1
    X = X_full[anomaly_test_false]
    y = y_full[anomaly_test_false]

    X_upto_last_6months = X[0:-6]
    y_upto_last_6months = y[0:-6]
    X_last_6months = X[-6:]
    y_last_6months = y[-6:]

    # run regression
# Detects unusual sequences rather than extreme values. It is more difficult to evaluate the relevance on this example.
# The sequence size (5) should match some interesting cycle.

# ## 2.5 Isolation Forest

# #### Use for collective anomalies (unordered).

# Simple, works well with differently distributed data, and efficient with high-dimensional data.

# In[ ]:

# Take the useful features and standardize them
data = df[['value', 'hours', 'daylight', 'DayOfTheWeek', 'WeekDay']]
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)

# train isolation forest
model = IsolationForest(contamination=outliers_fraction)
model.fit(data)

# add the prediction to the main dataframe
df['anomaly25'] = pd.Series(model.predict(data))
df['anomaly25'] = df['anomaly25'].map({1: 0, -1: 1})
print(df['anomaly25'].value_counts())

# In[ ]:

# visualisation of anomaly throughout time (viz 1)
fig, ax = plt.subplots()
a = df.loc[df['anomaly25'] == 1, ['time_epoch', 'value']]  # anomaly
ax.plot(df['time_epoch'], df['value'], color='blue')
ax.scatter(a['time_epoch'], a['value'], color='red')
plt.show()
#########################################################################
index += 1
print(x_train.shape)
X_train = x_train
rng = np.random.RandomState(42)
isofortrain = IsolationForest(n_estimators=1000, max_samples='auto', contamination=.20,
                              max_features=1, random_state=rng, n_jobs=-1)
isofortrain.fit(X_train)
anomalytrain = isofortrain.decision_function(X_train)
predicttrain = isofortrain.predict(X_train)
len_predictrain = len(predicttrain)
print("len_predictrain", len_predictrain)
num_iforest_diff = 0
for i in predicttrain:
    if i == -1:
        num_iforest_diff += 1
print("num_iforest_diff", num_iforest_diff)
same = 0
def InputOutlierDetection(xtrain, xtest, ytrain, ytest, outlier_percent=0.2, removal=None, isoforest=None, randstate=None, onlytrain=False, n_estimators=100): from sklearn.ensemble import IsolationForest print('\nExecuting [InputOutlierDetection] using Isolation Forest...') # If no current Isolation Forest exists, so to learn the current data to train the Isolation Forest model if isoforest is None: isoforest = IsolationForest(n_jobs=-1, verbose=2, contamination=outlier_percent, random_state=randstate, n_estimators=n_estimators, bootstrap=True) # Train the isolation forest to define and detect outliers isoforest.fit(xtrain) # If I just intend to train an Isolation Forest, then return the Isolation Forest and end the function if onlytrain: return isoforest # Yield score arrays on the training and test data in which -1 means anomaly xtrain_anomalyscore = isoforest.predict(xtrain) # If testSize = 0., then skip test data prediction try: xtest_anomalyscore = isoforest.predict(xtest) except: xtest_anomalyscore = [] # meanScoreTrain = isoforest.decision_function(xtrain) # meanScoreTest = isoforest.decision_function(xtest) # print('Train data anomaly score (higher is better): {0}'.format(meanScoreTrain)) # print('Test data anomaly score (higher is better): {0}'.format(meanScoreTest)) # Get the index array of all data considered abnormal (-1) anomaly_idx_train = np.where(xtrain_anomalyscore == -1) anomaly_idx_test = np.where(xtest_anomalyscore == -1) anomaly_idx_train = anomaly_idx_train[0] anomaly_idx_test = anomaly_idx_test[0] # Initialize "empty" array/list for outliers xtrainOutliers = np.empty([1, xtrain.shape[1]]) # ytrainOutliers = np.empty(len(anomaly_idx_train)) ytrainOutliers = np.empty([1, ytrain.shape[1]]) # If xtest is a an empty list then skip this step try: xtestOutliers = np.empty([1, xtest.shape[1]]) ytestOutliers = np.empty([1, ytest.shape[1]]) except AttributeError: xtestOutliers = ['dummy'] ytestOutliers = ['dummy'] xtrainRaw = xtrain ytrainRaw = ytrain # Remove the anomaly indices iteratively for i, idx in enumerate(anomaly_idx_train): xtrainOutliers = np.vstack((xtrainOutliers, xtrainRaw[idx])) # ytrainOutliers[i] = ytrain[idx] ytrainOutliers = np.vstack((ytrainOutliers, ytrainRaw[idx])) # If removal is 'train' or 'both, remove the outliers in the train input and target data if removal in ('train', 'both'): ytrain = np.delete(ytrain, idx - i, 0) # 0 means the first axis -- row xtrain = np.delete(xtrain, idx - i, 0) # # Update xtrain and ytrain if removal was done # if removal in('train', 'both'): # xtrain = xtrainInliers # ytrain = ytrainInliers xtestRaw = xtest ytestRaw = ytest # When anomaly_idx_test is empty, this loop will not execute for i, idx in enumerate(anomaly_idx_test): xtestOutliers = np.vstack((xtestOutliers, xtestRaw[idx])) # ytestOutliers[i] = ytest[idx] ytestOutliers = np.vstack((ytestOutliers, ytestRaw[idx])) # If removal is 'test' or 'both', then remove the outliers in test input and target data if removal in ('test', 'both'): ytest = np.delete(ytest, idx - i, 0) xtest = np.delete(xtest, idx - i, 0) # Since the arrays were not actually empty when they were initiated, remove the first row, first 0 means "first", second 0 means "row" xtrainOutliers = np.delete(xtrainOutliers, 0, 0) ytrainOutliers = np.delete(ytrainOutliers, 0, 0) # xtestOutliers = xtestOutliers[1:] xtestOutliers = np.delete(xtestOutliers, 0, 0) ytestOutliers = np.delete(ytestOutliers, 0, 0) # Get the outliers for later inspection outliers = dict(xtrain=xtrainOutliers, xtest=xtestOutliers, 
ytrain=ytrainOutliers, ytest=ytestOutliers, anomaly_idx_train=anomaly_idx_train, anomaly_idx_test=anomaly_idx_test, xtrain_anomalyscore=xtrain_anomalyscore, xtest_anomalyscore=xtest_anomalyscore) return xtrain, xtest, ytrain, ytest, outliers, isoforest
def main(): start = time.clock() URLKeyword, URLchar, action, title = get_dic() #load training dataset mainfile = './data/file_list_20170430_new的副本.txt' WebDirectory = './data/file的副本/' MD5_list, flag_list, URL_list = traverse_directory(WebDirectory, mainfile) X_train = list() Y_train = flag_list for i in range(len(MD5_list)): URL = URL_list[i] Web_data = read_file(MD5_list[i]) web_vec = Web_feature(Web_data, title, action, MD5_list[i]) URL_vec = URL_feature(URL, URLKeyword, URLchar) feature = np.hstack((web_vec, URL_vec)) X_train.append(feature) # print(len(feature)) print(len(X_train), len(Y_train)) X_train = np.asarray(X_train) Y_train = np.asarray(Y_train) print(X_train.shape, Y_train.shape) #feature selection # for a_fea in range(70,60,-2): X_train, Y_train, F_index = feature_selection(X_train, Y_train, 70) # print(F_index) #train model # tuned_parameters = {'n_estimators': range(10, 120, 10), "max_samples": range(70, 270, 20),'contamination' # } clf = IsolationForest(contamination=0.06, n_estimators=90, max_samples=150, bootstrap=True) clf.fit(X_train, Y_train) # print("best parameter:", clf.best_params_) # print(clf.grid_scores_) # joblib.dump(clf,'Isolation_model.m') middle = time.clock() print(middle - start) y_pred = clf.predict(X_train) print('Accuracy Score(normalize=True):', accuracy_score(Y_train, y_pred, normalize=True)) evaluate_model(Y_train, y_pred) end = time.clock() print(end - middle) # print("Testing Score:%f"%clf.score(X_test,y_test)) #load testing dataset mainfile1 = './data/file_list_10000.txt' WebDirectory1 = './data/file1/' MD5_list1, flag_list1, URL_list1 = traverse_directory_t( WebDirectory1, mainfile1) X_test = list() Y_test = flag_list1 for h in range(len(MD5_list1)): s_fea = [] URL1 = URL_list1[h] Web_data1 = read_file(MD5_list1[h]) web_vec1 = Web_feature(Web_data1, title, action, MD5_list1[h]) URL_vec1 = URL_feature(URL1, URLKeyword, URLchar) feature1 = np.hstack((web_vec1, URL_vec1)) for j in F_index: s_fea.append(feature1[j]) X_test.append(s_fea) # print("********") print(len(X_test), len(Y_test)) #test model y_tpred = clf.predict(X_test) print('Accuracy Score(normalize=True):', accuracy_score(Y_test, y_tpred, normalize=True)) evaluate_model(Y_test, y_tpred) end2 = time.clock() print(end2 - end)
# X = X[indices]
# y = y[indices]
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

# # training only on normal data:
# X_train = X_train[y_train == 0]
# y_train = y_train[y_train == 0]

print('IsolationForest processing...')
model = IsolationForest()
tstart = time()
model.fit(X_train)
fit_time += time() - tstart
tstart = time()

scoring = -model.decision_function(X_test)  # the lower, the more normal
predict_time += time() - tstart
fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

if predict_time + fit_time > max_time:
    raise TimeoutError

f = interp1d(fpr_, tpr_)
tpr += f(x_axis)
tpr[0] = 0.

precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]
def Eval(clargs): __version__ = '1.0' usage = """train_flows [options] normaldatafile""" parser = OptionParser(usage=usage, version=__version__) parser.add_option("-x", "--vectorizerfile", action="store", type="string", \ default='/tmp/vectorizers.pkl', help="") parser.add_option("-v", "--verbose", action="store_true", default=False, \ help="enable verbose output") parser.add_option("-o", "--maliciousdatafile", action="store", type="string", \ default=None, help="An optional file of malicious http logs") parser.add_option("-m", "--maxfeaturesperbag", action="store", type="int", \ default=100, help="maximum number of features per bag") parser.add_option("-g", "--ngramsize", action="store", type="int", \ default=7, help="ngram size") parser.add_option("-f", "--features", action="store", type="string", \ default="01000100111111111111", help="An optional file for choosing which features to be extracted") parser.add_option("-t", "--maxtrainingfeatures", action="store", type="int", \ default=50000, help="maximum number of rows to train with per class") parser.add_option("-n", "--numtrees", action="store", type="int", \ default=200, help="number of trees in isolation forest") parser.add_option("-s", "--numsamples", action="store", type="int", \ default=8192, help="number of samples in each tree") Start=time.time() (opts, args) = parser.parse_args(clargs) if len(args) != 2: parser.error('Incorrect number of arguments') ftu=[] features = opts.features for i, j in enumerate(features): if opts.verbose: print(j, all_fields[i]) if j == 1 or j=='1': ftu.append(all_fields[i]) if opts.verbose: print ftu #ftu = ['method', 'user_agent', 'status_code'] # load the http data in to a data frame print('Loading HTTP data') df = load_brofile(args[0], fields_to_use) trainDf = load_brofile(args[1], fields_to_use) total_rows = len(df.index) if opts.verbose: print('Total number of rows: %d' % total_rows) if opts.maliciousdatafile != None: print('Reading malicious training data') df1 = load_brofile(opts.maliciousdatafile, fields_to_use) if opts.verbose: print('Read malicious data with %s rows ' % len(df1.index)) #if (len(df1.index) > opts.maxtrainingfeatures): # if opts.verbose: print('Too many malicious samples for training, downsampling to %d' % opts.maxtrainingfeatures) # df1 = df1.sample(n=opts.maxtrainingfeatures) #set the classes of the dataframes and then stitch them together in to one big dataframe df['class'] = 0 df1['class'] = 1 classedDf = pd.concat([df,df1], ignore_index=True) else: #we weren't passed a file containing class-1 data, so we should generate some of our own. noiseDf = create_noise_contrast(df, numSamples) if opts.verbose: print('Added %s rows of generated malicious data'%numSamples) df['class'] = 0 noiseDf['class'] = 1 classedDf = pd.concat([df,noiseDf], ignore_index=True) #that doesn't matter trainDf['class']=0; #spliting into training and evaluation sets classedDf['is_train']=False trainDf['is_train']=True enhancedDf = enhance_flow(pd.concat([trainDf,classedDf], ignore_index=True), ftu) # construct some vectorizers based on the data in the DF. We need to vectorize future log files the exact same way so we # will be saving these vectorizers to a file. 
vectorizers = build_vectorizers(enhancedDf, ftu, max_features=opts.maxfeaturesperbag, ngram_size=opts.ngramsize, verbose=opts.verbose) #use the vectorizers to featureize our DF into a numeric feature dataframe featureMatrix = featureize(enhancedDf, ftu, vectorizers, verbose=opts.verbose) #add the class column back in (it wasn't featurized by itself) featureMatrix['class'] = enhancedDf['class'] featureMatrix['is_train'] = enhancedDf['is_train'] #split out the train and test df's into separate objects train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False] #drop the is_train column, we don't need it anymore train = train.drop('is_train', axis=1) test = test.drop('is_train', axis=1) #print('Calculating features') Trees=opts.numtrees Samples=opts.numsamples clf = IsolationForest(n_estimators=Trees, max_samples=Samples) clf.fit(train.drop('class', axis=1)) testnoclass = test.drop('class', axis=1) print('Predicting') test.is_copy = False test['prediction'] = clf.decision_function(testnoclass) + 0.5 print('Analyzing') #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them ##From Here Left=0.001 Right=0.01 fpr, tpr, thresholds = roc_curve(test['class'], test['prediction'], pos_label=0) F=interpolate.interp1d(fpr, tpr, assume_sorted=True) x=np.logspace(np.log10(Left), np.log10(Right)) y=F(x) roc_auc=auc(x, y) plt.figure() plt.xscale('log') plt.plot(fpr, tpr, color='b') plt.plot(x,y, color='r') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic') plt.plot(plt.xlim(), plt.ylim(), ls="--", c=".3") plt.savefig("fig3.png") plt.clf() plt.close('all') print('Area Under the Curve = %.6f' %(roc_auc)) Min, Sec= divmod( int(time.time() - Start), 60 ) #print Min, Sec target= open('Results.txt', 'a') target.write(str(Trees)+' ') target.write(str(Samples)+' ') target.write(str(Min)+' ') target.write(str(Sec)+' ') target.write(str(roc_auc)) target.write("\n") target.write(str(features)) target.write("\n") target.write("\n") target.close() print("Minutes: %d, Seconds: %d" % (int(Min), int(Sec)) ) return roc_auc
print("Training: One Class SVM (Linear) : ",(Train_Accuracy(train_AD_L)),"%") print("Test: One Class SVM (Linear) : ",(Test_Accuracy(test_AD_L)),"%") # # Isolation Forest # In[58]: from sklearn.ensemble import IsolationForest # In[59]: IFA=IsolationForest() IFA.fit(Negatives) # In[60]: train_IFA=IFA.predict(Negatives) test_IFA=IFA.predict(Positives) # In[61]: print("Training: Isolation Forest: ",(Train_Accuracy(train_IFA)),"%") print("Test: Isolation Forest: ",(Test_Accuracy(test_IFA)),"%") # Isolation Forest has worked way better than one class SVM. Thus, considered as best anomaly detection model.
cur_dir = os.getcwd()
input_path = os.path.join(cur_dir, args["dataset"])  # whatever the test directory is named
frame = frame_from_dir(input_path)
data = []
model = DenseNet201(weights='imagenet', include_top=False)

# frame by frame passed through the feature extractor to extract features
for xy in range(len(frame)):
    img_path = os.path.join(input_path, frame[xy])
    img = image.load_img(img_path, target_size=(530, 700))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    vgg_feature = model.predict(img_data)
    data.append(vgg_feature)

a = np.array(data)
a = a.reshape(len(frame), -1)  # converting to a feature vector

# train the anomaly detection model
print("[INFO] fitting anomaly detection model...")
modelanom = IsolationForest(n_estimators=150, contamination=0.01, random_state=42)
modelanom.fit(a)

# serialize the anomaly detection model to disk
f = open(args["model"], "wb")
f.write(pickle.dumps(modelanom))
f.close()
target = lat.flatten()
for i in range(len(target)):
    if (target[i] == 1):
        target[i] = -1
    elif (target[i] == 0):
        target[i] = 1
    print(target[i])

scaler.fit(lon)
normalised_input_data = scaler.transform(lon)
print(type(normalised_input_data))

clf = IsolationForest(max_samples=100, random_state=42, contamination=.35)
clf.fit(normalised_input_data)
y_pred = clf.predict(normalised_input_data)

accu = 0.000
print(list(y_pred).count(-1))
print(len(y_pred))
no_outliers = list(y_pred).count(-1)
l = len(y_pred)
accu = no_outliers / l
print(accu)
print("Accuracy in Detecting Fraud Cases:", accu)
print(y_pred)
print(target)

plt.subplot(2, 1, 1)
plt.scatter(normalised_input_data[:, 0],
def anomaly_detection(testdata_name,rank_method_index,test_EVs_ts,test_MVs_ts): # Local Outlier Factor from sklearn.neighbors import LocalOutlierFactor from myFunctions import gen_dist_mat # experimentName = '{}_LOF'.format(testdata_name) # Choose ranking method # rank_group = rank_high_low rank_group = rank_methods[rank_method_index] rank_method_name = rank_methods_names[rank_method_index] test_weather_ts = test_EVs_ts[0] # test weather data # MV_index = 0 # MV we are examining MV_predictions = [] for MV_index in range(len(MVs)): predictions = [] for n in range(test_weather_ts.shape[0]): # The 20th closest weather data weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20] print('{} - group length:{}'.format(n,len(weather_group))) if len(weather_group) < 10: predictions.append('len<') continue # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0) LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed') D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0]*D.shape[1]: predictions.append('D=0') continue pred = LOF.fit_predict(D) predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1: plt.figure() # # draw only the current MV----- for c in weather_group: plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted') plt.plot(test_MVs_ts[MV_index,n],color='gold') #-------------------------------- # # draw for all MVs------------- # for index in range(MVs_ts.shape[0]): # for c in combination: # plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted') # plt.plot(test_MVs_ts[index,n],color='gold') # plt.show() # ------------------------------- dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index]) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) # save faulty plot plt.savefig(dir_loc + '\\n{}.png'.format(n)) plt.close() MV_predictions.append(np.array(predictions)) p_fault = np.empty(MV_predictions[0].shape,dtype = np.bool) # faulty p_normal = np.empty(MV_predictions[0].shape,dtype = np.bool) # normal p_lack = np.empty(MV_predictions[0].shape,dtype = np.bool) # lack of data p_fault[:] = False p_normal[:] = True # False p_lack[:] = True # False for predictions in MV_predictions: p_fault = np.logical_or(p_fault, predictions=='-1') normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0') p_normal = np.logical_and(p_normal,normal_with_identical) p_lack = np.logical_and(p_lack, predictions=='len<') # the indices of ts sample which are considered faulty fault_index = np.arange(len(p_fault))[p_fault] normal_index = np.arange(len(p_normal))[p_normal] lack_index = np.arange(len(p_lack))[p_lack] # print results: fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100) nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100) ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100) print(fd_rate) print(nd_rate) print(ld_rate) # Save results: dir_loc = 
r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName) with open(dir_loc+'\\results.txt','w') as f: f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate) # Isolation Forest from sklearn.ensemble import IsolationForest from myFunctions import gen_dist_mat # experimentName = '{}_IsolationForest'.format(testdata_name) # Choose ranking method # rank_group = rank_high_low rank_group = rank_methods[rank_method_index] rank_method_name = rank_methods_names[rank_method_index] # test_weather_ts = test_EVs_ts[0] # test weather data # MV_index = 0 # MV we are examining MV_predictions = [] for MV_index in range(len(MVs)): predictions = [] for n in range(test_weather_ts.shape[0]): # The 20th closest weather data weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20] print('{} - group length:{}'.format(n,len(weather_group))) if len(weather_group) < 10: predictions.append('len<') continue # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0) D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0]*D.shape[1]: predictions.append('D=0') continue IsoForest = IsolationForest() IsoForest.fit(NT_data) pred = IsoForest.predict(NT_data) predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1: plt.figure() # # draw only the current MV----- for c in weather_group: plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted') plt.plot(test_MVs_ts[MV_index,n],color='gold') #-------------------------------- # # draw for all MVs------------- # for index in range(MVs_ts.shape[0]): # for c in combination: # plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted') # plt.plot(test_MVs_ts[index,n],color='gold') # plt.show() # ------------------------------- dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index]) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) # save faulty plot plt.savefig(dir_loc + '\\n{}.png'.format(n)) plt.close() MV_predictions.append(np.array(predictions)) p_fault = np.empty(MV_predictions[0].shape,dtype = np.bool) # faulty p_normal = np.empty(MV_predictions[0].shape,dtype = np.bool) # normal p_lack = np.empty(MV_predictions[0].shape,dtype = np.bool) # lack of data p_fault[:] = False p_normal[:] = True # False p_lack[:] = True # False for predictions in MV_predictions: p_fault = np.logical_or(p_fault, predictions=='-1') normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0') p_normal = np.logical_and(p_normal,normal_with_identical) p_lack = np.logical_and(p_lack, predictions=='len<') # the indices of ts sample which are considered faulty fault_index = np.arange(len(p_fault))[p_fault] normal_index = np.arange(len(p_normal))[p_normal] lack_index = np.arange(len(p_lack))[p_lack] # print results: fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100) nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100) ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100) print(fd_rate) 
print(nd_rate) print(ld_rate) # Save results: dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName) with open(dir_loc+'\\results.txt','w') as f: f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)
# read arguments
parser = argparse.ArgumentParser()
parser.add_argument('--data-path', type=str, help='path to the dataset')
args = parser.parse_args()

# process data
df = pd.read_csv(args.data_path, index_col=None, header=None)  # read it
df = process_data(df)

# split data
train, test_norm, test_anom = split_data(df)
X_train, y_train = train  # unpack training data

# train model
model = IsolationForest(random_state=RAND_STATE, n_estimators=50)
model.fit(X_train.astype('float32'))

# convert to onnx
from skl2onnx import convert_sklearn
initial_types = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onx = convert_sklearn(model, initial_types=initial_types)
session = ort.InferenceSession(onx.SerializeToString())
input_name = session.get_inputs()[0].name
label_name = session.get_outputs()[0].name
del onx, model
model = session, input_name, label_name

if DEBUG:
    print(f'ONNX Runtime Device: {ort.get_device()}')

# score model
mlflow.log_metric('F1-Score Training Normal', compute_f1(model, train, 1))
mlflow.log_metric('F1-Score Testing Normal',
def test_iforest_deprecation():
    iforest = IsolationForest(behaviour='new')
    warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
    with pytest.warns(DeprecationWarning, match=warn_msg):
        iforest.fit(iris.data)
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
LABEL = "ddG_offset" y = dataset["ddG_offset"] for i in range(20): # Split into Train and Test (80/20) train, test = train_test_split(dataset, test_size=0.2) # Outlier exclusion step: with warnings.catch_warnings(): warnings.simplefilter("ignore") outlier_forest = IsolationForest(max_samples = "auto") outlier_forest.fit(train) y_no_outliers = outlier_forest.predict(train) y_no_outliers = pd.DataFrame(y_no_outliers, columns = ['Top']) y_no_outliers[y_no_outliers['Top'] == 1].index.values train = train.iloc[y_no_outliers[y_no_outliers['Top'] == 1].index.values] train.reset_index(drop = True, inplace = True) print("Number of outliers in training data:", y_no_outliers[y_no_outliers['Top'] == -1].shape[0]) # Normalisation step: scaler_ddG_offset = StandardScaler() mat_ddG_offset = np.array(train.ddG_offset).reshape((len(train)), 1) scaler_ddG_offset.fit(mat_ddG_offset)
# ## Improving the Prediction Model ##

# This part is about finding a better metric for predicting future house sales regarding their price.
#
# First, I will detect outliers and delete them from the dataset if needed (a short removal sketch follows after this chunk).

# ### Detecting Outliers ###

# The first step to improve our learning behaviour is to find outliers and then remove them from the data set if needed.
# To detect outliers I will use the Isolation Forest algorithm, which works well for high-dimensional data sets such as the one we have here.

# In[ ]:

from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(df)
y = clf.predict(df)
print(y)

# ### Location based prices ###

# House prices don't only depend on the size of the house or the number of rooms, but also strongly on the location of the house.
# To get an idea how the position might impact my data I analyse the relationship between location and price in my dataset.

# In[ ]:

import gmaps
gmaps.configure(api_key="AIzaSyDPWAl8lcrK9q-tOkrl64sGkxDnbWz47Ko")

locations = df[["lat", "long"]]
prices = df["price"]
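Picking up the outlier-removal idea mentioned above, a minimal sketch (not from the original notebook) of actually dropping the flagged rows, assuming y holds the +1/-1 predictions from clf.predict(df):

# keep only the rows predicted as inliers (+1); -1 marks the outliers
df_no_outliers = df[y == 1].reset_index(drop=True)
print("Removed %d potential outliers" % (len(df) - len(df_no_outliers)))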
pp = PdfPages(plotfolder + 'scatterplots.pdf')
for j, features in enumerate(feature_pairs):
    X, Y = features[0], features[1]
    print(j, 'of', len(feature_pairs))
    pair_features = np.array([INFO[features[0]], INFO[features[1]]]).T
    forest = IsolationForest(
        n_estimators=100,
        # max_samples=1000,
        random_state=0,
        contamination=num_outlier / 343546.0  # number of nodes
    )
    fig = scatter_plot(INFO[X], INFO[Y], INFO['IDs'], discription[Y], discription[X],
                       discription[Y] + ' vs ' + discription[X], compare_value[X])
    forest.fit(pair_features)
    scores = forest.decision_function(pair_features[outlier_ids, :])
    rank_list = sorted([(outliers[i], -s) for (i, s) in enumerate(scores)], key=lambda x: x[1], reverse=True)
    rank_matrix.append(rank_list)
pp.close()
print(rank_matrix)

# runs properly till this; why is generate_graph returning nothing?
scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, num_outlier, rank_matrix)
plots = plotSpot(budget, scaled_matrix, "SpellOut")
frequencies = generate_frequency_list(plots, scaled_matrix)
for i, plot in enumerate(plots):
y = mat_data['y']
file_name = 'experiment_results/' + datasets[i] + '.txt'
File_object = open(file_name, "w")
time_all = np.zeros((trials, 5))
precision_all = np.zeros((trials, 5))
auc_all = np.zeros((trials, 5))

for j in range(0, trials):
    print('\n\n******' + datasets[i] + ' trial ' + str(j + 1) + '*******\n\n')

    print('\n******Iso-Forest*******\n')
    start = time.time()
    clf = IsolationForest(contamination=0.1, behaviour='new')
    clf.fit(X)
    end = time.time()
    time_all[j, 0] = end - start
    iso_scores = clf.score_samples(X)

    if run_lof_svm == 0:
        lof_scores = iso_scores
        osvm_scores = iso_scores
    elif j == 0:
        print('\n******LOF*******\n')
        start = time.time()
        lof = LocalOutlierFactor()
        lof.fit(X)
        end = time.time()
        time_all[j, 1] = end - start
def detect(self):
    '''
    Detect outliers with an isolation forest.
    '''
    # Get the preprocessed data
    data = self.preprocess()

    # Outlier detection
    # Create the IsolationForest
    ilf = IsolationForest(
        n_estimators=self.n_estimators,
        n_jobs=-1,  # use all CPUs
        verbose=self.verbose,
        contamination=self.contamination,  # proportion of outliers
    )

    # Control flow for saving/loading the model
    if self.isSaveModel and self.isLoadModel:
        # isSaveModel = True & isLoadModel = True
        # Train, save the model locally, then continue to prediction
        print('Model training...')
        ilf.fit(data)
        # Save the model locally
        print('Saving model to `%s`...' % self.modelname)
        with open(self.modelname, 'wb') as fp:
            pickle.dump(ilf, fp)
    elif self.isSaveModel:
        # isSaveModel = True & isLoadModel = False
        # Train and save the model locally, then stop without predicting
        print('Model training...')
        ilf.fit(data)
        # Save the model locally
        print('Saving model to `%s`...' % self.modelname)
        with open(self.modelname, 'wb') as fp:
            pickle.dump(ilf, fp)
        print('Don\'t predict.')
        return
    elif self.isLoadModel:
        # isSaveModel = False & isLoadModel = True
        # Load the local model directly, then continue to prediction
        print('Loading model from `%s`...' % self.modelname)
        with open(self.modelname, 'rb') as fp:
            ilf = pickle.load(fp)
    else:
        # isSaveModel = False & isLoadModel = False
        # Train without saving the model, then continue to prediction
        print('Model training...')
        ilf.fit(data)

    # Prediction
    print('Outliers predicting...')
    shape = data.shape[0]
    all_pred = []
    all_score = []
    for i in range(int(shape / self.batch) + 1):
        start = i * self.batch
        end = (i + 1) * self.batch
        batch_test = data[start:end]
        # Predict: +1 means a normal sample, -1 means an outlier
        pred = ilf.predict(batch_test)
        # Anomaly score of each sample; the lower the score, the more likely it is an outlier
        score = ilf.decision_function(batch_test)
        all_pred.extend(pred)
        all_score.extend(score)

    data['timestamp'] = self.origin_data['timestamp']
    data['is_outlier'] = all_pred
    data['outlier_score'] = all_score
    # Convert the output column values
    data['timestamp'] = data['timestamp'].astype('int64')
    data.loc[data.is_outlier == 1, 'is_outlier'] = 0
    data.loc[data.is_outlier == -1, 'is_outlier'] = 1

    print('Writing `%s`...' % self.output_filename)
    data.to_csv(self.output_filename, columns=['timestamp', 'outlier_score', 'is_outlier'],
                header=True, index=0)
random_state=0, shuffle=True)

'''__________________Anomaly detection________________________________'''
cont = 0.1
IS = IsolationForest(max_samples=300, contamination=cont, max_features=1.0, random_state=0)
IS.fit(_xtrain)
pred_train = IS.predict(_xtrain)
# for i in pred_train: print(i)
print(pred_train[-6] == -1)

num_of_anam = []
for i in pred_train:
    if i == -1:
        num_of_anam += [1]
else:
    print('number of anomalies given contamination of %s : %d ' % (cont, len(num_of_anam)))
print(x_value.columns)
print(x_value.shape)

# Print shapes
print(x_value.shape)
print(y_value.shape)

# Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value), contamination=outlier_value, random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)
n_outlier = len(fraudal_count)

# fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)
y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

# Change the value to 0 for valid and 1 for fraudulent cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1

n_error_isf = (y_predict_isf != y_value).sum()
n_error_lof = (y_predict_lof != y_value).sum()
print("Error value for Isolation forest ", n_error_isf)
r2_score(train.y, ens3_insample_pred)  # 0.70266651298615024

# Predict
ens3_pred = ens3.predict(df_test)

# LB:
submission = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y = ens3_pred
submission.id = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_gbreg.csv', index=False)

print("Ensemble Model 4: IsolationForest")
ens4 = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.1, max_features=1.0,
                       bootstrap=False, n_jobs=1, random_state=None, verbose=0)
ens4.fit(df, train.y)

# In Sample R2
ens4_insample_pred = ens4.predict(df)
print(r2_score(train.y, ens4_insample_pred))

# Predict
ens4_pred = ens4.predict(df_test)

# LB:
submission = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y = ens4_pred
submission.id = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_isolationforest.csv', index=False)

print("Ensemble Model 5: RandomTreesEmbedding")
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(1, 3))  # converting data to vectors
X = vectorizer.fit_transform(queries)
display_scores(vectorizer, X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # splitting data

badCount = len(badQueries)
validCount = len(validQueries)

# lgs = LogisticRegression(class_weight={1: 2 * validCount / badCount, 0: 1.0})  # class_weight='balanced')
# lgs = LogisticRegression(penalty='l1')
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=100, random_state=rng, n_jobs=4,
                      contamination=badCount / (validCount + badCount))  # fraction of bad queries
print('fitting')
# lgs.fit(X_train, y_train)  # training our model
clf.fit(X_train)  # training our model
print('done')

y_pred_train = clf.predict(X_train)
y1 = np.array(y_pred_train)
y2 = np.array(y_train)
print(len(y_pred_train))
print(len(y_train))
print(np.sum(y1 == y2))

##############
# Evaluation #
##############
# predicted = lgs.predict(X_test)
print(clf.predict(vectorizer.transform(['/<script>alert(123)</script>'])))
esc = x
escala = MinMaxScaler()
escala.fit(esc)
escalada = escala.transform(esc)

pca = PCA(n_components=2)
pca.fit(escalada)
transformada = pca.transform(escalada)

# plot of the data
mglearn.discrete_scatter(transformada[:, 0], transformada[:, 1])

modelo = IsolationForest(n_estimators=100, max_samples=256, contamination=0.02)
modelo.fit(transformada)
predict = modelo.predict(transformada)

"""
The model will be retrained with a new data set; this dataset contains malware behaviour.
"""
dataframe2 = pd.read_csv('trafico_prueba_2016.csv', index_col='Time')
dataframe2['Count'] = np.nan
df2 = dataframe2.groupby(['Time', 'Src Port', 'Dst Port', 'Source', 'Protocol', 'Length']).size().reset_index(name='counts')
dfpredict2 = df2.copy()
dfpredict2['Time'] = encoder.fit_transform(dfpredict2['Time'])
dfpredict2['Src Port'] = encoder.fit_transform(dfpredict2['Src Port'])
training_data = np.asarray(df)
for t in training_data:
    print(t)

# TODO - Data pre-processing step
# Either eliminate the date column altogether or convert it to unix epoch time.

# standardize - this normalizes between 0 and 1. This data can then be used for plotting simultaneous lines
scaler = MinMaxScaler().fit(training_data)
training_data_transformed = scaler.transform(training_data)
for t in training_data_transformed:
    print(t)

# create model
model = IsolationForest()
model.fit(training_data_transformed)
prediction = model.predict(training_data_transformed)

# see classification results
for p in prediction:
    if p == -1:
        anomaly_count += 1
        print("anomaly ", p)
    else:
        normal_count += 1
        print("normal ", p)

print("anomaly count: ", anomaly_count)
print("normal count: ", normal_count)

# save model & scaler for later application
pickle.dump(model, open(model_file, 'wb'))
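The script above pickles the model for "later application"; a minimal companion sketch (not part of the original script) for that later step follows. It assumes model_file is the same path used in pickle.dump above, and new_df is a hypothetical DataFrame of fresh readings to be scaled with the same MinMaxScaler.

import pickle
import numpy as np

# reload the serialized Isolation Forest saved above
loaded_model = pickle.load(open(model_file, 'rb'))

# new_df is a hypothetical DataFrame of new readings; reuse the fitted scaler from above
new_data = scaler.transform(np.asarray(new_df))
predictions = loaded_model.predict(new_data)        # +1 = normal, -1 = anomaly
scores = loaded_model.decision_function(new_data)   # lower scores are more anomalous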
df_All = shuffle(df_All)
df_X = df_All.drop(["certid", "label"], axis=1, inplace=False)
df_y = df_All["label"]
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2)
X_cols = X_train.columns
y_train = y_train.values
y_test = y_test.values

# IF_clf = LocalOutlierFactor(contamination=0.1)
# y_pred_train = IF_clf.fit_predict(X_train)
IF_clf = IsolationForest(n_estimators=1000, contamination=0.1, n_jobs=-1, bootstrap=True)
IF_clf.fit(X_train)
y_pred_train = IF_clf.predict(X_train)

A = pd.DataFrame(X_train, columns=X_cols)
B = pd.DataFrame(y_train, columns=["label_ori"])
C = pd.DataFrame(y_pred_train, columns=["label_IF"])
print(A.shape)
print(B.shape)
print(C.shape)
new_tran_df = pd.concat([A, B, C], axis=1)
# print(new_tran_df)

# new_tran_df = new_tran_df[new_tran_df["label_IF"] > 0]
new_tran_df_0 = new_tran_df[new_tran_df["label_IF"] == 1]  # points the isolation forest considers normal
def anomaly_detection(X, name='anomaly'):
    pr = IsolationForest()
    pr.fit(filter_numerical(X))
    x = pr.predict(filter_numerical(X))
    X[name] = x
    X[name] = X[name].astype(str)
li_ = np.array(li)
x, y = li_.shape
# if x < 100:
pca = PCA(n_components=lenth)
li_low = pca.fit_transform(li_)

rng = np.random.RandomState(42)
# build the training samples
n_samples = len(li_low)       # total number of samples
outliers_fraction = 0.4       # expected proportion of outlier samples
X_train = li_low

# fit the model
clf = IsolationForest(max_samples=n_samples, random_state=rng, contamination=outliers_fraction)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
scores_pred = clf.decision_function(X_train)
print(video_name)
print(y_pred_train)

# LOF
# clf = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
# y_pred_train = clf.fit_predict(X_train)
# print(video_name)
# print(y_pred_train)

i = i + 1
print(i)
# merge
vehicle = pd.merge(rpm, speed, how='outer', on='timestamp')

# drop null values and zero speeds --> neutral gear
# speed < 200 to remove outliers
vh = vehicle.dropna(axis=0)
vh = vh[(vh['rpm'] > 0) & ((vh['speed'] > 0) & (vh['speed'] < 200))]

# detect outliers using IsolationForest
# assume contamination at the 0.01 level
distances = pairwise_distances(vh[['rpm', 'speed']], vh[['rpm', 'speed']], metric='cosine')
clf = IsolationForest(max_samples=100, contamination=0.01, verbose=1)
clf.fit(distances)
labels = clf.predict(distances)
vh['outlier'] = labels

# remove outliers found by IsolationForest
vh = vh[['rpm', 'speed']][vh['outlier'] == 1]

# recompute distances after outlier removal
distances = pairwise_distances(vh[['rpm', 'speed']], vh[['rpm', 'speed']], metric='cosine')

# initialize variable to keep best model, its silhouette score and predicted labels
best_model = (None, -1, None)

# iterate over possible number of gears
# since we want to pick the model with the best silhouette score, we can't start with a single cluster (k=1)
class IForest(BaseDetector): """Wrapper of scikit-learn Isolation Forest with more functionalities. The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. See :cite:`liu2008isolation,liu2012isolation` for details. Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of such random trees, is a measure of normality and our decision function. Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies. Parameters ---------- n_estimators : int, optional (default=100) The number of base estimators in the ensemble. max_samples : int or float, optional (default="auto") The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If "auto", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling). contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. max_features : int or float, optional (default=1.0) The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. bootstrap : boolean, optional (default=False) If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed. n_jobs : integer, optional (default=1) The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. verbose : int, optional (default=0) Controls the verbosity of the tree building process. Attributes ---------- estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. estimators_samples_ : list of arrays The subset of drawn samples (i.e., the in-bag samples) for each base estimator. max_samples_ : integer The actual number of samples decision_scores_ : numpy array of shape (n_samples,) The outlier scores of the training data. The higher, the more abnormal. Outliers tend to have higher scores. This value is available once the detector is fitted. threshold_ : float The threshold is based on ``contamination``. It is the ``n_samples * contamination`` most abnormal samples in ``decision_scores_``. The threshold is calculated for generating binary outlier labels. labels_ : int, either 0 or 1 The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. It is generated by applying ``threshold_`` on ``decision_scores_``. 
""" def __init__(self, n_estimators=100, max_samples="auto", contamination=0.1, max_features=1., bootstrap=False, n_jobs=1, random_state=None, verbose=0): super(IForest, self).__init__(contamination=contamination) self.n_estimators = n_estimators self.max_samples = max_samples self.max_features = max_features self.bootstrap = bootstrap self.n_jobs = n_jobs self.random_state = random_state self.verbose = verbose def fit(self, X, y=None): """Fit detector. y is optional for unsupervised methods. Parameters ---------- X : numpy array of shape (n_samples, n_features) The input samples. y : numpy array of shape (n_samples,), optional (default=None) The ground truth of the input samples (labels). """ # validate inputs X and y (optional) X = check_array(X) self._set_n_classes(y) self.detector_ = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples, contamination=self.contamination, max_features=self.max_features, bootstrap=self.bootstrap, n_jobs=self.n_jobs, random_state=self.random_state, verbose=self.verbose) self.detector_.fit(X=X, y=None, sample_weight=None) # invert decision_scores_. Outliers comes with higher outlier scores. self.decision_scores_ = invert_order( self.detector_.decision_function(X)) self._process_decision_scores() return self def decision_function(self, X): """Predict raw anomaly score of X using the fitted detector. The anomaly score of an input sample is computed based on different detector algorithms. For consistency, outliers are assigned with larger anomaly scores. Parameters ---------- X : numpy array of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- anomaly_scores : numpy array of shape (n_samples,) The anomaly score of the input samples. """ check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) # invert outlier scores. Outliers comes with higher outlier scores return invert_order(self.detector_.decision_function(X)) @property def estimators_(self): """The collection of fitted sub-estimators. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_ @property def estimators_samples_(self): """The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_samples_ @property def max_samples_(self): """The actual number of samples. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.max_samples_
    # ... tail of ohEncoding(): replace the original columns with the one-hot
    # encoded ones and return
    data = data.drop(cols, axis=1)
    data = data.join(vecData)
    return data, vecData, vec

df, t, v = ohEncoding(df, col, replace=True)
print "Shape after encoding"
print df.shape
df_unlabeled = df.drop("Anomaly", axis=1)
print "Shape of the dataframe without anomaly column: "
print df_unlabeled.shape

clf = IsolationForest(max_samples=6444, verbose=1, n_jobs=-1,
                      contamination=0.255555, bootstrap=True, max_features=9)
clf.fit(df_unlabeled)
pred = clf.predict(df_unlabeled)
# print type(pred)
# print data.shape
# print len(pred)
# print pred
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
# print anomalies
# print type(anomalies)
df['ISO1'] = pred

# iterate over rows, comparing labelled vs. detected anomalies
nLabAno = 0
nDetAno = 0
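The excerpt stops just as the row-by-row comparison begins. One plausible sketch of that loop, assuming the Anomaly column marks labelled anomalies with 1 and ISO1 holds the IsolationForest output (-1 for detected anomalies); both conventions are inferred from the column names above, not confirmed by the source.

nLabAno = 0  # rows labelled as anomalies in the data set
nDetAno = 0  # labelled anomalies that IsolationForest also flagged as -1
for _, row in df.iterrows():
    if row['Anomaly'] == 1:
        nLabAno += 1
        if row['ISO1'] == -1:
            nDetAno += 1
print("Labelled anomalies: %d, detected by IsolationForest: %d" % (nLabAno, nDetAno))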
train_numerical.fillna(0, inplace=True)
train_categoric = train.select_dtypes(include=["object"])
train_categoric.fillna("NONE", inplace=True)
train = train_numerical.merge(train_categoric, left_index=True, right_index=True)

test = pd.read_csv("./test.csv")
ID = test.Id
test.drop("Id", axis=1, inplace=True)
test_numerical = test.select_dtypes(exclude=["object"])
test_numerical.fillna(0, inplace=True)
test_categoric = test.select_dtypes(include=["object"])
test_categoric.fillna("NONE", inplace=True)
test = test_numerical.merge(test_categoric, left_index=True, right_index=True)

clf = IsolationForest(max_samples=100, random_state=42)
clf.fit(train_numerical)
y_noano = clf.predict(train_numerical)
y_noano = pd.DataFrame(y_noano, columns=["Top"])

train_numerical = train_numerical.iloc[y_noano[y_noano["Top"] == 1].index.values]
train_numerical.reset_index(drop=True, inplace=True)
train_categoric = train_categoric.iloc[y_noano[y_noano["Top"] == 1].index.values]
train_categoric.reset_index(drop=True, inplace=True)
train = train.iloc[y_noano[y_noano["Top"] == 1].index.values]
train.reset_index(drop=True, inplace=True)

col_train_num = list(train_numerical.columns)
col_train_num_bis = list(train_numerical.columns)
col_train_cat = list(train_categoric.columns)
col_train_num_bis.remove("SalePrice")
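The three iloc calls above all reuse the positional index of the +1 predictions. A compact sketch of the same inlier-filtering idiom on a synthetic frame; the toy data and column names are invented for illustration.

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
df = pd.DataFrame({"a": rng.randn(100), "b": rng.randn(100)})
df.loc[df.index % 25 == 0, "a"] = 10.0   # inject a few obvious outliers

clf = IsolationForest(max_samples=100, random_state=42)
clf.fit(df)
pred = clf.predict(df)                   # +1 for inliers, -1 for outliers

inlier_idx = np.where(pred == 1)[0]
df_clean = df.iloc[inlier_idx].reset_index(drop=True)
print(len(df), "->", len(df_clean), "rows after outlier removal")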
# In[28]:

def falsenegative_accuracy(values):
    tn = list(values).count(-1)
    total = values.shape[0]
    accuracy = np.round(tn / total, 4)
    return accuracy

# In[29]:

st.subheader("Accuracy score for Isolation Forest")
ISF = IsolationForest(random_state=42)
ISF.fit(ins)
falsepositive_isf = ISF.predict(ins)
falsenegative_isf = ISF.predict(outs)
in_accuracy_isf = falsepositive_accuracy(falsepositive_isf)
out_accuracy_isf = falsenegative_accuracy(falsenegative_isf)
st.write("Accuracy in detecting false positive alarms:", in_accuracy_isf)
st.write("Accuracy in detecting false negative alarms:", out_accuracy_isf)

# In[30]:

st.subheader("Accuracy score for Local Outlier Factor")
LOF = LocalOutlierFactor(novelty=True)
LOF.fit(ins)
falsepositive_lof = LOF.predict(ins)
falsenegative_lof = LOF.predict(outs)
in_accuracy_lof = falsepositive_accuracy(falsepositive_lof)
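falsepositive_accuracy is called above but not defined in this excerpt. A plausible sketch, assuming it mirrors falsenegative_accuracy and counts the +1 (inlier) predictions on the known-normal set; this is a guess at the missing helper, not the original code.

import numpy as np

def falsepositive_accuracy(values):
    # fraction of known-normal samples that the detector kept as inliers (+1)
    tp = list(values).count(1)
    total = values.shape[0]
    return np.round(tp / total, 4)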
iforest = IsolationForest()
lof = LocalOutlierFactor(n_neighbors=20)
ocsvm = OneClassSVM()

lim_inf = X.min(axis=0)
lim_sup = X.max(axis=0)
volume_support = (lim_sup - lim_inf).prod()
t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

# fit:
print('IsolationForest processing...')
iforest = IsolationForest()
iforest.fit(X_train)
s_X_iforest = iforest.decision_function(X_test)
print('LocalOutlierFactor processing...')
lof = LocalOutlierFactor(n_neighbors=20)
lof.fit(X_train)
s_X_lof = lof.decision_function(X_test)
print('OneClassSVM processing...')
ocsvm = OneClassSVM()
ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
s_unif_iforest = iforest.decision_function(unif)
s_unif_lof = lof.decision_function(unif)
s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]

plt.subplot(121)
auc_iforest, em_iforest, amax_iforest = em(t, t_max, volume_support,
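One caveat: since scikit-learn 0.20, LocalOutlierFactor only exposes decision_function for data not seen during fit when it is constructed with novelty=True; the benchmark-style code above relies on older behaviour. A sketch of the modern equivalent, reusing the variable names above.

from sklearn.neighbors import LocalOutlierFactor

# novelty=True enables decision_function/predict on data not seen during fit
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train)
s_X_lof = lof.decision_function(X_test)      # scores for the held-out data
s_unif_lof = lof.decision_function(unif)     # scores on the uniform reference sample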
def get_features(x):
    return resnet_model.predict(x)

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn import svm

# Apply standard scaler to output from resnet50
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Take PCA to reduce feature space dimensionality
pca = PCA(n_components=512, whiten=True)
pca = pca.fit(X_train)
print('Explained variance percentage = %0.2f' % sum(pca.explained_variance_ratio_))
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

# Train classifiers and obtain predictions
oc_svm_clf = svm.OneClassSVM(gamma=0.001, kernel='rbf', nu=0.08)  # Obtained using grid search
if_clf = IsolationForest(contamination=0.08, max_features=1.0,
                         max_samples=1.0, n_estimators=40)        # Obtained using grid search

oc_svm_clf.fit(X_train)
if_clf.fit(X_train)

oc_svm_preds = oc_svm_clf.predict(X_test)
if_preds = if_clf.predict(X_test)
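Both detectors return +1/-1 labels, so one simple way to compare them is to score those labels against ground truth. A sketch assuming a hypothetical y_test encoded the same way (+1 normal, -1 anomalous); y_test is not part of the original excerpt.

from sklearn.metrics import accuracy_score, f1_score

# y_test is hypothetical here: +1 for normal images, -1 for anomalies
print('OC-SVM  accuracy: %.3f  F1: %.3f' % (
    accuracy_score(y_test, oc_svm_preds),
    f1_score(y_test, oc_svm_preds, pos_label=-1)))
print('IForest accuracy: %.3f  F1: %.3f' % (
    accuracy_score(y_test, if_preds),
    f1_score(y_test, if_preds, pos_label=-1)))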
def _IsolationForest(X):
    rng = np.random.RandomState(42)
    clf = IsolationForest(max_samples=X.shape[0], random_state=rng)
    return clf.fit(X).predict(X)
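A quick usage sketch of this helper; the toy array is made up, with the last point an obvious outlier.

import numpy as np
from sklearn.ensemble import IsolationForest

X = np.array([[0.0, 0.1], [0.1, 0.0], [-0.1, 0.1], [0.0, -0.1], [8.0, 9.0]])
print(_IsolationForest(X))  # the far-away last point should typically come back as -1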
print "ERROR" sys.exit(-1) Xtrain.append(column_train) Xtest.append(column_test) Xtrain = np.transpose(np.array(Xtrain)) Xtest = np.transpose(np.array(Xtest)) idx_train = idx_train[:Xtrain.shape[0]] idx_test = idx_test[:Xtest.shape[0]] # fit an iforest iforest = IsolationForest(n_estimators=ntrees, max_samples=sample_frac, max_features=feat_frac, n_jobs=-1, random_state=rng, verbose=1) iforest.fit(Xtrain) # anomaly scores y_pred_train = iforest.predict(Xtrain) y_pred_test = iforest.predict(Xtest) train_feature_values = [(gid, val) for gid, val in zip(idx_train, list(y_pred_train))] test_feature_values = [(gid, val) for gid, val in zip(idx_test, list(y_pred_test))] for i, scenario in enumerate(MALICIOUS_SCENARIOS): all_feature_values = train_feature_values + \ [(gid, feat_value) for gid, feat_value in test_feature_values if gid/100 in BENIGN_SCENARIOS or gid/100 == scenario] all_values = np.array([feat_value
# ocsvm = OneClassSVM()
# ocsvm = OneClassSVM(kernel='linear', degree=2, gamma='auto', nu=0.5)
ocsvm = OneClassSVM(gamma='auto', nu=0.01)

lim_inf = X.min(axis=0)
lim_sup = X.max(axis=0)
volume_support = (lim_sup - lim_inf).prod()
t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

# fit:
print('IsolationForest processing...')
iforest = IsolationForest()
iforest.fit(X_train)
s_X_iforest = iforest.decision_function(X_train)
print('LocalOutlierFactor processing...')
lof.fit(X_train)
s_X_lof = lof.decision_function(X_train)
print('OneClassSVM processing...')
ocsvm.fit(X_train)
s_X_ocsvm = ocsvm.decision_function(X_train).reshape(1, -1)[0]
s_unif_iforest = iforest.decision_function(unif)
s_unif_lof = lof.decision_function(unif)
s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]

plt.subplot(121)
print("t is: ", t)
featureMatrix['is_train'] = np.random.uniform(0, 1, len(featureMatrix)) <= .75

# split out the train and test df's into separate objects
train, test = featureMatrix[featureMatrix['is_train'] == True], featureMatrix[featureMatrix['is_train'] == False]

# drop the is_train column, we don't need it anymore
train = train.drop('is_train', axis=1)
test = test.drop('is_train', axis=1)

# create the isolation forest class and factorize the class column
clf = IsolationForest(n_estimators=opts.numtrees)

# train the isolation forest on the training set, dropping the class column
# (since the trainer takes that as a separate argument)
print('\nTraining')
clf.fit(train.drop('class', axis=1))

# remove the 'answers' from the test set
testnoclass = test.drop('class', axis=1)

print('\nPredicting (class 1 is normal, class -1 is malicious)')

# evaluate our results on the test set
test.is_copy = False
test['prediction'] = clf.predict(testnoclass)
print

# group by class (the real answers) and prediction (what the forest said);
# we want these values to match for 'good' answers
results = test.groupby(['class', 'prediction'])
resultsagg = results.size()
print(resultsagg)
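The grouped sizes can be turned into a small confusion-matrix-style summary. A sketch assuming the class column uses the same 1/-1 convention as the predictions, which the print statement above suggests but does not guarantee.

# pivot the (class, prediction) group sizes into a confusion-matrix-like table
confusion = resultsagg.unstack(fill_value=0)
print(confusion)

# entries where the real class and the forest's prediction agree are correct
correct = sum(n for (c, p), n in resultsagg.items() if c == p)
total = resultsagg.sum()
print('Test accuracy: %.3f' % (float(correct) / total))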