def dorc(preprocessedData, random_state, outliers_fraction=0.1):
    """Flag rare cells with an isolation forest and an IQR-based cut-off.

    An ``IForest`` detector is fitted on ``preprocessedData`` and then used to
    score the same samples. Every sample whose score is at or above
    ``Q3 + 1.5 * IQR`` of all scores is marked as rare.

    Parameters
    ----------
    preprocessedData : array-like
        Samples to fit and score; ``shape[0]`` is used as the sample count.
    random_state
        Forwarded unchanged to ``IForest``.
    outliers_fraction : float, optional (default=0.1)
        Forwarded to ``IForest`` as ``contamination``.

    Returns
    -------
    predictions : numpy array
        ``1`` for the selected samples, ``0`` for all others.
    scores
        The values returned by ``decision_function``.
    duration : float
        Elapsed wall-clock seconds, rounded to four digits.
    """
    started = time.time()

    detector = IForest(contamination=outliers_fraction,
                       random_state=random_state,
                       n_jobs=-1)
    detector.fit(preprocessedData)
    scores = detector.decision_function(preprocessedData)

    # Upper fence of the score distribution: third quartile plus 1.5 * IQR.
    upper_fence = np.percentile(scores, 75) + (1.5 * stats.iqr(scores))

    # Positions of the samples that reach the fence.
    indIqr = np.where(scores >= upper_fence)[0]
    print('shape of selected cells : {}'.format(indIqr.shape))

    # Binary prediction vector: everything starts at 0, rare cells become 1.
    predictions = np.zeros(preprocessedData.shape[0])
    predictions[indIqr] = 1

    duration = round(time.time() - started, ndigits=4)
    print("Total running DoRC time is :" + str(duration) + " s")
    return predictions, scores, duration
def do_iforest(x, n_estimators=100, max_samples=512):
    """Fit an unseeded ``IForest`` on ``x`` and report what it flags.

    Parameters
    ----------
    x : array-like
        Samples used both for fitting and for scoring.
    n_estimators : int, optional (default=100)
        Forwarded to ``IForest``.
    max_samples : int, optional (default=512)
        Forwarded to ``IForest``.

    Returns
    -------
    tuple
        ``(detector, scores, index)``: the fitted detector, the result of
        ``decision_function(x)``, and the positions where ``fit_predict``
        returned ``1``.
    """
    detector = IForest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        behaviour="new",
        random_state=None,
    )
    labels = detector.fit_predict(x)
    flagged = np.where(labels == 1)[0]
    return detector, detector.decision_function(x), flagged
class IForestPyOD(BaseAlgorithm):
    """Adapter exposing an ``IForest`` detector through ``fit``/``predict``."""

    name = "iForest_pyod"

    def __init__(self, t=100, psi=256):
        """Create the wrapped detector.

        :param t: passed to ``IForest`` as ``n_estimators``
        :param psi: passed to ``IForest`` as ``max_samples``
        """
        settings = {
            "n_estimators": t,
            "max_samples": psi,
            "contamination": 0.1,
            "behaviour": "new",
        }
        self.iforest = IForest(**settings)

    def fit(self, X):
        """Fit the wrapped detector on ``X``."""
        self.iforest.fit(X)

    def predict(self, X):
        """Return ``decision_function(X)`` of the wrapped detector.

        Note that these are scores, not the output of the detector's own
        ``predict`` method.
        """
        scores = self.iforest.decision_function(X)
        return scores
def detect(self, X, y=None):
    """
    Fit an isolation forest on ``X`` and score the same samples.

    :param X: Dataframe
    :param y: np.array (not read by this method)
    :return: outlier scores
    """
    forest = IForest(
        n_estimators=200,                        # number of trees in the forest
        contamination=0.5,                       # assumed fraction of anomalous samples
        max_samples='auto',
        random_state=np.random.RandomState(42),  # fixed seed
    )
    forest.fit(X)
    return forest.decision_function(X)
def main():
    """Train and evaluate an ``IForest`` detector three times on one split.

    The data returned by ``pre_data()`` is mean-imputed, split 70/30 with a
    fixed seed, and then an ``IForest`` with default settings is fitted on the
    training part. For each run the accuracy, precision and recall of the
    training labels are printed, followed by ``evaluate_print`` on the training
    scores, then the same three metrics for the test predictions, followed by
    ``evaluate_print`` on the test scores.

    NOTE(review): the original indentation was lost; the whole
    fit/evaluate sequence is assumed to be the body of the 3-iteration loop.
    An alternative fixed-index split (rows 0-999 and 6000-9999 for training,
    rows 1000-5999 for testing) existed only as commented-out code.
    """
    dataset, label = pre_data()

    from numpy import nan as NA
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score

    # Replace missing values by the column mean before splitting.
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)

    x_train, x_test, y_train, y_label = train_test_split(
        dataset, label, test_size=0.3, random_state=44)

    clf_name = 'IForest'
    for _ in range(3):
        clf = IForest()
        clf.fit(x_train)

        # Prediction labels and outlier scores of the training data.
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))

        # Predictions on the held-out data.
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        # Test-set metrics. Precision and recall previously re-used the
        # training labels/predictions here, so the test figures were never
        # actually reported.
        print(accuracy_score(y_label, y_test_pred))
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))

        print("\nOn Test Data:")
        evaluate_print(clf_name, y_label, y_test_scores)
def get_IF_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    """Label the rows of ``dataframe`` as inliers/outliers with Isolation Forest.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is modified in place: the selected columns are
        rescaled when ``standardize`` is true, and an ``'outlier'`` column
        is added.
    cols : list
        Names of the columns used as features.
    outliers_fraction : float, optional (default=0.01)
        Passed to ``IForest`` as ``contamination``.
    standardize : bool, optional (default=True)
        When true, the selected columns are min-max scaled to ``[0, 1]``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the ``'outlier'`` column
        (the detector's ``predict`` output). The frame is also stored on
        ``CheckOutliers.df3``, as before. Earlier versions returned ``None``
        although the docstring promised the frame.
    """
    if standardize:
        # Rescale the selected variables to the unit interval.
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # Stack the selected columns side by side into one 2-D feature array.
    X = np.concatenate(
        [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

    clf = IForest(contamination=outliers_fraction, random_state=0)
    clf.fit(X)

    # Category predicted for each data point; non-zero entries are counted
    # as outliers below.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df3 = dataframe
    CheckOutliers.df3['outlier'] = y_pred.tolist()

    # The message used to say "HBOS", a different detector from the one
    # fitted here.
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
          'found with Isolation Forest')
    return dataframe
X_train, y_train, X_test, y_test = generate_data( n_train=n_train, n_test=n_test, contamination=contamination) # train IForest detector clf_name = 'IForest' clf = IForest() clf.fit(X_train) # get the prediction label and decision_scores_ on the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred,
# Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

# Fit the IForest algorithm from pyod on the training data.
clf_name = 'IForest'
clf = IForest()
clf.fit(X_train)

# Results on the training data: an array of 0s and 1s, where 1 marks an
# outlier and 0 marks a non-outlier.
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores of the training data

# Predict whether each test sample is an outlier (array of 0s and 1s).
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)  # anomaly scores of the input samples

# Use roc_auc_score from sklearn to get the AUC, i.e. the area under the
# ROC curve. Both values are computed before anything is accumulated, so a
# ValueError from the second call can no longer leave sumAuc_train updated
# while sumAuc_test and the counter i are not.
try:
    auc_train = sklearn.metrics.roc_auc_score(y_train, y_train_scores,
                                              average='macro')
    auc_test = sklearn.metrics.roc_auc_score(y_test, y_test_scores,
                                             average='macro')
except ValueError:
    # Deliberate best effort: a split that cannot be scored is skipped and
    # does not contribute to the running sums.
    pass
else:
    sumAuc_train += auc_train
    sumAuc_test += auc_test
    i += 1
    print(sumAuc_train, sumAuc_test)

# Obtain the ROC value and the precision.
# NOTE(review): the source is cut off here after the fragment "prn"
# (presumably the start of a print call); the remainder is not visible.
class TestIForest(unittest.TestCase):
    """Checks of the IForest detector on a seeded synthetic data set."""

    def setUp(self):
        """Generate the data and fit a seeded detector on the training part."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination,
                                  random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_within_unit_interval(self, proba):
        """Assert that every value of ``proba`` lies in [0, 1]."""
        assert (proba.min() >= 0)
        assert (proba.max() <= 1)

    def test_parameters(self):
        """Every fitted attribute must exist and must not be None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # detection quality
        auc = roc_auc_score(self.y_test, test_scores)
        assert (auc >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        self._assert_within_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        """An unknown conversion method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scoring names are accepted; an unknown one raises."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the order of the scores and stay within bounds."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering must be preserved (small tolerance allowed)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score order and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering must be preserved (small tolerance allowed)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        """The fitted detector can be passed to ``clone`` without error."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
        pass
# Train the detector (contamination is left at its default of 0.1).
clf1 = IForest(random_state=42)
clf1.fit(X_train1)

# Derive the threshold from the contamination parameter: sort the training
# scores from highest to lowest, keep the top `a` of them, and use the lowest
# score among those as the cut-off.
dec_scores = clf1.decision_scores_
dec_scores_sorted = sorted(dec_scores, reverse=True)
a = round(len(X_train1) * clf1.contamination)
print(a)
anomalies = dec_scores_sorted[:a]
threshold = anomalies[-1]

# Score the validation data.
y_valid_scores = pd.Series(clf1.decision_function(X_valid1))
# NOTE(review): allow_pickle=True unpickles whatever the file contains; only
# load this file from a trusted location.
valid_SrcIP = np.load('preprocessing1_valid_srcIP.npy', allow_pickle=True)

# A score strictly above the threshold marks an outlier (1.0); anything else
# is an inlier (0.0). Outliers are also recorded together with their source IP.
valid_outliers = []
y_pred_valid = []
for score in range(len(y_valid_scores)):
    is_outlier = y_valid_scores[score] > threshold
    if is_outlier:
        reg = (valid_SrcIP[score], y_valid_scores[score])
        valid_outliers.append(reg)
    y_pred_valid.append(1.0 if is_outlier else 0.0)
# Build a seeded two-feature synthetic data set.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42,
)

# Fit an IForest detector with default settings on the training split.
clf_name = 'IForest'
clf = IForest()
clf.fit(X_train)

# Labels: binary, 0 for inliers and 1 for outliers.
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# Scores: raw outlier scores.
y_train_scores = clf.decision_scores_
y_test_scores = clf.decision_function(X_test)

# Report the evaluation for both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# Plot ground truth against predictions; the figure is shown, not saved.
visualize(clf_name, X_train, y_train, X_test, y_test,
          y_train_pred, y_test_pred,
          show_figure=True, save_figure=False)
class IF(IForest):
    def __init__(self, n_estimators=100, max_samples='auto',
                 contamination=0.1, random_state=42, verbose=1):
        """Isolation Forest (IF) wrapper.

        The constructor only records its arguments; the underlying detector
        is built when ``fit`` is called.

        Parameters
        ----------
        n_estimators : int, optional (default=100)
            Number of base estimators in the ensemble.

        max_samples : int or float, optional (default="auto")
            Number of samples drawn from X to train each base estimator.

        contamination : float in (0., 0.5), optional (default=0.1)
            Proportion of outliers in the data set. Used when fitting to
            define the threshold on the decision function.

        random_state : int (default is 42)
            Seed handed to the underlying detector.

        verbose : int (default is 1)
            Print level; a higher value prints more information.
        """
        self.random_state = random_state
        self.verbose = verbose
        self.contamination = contamination
        self.max_samples = max_samples
        self.n_estimators = n_estimators

    def fit(self, X_train, y_train=None):
        """Build a fresh ``IForest`` from the stored settings and fit it.

        Parameters
        ----------
        X_train : numpy array of shape (n_samples, n_features)
            The input samples.

        y_train : Ignored
            Not used; present for API consistency by convention.

        Returns
        -------
        self : object
            The fitted estimator; the detector is kept in ``self.model_``.
        """
        forest_settings = dict(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            random_state=self.random_state,
            verbose=self.verbose,
            max_features=1.,
            bootstrap=False,
            n_jobs=-1,
            # According to the original author this option is no longer
            # used as of sklearn 0.24.
            behaviour='deprecated',
        )
        self.model_ = IForest(**forest_settings)
        self.model_.fit(X=X_train)
        return self

    def decision_function(self, X):
        """Return the raw anomaly scores of ``X`` from the fitted detector.

        The call is delegated to ``self.model_``, so ``fit`` must have been
        called first. By convention, outliers receive larger scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The samples to score. Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        anomaly_scores = self.model_.decision_function(X)
        return anomaly_scores

    def predict_proba(self, X):
        """Not supported by this wrapper; always raises NotImplementedError."""
        raise NotImplementedError
def train_model(request):
    """Fit an isolation forest on an uploaded CSV and report outlier counts.

    The POST body must be JSON with a ``'file'`` entry naming a CSV that has
    ``'Birth year'`` and ``'Uid'`` columns. Both columns are cleaned into
    numbers, an ``IForest`` (contamination 0.01, seed 0) is fitted on them,
    and the fitted detector is stored in the module-level ``clf``.

    Parameters
    ----------
    request : HttpRequest
        The incoming Django request.

    Returns
    -------
    JsonResponse
        ``{'OUTLIERS ': n, 'INLIERS ': m}`` on success (the trailing spaces in
        the keys are kept for existing clients); ``{'error': ...}`` with
        status 405 for non-POST requests and status 500 when processing
        fails.
    """
    global clf

    # Previously a non-POST request fell through and returned None, which is
    # not a valid view result.
    if request.method != 'POST':
        return JsonResponse({'error': 'Only POST requests are supported.'},
                            status=405)

    try:
        json_data = json.loads(request.body)
        print(json_data)

        # NOTE(review): the path comes straight from the client and is handed
        # to read_csv unchecked; restrict it to an allowed location.
        csv_path = json_data['file']
        data = pd.read_csv(csv_path)
        data = data.fillna(0)

        # Strip the "/" separators from the non-zero birth-year entries and
        # convert them to int; rows that were 0 become NaN in the assignment
        # and are set back to 0 by the second fillna.
        birth_year = data["Birth year"]
        data["Birth year"] = (
            birth_year[birth_year != 0].str.replace("/", "").astype(int))
        data = data.fillna(0)

        # Remove embedded spaces from the ids and convert them to float.
        data['Uid'] = (
            data['Uid'].astype(str).str.replace(' ', '').astype(float))

        X1 = data['Birth year'].values.reshape(-1, 1)
        X2 = data['Uid'].values.reshape(-1, 1)
        X = np.concatenate((X1, X2), axis=1)

        outliers_fraction = 0.01
        clf = IForest(contamination=outliers_fraction, random_state=0)
        clf.fit(X)

        # Category predicted for each data point; non-zero entries are
        # counted as outliers.
        y_pred = clf.predict(X)
        n_inliers = int(len(y_pred) - np.count_nonzero(y_pred))
        n_outliers = int(np.count_nonzero(y_pred == 1))

        print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers)
        output = {'OUTLIERS ': n_outliers, 'INLIERS ': n_inliers}
        return JsonResponse(output)
    except Exception as exc:
        # Top-level boundary of the view: report the failure as JSON. The
        # old code passed the Exception class itself to JsonResponse, which
        # cannot be serialized.
        return JsonResponse({'error': str(exc)}, status=500)
class TestIForest(unittest.TestCase):
    """Checks of the IForest detector on a seeded synthetic data set."""

    def setUp(self):
        """Generate the data and fit a seeded detector on the training part."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination,
                                  random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _check_unit_interval(self, proba):
        """Assert that every value of ``proba`` lies in [0, 1]."""
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_sklearn_estimator(self):
        """Run the estimator checks from scikit-learn on the detector."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every fitted attribute must exist and must not be None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        )
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and exceed the ROC floor."""
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # detection quality
        auc = roc_auc_score(self.y_test, test_scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        self._check_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        self._check_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        self._check_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        """An unknown conversion method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scoring names are accepted; an unknown one raises."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the order of the scores and stay within bounds."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering must be preserved (small tolerance allowed)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score order and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering must be preserved (small tolerance allowed)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass
class TestIForest(unittest.TestCase):
    """Checks of the IForest detector on a synthetic data set.

    Neither the data generation nor the detector is seeded here, so the data
    and the fitted model may differ between runs.
    """

    def setUp(self):
        """Generate the data and fit a detector on the training part."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = IForest(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """Run the estimator checks from scikit-learn on the detector."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every fitted attribute must exist and must not be None.

        A missing attribute used to be reported through
        ``self.assertRaises(AttributeError, '<message>')``. That call treats
        the message string as the callable to invoke, so it ended in a
        TypeError instead of a clean test failure. The check now fails
        through ``self.fail`` with the same message.
        """
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        )
        for attribute in fitted_attributes:
            if getattr(self.clf, attribute, None) is None:
                self.fail('{} is not set'.format(attribute))

    def test_train_scores(self):
        """There is one training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and exceed the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown conversion method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        """fit_predict_evaluate runs on the test split without raising."""
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        """Nothing to clean up."""
        pass
# Load every split of the requested (sub)dataset.
data_dict = load_dataset(
    dataset,
    subdataset,
    "all",
)

x_train = data_dict["train"]
x_test = data_dict["test"]
x_test_labels = data_dict["test_labels"]

# Time fitting on the training split plus scoring of the test split.
start = time.time()

od = IForest(n_estimators=n_estimators)
od.fit(x_train)

anomaly_score = od.decision_function(x_test)
anomaly_label = x_test_labels

end = time.time()

# The duration used to be stored in a variable named `time`, which rebinds
# the name of the `time` module used just above: inside a function that makes
# `time.time()` fail with UnboundLocalError, and at module level it clobbers
# the module for any later use.
elapsed = end - start

# Make evaluation
evaluate_all(anomaly_score, anomaly_label)
salience = compute_salience(anomaly_score, anomaly_label)

print('time')
print(' ', elapsed)
print('salience')
print(' ', salience)