def setUp(self):
    """Prepare clean, NaN-containing and inf-containing fixtures for MAD."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.8

    # Keyword arguments shared by the three generate_data calls.
    common = dict(n_train=self.n_train, n_test=self.n_test, n_features=1,
                  contamination=self.contamination, random_state=42)

    # Data and model without missing or infinite values.
    self.X_train, self.X_test, self.y_train, self.y_test = \
        generate_data(**common)
    self.clf = MAD()
    self.clf.fit(self.X_train)

    # Data and model with a single missing (NaN) value.
    (self.X_train_nan, self.X_test_nan,
     self.y_train_nan, self.y_test_nan) = generate_data(n_nan=1, **common)
    self.clf_nan = MAD()
    self.clf_nan.fit(self.X_train_nan)

    # Data and model with a single infinite value.
    (self.X_train_inf, self.X_test_inf,
     self.y_train_inf, self.y_test_inf) = generate_data(n_inf=1, **common)
    self.clf_inf = MAD()
    self.clf_inf.fit(self.X_train_inf)
def setUp(self):
    """Load the pima.mat benchmark (or synthesize data) and fit XGBOD.

    The original code had two byte-identical handlers for TypeError and
    IOError; they are combined into a single tuple handler.
    """
    # Define data file and read X and y.
    # Generate some data if the source data is missing.
    this_directory = path.abspath(path.dirname(__file__))
    mat_file = 'pima.mat'
    try:
        mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
    except (TypeError, IOError):
        # Fall back to synthetic data when the .mat file is unavailable.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X, y = check_X_y(X, y)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    # XGBOD is a supervised detector, so it is fitted with labels.
    self.clf = XGBOD(random_state=42)
    self.clf.fit(self.X_train, self.y_train)

    self.roc_floor = 0.8
def setUp(self):
    """Load the pima.mat benchmark (or synthesize data) and fit XGBOD.

    The original code had two byte-identical handlers for TypeError and
    IOError; they are combined into a single tuple handler.
    """
    # Define data file and read X and y.
    # Generate some data if the source data is missing.
    this_directory = path.abspath(path.dirname(__file__))
    mat_file = 'pima.mat'
    try:
        mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
    except (TypeError, IOError):
        # Fall back to synthetic data when the .mat file is unavailable.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X, y = check_X_y(X, y)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    # XGBOD is a supervised detector, so it is fitted with labels.
    self.clf = XGBOD(random_state=42)
    self.clf.fit(self.X_train, self.y_train)

    self.roc_floor = 0.75
def test_default_njobs(self):
    """Load cardio.mat (or synthesize data) and fit SUOD with n_jobs=2.

    The original code had two byte-identical handlers for TypeError and
    IOError; they are combined into a single tuple handler.
    """
    # Define data file and read X and y.
    # Generate some data if the source data is missing.
    this_directory = path.abspath(path.dirname(__file__))
    mat_file = 'cardio.mat'
    try:
        mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
    except (TypeError, IOError):
        # Fall back to synthetic data when the .mat file is unavailable.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X, y = check_X_y(X, y)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
    # SUOD built with default base estimators but explicit n_jobs.
    self.clf = SUOD(n_jobs=2)
    self.clf.fit(self.X_train)
    self.roc_floor = 0.7
def setUp(self):
    """Build a small 2-feature dataset and fit an AnoGAN detector."""
    self.n_train = 100
    self.n_test = 50
    self.n_features = 2
    self.contamination = 0.1
    self.roc_floor = 0.8

    # Generate sample data.
    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=self.n_features,
        contamination=self.contamination,
        random_state=42)

    # Small generator/discriminator so the test stays fast.
    self.clf = AnoGAN(G_layers=[10, 20],
                      D_layers=[20, 2],
                      epochs_query=10,
                      preprocessing=True,
                      index_D_layer_for_recon_error=1,
                      epochs=500,
                      contamination=self.contamination,
                      verbose=0)
    self.clf.fit(self.X_train)
def test_check_consistent_shape(self):
    """check_consistent_shape passes matching arrays through unchanged
    and raises ValueError on shape mismatches."""
    X_tr, X_te, y_tr, y_te = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination)

    checked = check_consistent_shape(X_tr, y_tr, X_te, y_te, y_tr, y_te)
    expected = (X_tr, y_tr, X_te, y_te, y_tr, y_te)
    for got, want in zip(checked, expected):
        assert_allclose(got, want)

    # Passing a label array where a feature matrix is expected must raise.
    with assert_raises(ValueError):
        check_consistent_shape(X_tr, y_tr, y_tr, y_te, y_tr, y_te)

    # Feature-count mismatch between X_train and X_test must raise.
    X_te = np.hstack((X_te, np.zeros((X_te.shape[0], 1))))  # extra column
    with assert_raises(ValueError):
        check_consistent_shape(X_tr, y_tr, X_te, y_te,
                               checked[4], checked[5])
def setUp(self):
    """Fit two DeepSVDD variants (plain and AE-based) on one dataset."""
    self.n_train = 6000
    self.n_test = 1000
    self.n_features = 300
    self.contamination = 0.1
    self.roc_floor = 0.5

    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)

    # Plain DeepSVDD with a fixed seed.
    self.clf = DeepSVDD(epochs=10, hidden_neurons=[64, 32],
                        contamination=self.contamination,
                        random_state=2021)
    # Autoencoder-based variant without input preprocessing.
    self.clf_ae = DeepSVDD(epochs=5, use_ae=True,
                           output_activation='relu',
                           hidden_neurons=[16, 8, 4],
                           contamination=self.contamination,
                           preprocessing=False)
    # Construct both before fitting either (matches original ordering).
    self.clf.fit(self.X_train)
    self.clf_ae.fit(self.X_train)
def test_data_generate2(self):
    """Requesting 3 features yields (n, 3)-shaped train and test splits."""
    X_train, y_train, X_test, y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test, n_features=3,
        contamination=self.contamination)

    assert_allclose(X_train.shape, (self.n_train, 3))
    assert_allclose(X_test.shape, (self.n_test, 3))
def setUp(self):
    """Generate a shared fixture dataset; no detector is fitted here."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination)
def test_data_generate3(self):
    """A fixed random_state must reproduce the exact same dataset."""
    kwargs = dict(n_train=self.n_train, n_test=self.n_test, n_features=2,
                  contamination=self.contamination, random_state=42)
    first = generate_data(**kwargs)
    second = generate_data(**kwargs)

    # Every returned array (X/y, train/test) must match exactly.
    for a, b in zip(first, second):
        assert_allclose(a, b)
def setUp(self):
    """Prepare data and a median-aggregation KNN detector (not fitted)."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.75
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42)
    self.clf = KNN(contamination=self.contamination, method='median')
def setUp(self):
    """Generate a 10-feature dataset and fit a COPOD detector."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=10,
        contamination=self.contamination,
        random_state=42)
    self.clf = COPOD(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate a small, heavily-contaminated dataset and fit ABOD."""
    self.n_train = 50
    self.n_test = 50
    self.contamination = 0.2
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42)
    # 'default' evaluates angles against all other points (not fast mode).
    self.clf = ABOD(contamination=self.contamination, method='default')
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate data and fit a COF detector using the 'memory' method."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42)
    self.clf = COF(contamination=self.contamination, method="memory")
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate data (no fixed seed) and fit a FeatureBagging detector."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination)
    self.clf = FeatureBagging(contamination=self.contamination)
    self.clf.fit(self.X_train)
def test_get_outliers_inliers(self):
    """generate_data places inliers first, then outliers; the splitter
    must recover each group."""
    X_train, y_train = generate_data(
        n_train=self.n_train, train_only=True,
        contamination=self.contamination)

    X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)

    # Index where the inlier block ends and the outlier block begins.
    split = int(self.n_train * (1 - self.contamination))
    assert_allclose(X_inliers, X_train[:split, :])
    assert_allclose(X_outliers, X_train[split:, :])
def setUp(self):
    """Generate data and fit an LOF detector."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42)
    self.clf = LOF(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate data and fit an HBOS detector."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42)
    self.clf = HBOS(contamination=self.contamination)
    self.clf.fit(self.X_train)
def data(type, contamination):
    """Return (X_train, y_train, X_test, y_test) fixtures for a model name.

    Parameters
    ----------
    type : str
        One of 'MAD', 'ABOD' or 'AutoEncoder'. NOTE: this parameter shadows
        the ``type`` builtin; the name is kept for backward compatibility.
    contamination : float
        Fraction of outliers. Ignored by the 'ABOD' branch, which has
        historically pinned it to 0.1 (preserved as-is).

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported names. (Previously an
        unsupported name fell through to an UnboundLocalError at return.)
    """
    if type == 'MAD':
        n_train = 200  # number of training points
        n_test = 100  # number of testing points
        # MAD is univariate, so a single feature is generated.
        X_train, y_train, X_test, y_test = generate_data(
            n_train=n_train, n_test=n_test, n_features=1,
            contamination=contamination, random_state=42)
    elif type == 'ABOD':
        contamination = 0.1  # percentage of outliers (overrides argument)
        n_train = 200  # number of training points
        n_test = 100  # number of testing points
        X_train, y_train, X_test, y_test = generate_data(
            n_train=n_train, n_test=n_test, n_features=2,
            contamination=contamination, random_state=42)
    elif type == 'AutoEncoder':
        n_train = 20000  # number of training points
        n_test = 2000  # number of testing points
        n_features = 300  # number of features
        X_train, y_train, X_test, y_test = generate_data(
            n_train=n_train, n_test=n_test, n_features=n_features,
            contamination=contamination, random_state=42)
    else:
        raise ValueError('unknown data type: {!r}'.format(type))
    return X_train, y_train, X_test, y_test
def setUp(self):
    """Generate a 3-feature sample set; no detector is fitted here."""
    self.contamination = 0.05  # percentage of outliers
    self.n_train = 1000  # number of training points
    self.n_test = 100  # number of testing points

    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test, n_features=3,
        contamination=self.contamination, random_state=42)
def setUp(self):
    """Standardize generated data and fit an LSCP ensemble of two LOFs."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    # LSCP expects standardized input.
    self.X_train, self.X_test = standardizer(self.X_train, self.X_test)

    self.detector_list = [LOF(), LOF()]
    self.clf = LSCP(self.detector_list, contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Create a large high-dimensional dataset for batch-wise tests."""
    self.n_train = 3000
    self.n_test = 1000
    self.n_features = 200
    self.contamination = 0.1
    self.batch_size = 1000  # consumed by individual tests

    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)
def setUp(self):
    """Generate a large 300-feature dataset and fit a VAE detector."""
    self.n_train = 6000
    self.n_test = 1000
    self.n_features = 300
    self.contamination = 0.1
    self.roc_floor = 0.8

    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)

    # Few epochs keep the test fast.
    self.clf = VAE(epochs=5, contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate data and fit a LODA detector with automatic binning."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.75
    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)
    self.clf = LODA(contamination=self.contamination, n_bins='auto')
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate data and fit a seeded Sampling detector."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42,
    )
    self.clf = Sampling(contamination=self.contamination, random_state=42)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate a 2-feature dataset and fit SO_GAAL."""
    self.n_train = 1000
    self.n_test = 200
    self.n_features = 2
    self.contamination = 0.1
    # GAN may yield unstable results; turning performance check off
    # self.roc_floor = 0.8

    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)

    self.clf = SO_GAAL(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate a larger 10-feature dataset and fit SO_GAAL."""
    self.n_train = 3000
    self.n_test = 1000
    self.n_features = 10
    self.contamination = 0.1
    # TODO: GAN may yield unstable results; turning performance check off
    # self.roc_floor = 0.8

    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)

    self.clf = SO_GAAL(contamination=self.contamination)
    self.clf.fit(self.X_train)
def _create_data(contamination, n_features, n_test, n_train):
    """Generate and standardize train/test splits as pandas DataFrames.

    Returns (X_train, y_train, X_test, y_test) with the feature matrices
    standardized and wrapped in DataFrames.

    NOTE(review): each split is scaled with its own StandardScaler rather
    than reusing the scaler fitted on the training split; this is kept
    as-is to preserve existing behavior.
    """
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, n_features=n_features,
        contamination=contamination, random_state=1234, behaviour="old")

    # fit_transform accepts ndarrays directly; the intermediate DataFrame
    # wraps in the original were discarded immediately and are removed.
    X_train = pd.DataFrame(StandardScaler().fit_transform(X_train))
    X_test = pd.DataFrame(StandardScaler().fit_transform(X_test))
    return X_train, y_train, X_test, y_test
def setUp(self):
    """Generate data and fit a KNN detector using Mahalanobis distance."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    # Mahalanobis distance requires the training covariance matrix.
    train_cov = np.cov(self.X_train, rowvar=False)
    self.clf = KNN(algorithm='auto', metric='mahalanobis',
                   metric_params={'V': train_cov})
    self.clf.fit(self.X_train)
def setUp(self):
    """Fit ECOD twice: parallel (n_jobs=2) and single-threaded reference."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test, n_features=10,
        contamination=self.contamination, random_state=42)

    # Parallel detector under test.
    self.clf = ECOD(contamination=self.contamination, n_jobs=2)
    self.clf.fit(self.X_train)

    # Single-threaded copy for equivalence comparisons.
    self.clf_ = ECOD(contamination=self.contamination)
    self.clf_.fit(self.X_train)
def setUp(self):
    """Generate a 4-feature dataset and fit a ROD detector."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.8

    # Placeholders populated by individual test methods.
    self.gm = None
    self.median = None
    self.data_scaler = None
    self.angles_scalers1 = None
    self.angles_scalers2 = None

    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test, n_features=4,
        contamination=self.contamination, random_state=42)
    self.clf = ROD()
    self.clf.fit(self.X_train)
def setUp(self):
    """Fit an aggregator that takes the element-wise maximum of three
    heterogeneous detectors."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = SimpleDetectorAggregator(
        base_estimators=[KNN(), LOF(), OCSVM()],
        method='maximization',
        contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Build a heterogeneous detector pool and a SUOD model over it."""
    self.n_train = 1000
    self.n_test = 500
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.random_state = 42

    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination,
        random_state=self.random_state)

    # Five LOFs with increasing neighborhood sizes, plus HBOS, PCA and
    # an LSCP ensemble of two LOFs.
    self.base_estimators = [
        LOF(n_neighbors=k, contamination=self.contamination)
        for k in (5, 15, 25, 35, 45)
    ]
    self.base_estimators.append(HBOS(contamination=self.contamination))
    self.base_estimators.append(PCA(contamination=self.contamination))
    self.base_estimators.append(LSCP(
        detector_list=[
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination)],
        random_state=self.random_state))

    # Pre-trained cost forecast models shipped next to this test file.
    this_directory = os.path.abspath(os.path.dirname(__file__))
    self.cost_forecast_loc_fit_ = os.path.join(
        this_directory, 'bps_train.joblib')
    self.cost_forecast_loc_pred_ = os.path.join(
        this_directory, 'bps_prediction.joblib')

    self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                      rp_flag_global=True, bps_flag=True,
                      contamination=self.contamination,
                      approx_flag_global=True,
                      cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                      cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                      verbose=True)
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

if __name__ == "__main__":
    # Dataset dimensions.
    contamination = 0.1  # percentage of outliers
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, n_features=n_features,
        contamination=contamination, random_state=42)

    # Train the AutoEncoder detector.
    clf_name = 'AutoEncoder'
    clf = AutoEncoder(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # Training-set results.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Test-set results.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

if __name__ == "__main__":
    # Define data file and read X and y.
    # Generate some data if the source data is missing.
    # The original had two byte-identical handlers for TypeError and
    # IOError; they are combined into a single tuple handler.
    mat_file = 'cardio.mat'
    try:
        mat = loadmat(os.path.join('data', mat_file))
    except (TypeError, IOError):
        # Fall back to synthetic data when the .mat file is unavailable.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.4)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    n_clf = 20  # number of base detectors