def model_init(self, model):
    """Initialise a single model."""
    if self.model == 'pca':
        self.models[model] = PCA(contamination=self.contamination)
    elif self.model == 'loda':
        self.models[model] = LODA(contamination=self.contamination)
    elif self.model == 'iforest':
        self.models[model] = IForest(
            n_estimators=50, bootstrap=True, behaviour='new',
            contamination=self.contamination)
    elif self.model == 'cblof':
        self.models[model] = CBLOF(n_clusters=3,
                                   contamination=self.contamination)
    elif self.model == 'feature_bagging':
        self.models[model] = FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination)
    elif self.model == 'copod':
        self.models[model] = COPOD(contamination=self.contamination)
    elif self.model == 'hbos':
        self.models[model] = HBOS(contamination=self.contamination)
    else:
        # fall back to HBOS for unrecognised model names
        self.models[model] = HBOS(contamination=self.contamination)
    self.custom_model_scalers[model] = MinMaxScaler()
def pred_KNN(self, k=5, comp_with="openaq"):
    ## hyperparameters for KNN are tuned here
    # if self.bool_o_dict == True:
    self.comp_with = comp_with
    if comp_with == "openaq":
        if len(self.X_o) == 0:
            pred = []
        elif self.X_o.shape[0] > k:
            self.clf = KNN(n_neighbors=k)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        elif self.X_o.shape[0] > 2:
            # shrink k so it stays below the number of samples
            k = self.X_o.shape[0] - 1
            self.clf = KNN(n_neighbors=k)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        else:
            pred = []
        # A_location, B_location, C_location = self.pred_location(pred)
    elif comp_with == "cams":
        pred = []
        for each_X in self.X_c:
            # if each_X exists it will have a shape of (10, 8)
            self.clf = KNN(n_neighbors=k)
            self.clf.fit(each_X)
            pred.append(self.clf.labels_[-1])
    A_location, B_location, C_location = self.pred_location(pred)
    return A_location, B_location, C_location
def pca(X_train, X_test, Y_train, Y_test):
    import numpy as np
    from pyod.models.pca import PCA

    model = PCA()
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return acc * 100
def pca_outlier_detection(X_train, X_test, **kwargs):
    detector = PCA(**kwargs)
    detector.fit(X_train)
    prob = detector.predict_proba(X_test)[:, -1]
    if isinstance(X_test, pd.DataFrame):
        return pd.Series(prob, name='outlier', index=X_test.index)
    return pd.Series(prob, name='outlier')
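A minimal usage sketch for pca_outlier_detection above. The synthetic data and the contamination value are illustrative assumptions, and PCA and pd are assumed to be imported at module level as in the snippet.

# Hypothetical usage of pca_outlier_detection on synthetic data.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_train = pd.DataFrame(rng.randn(200, 4))
X_test = pd.DataFrame(rng.randn(20, 4))
# kwargs are forwarded straight to pyod's PCA detector
outlier_prob = pca_outlier_detection(X_train, X_test, contamination=0.1)
print(outlier_prob.head())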
def setUp(self):
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination)

    self.clf = PCA(contamination=self.contamination)
def S2(self):
    self.S1()
    water_data = self.water_data
    result = self.result

    # Data preprocessing and model training
    clean_data = water_data[water_data['S1'] == 0]
    Y = pd.DataFrame(index=clean_data.index, columns=['S2'])
    X_train = np.array(clean_data.iloc[:, 1:12])
    name = list(clean_data.iloc[:, 1:12].columns.values)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
    clf2 = KNN(contamination=0.05, n_neighbors=100)
    clf3 = HBOS(contamination=0.05, n_bins=10)
    clf4 = PCA(contamination=0.05)
    clf1.fit(X_train)
    clf2.fit(X_train)
    clf3.fit(X_train)
    clf4.fit(X_train)
    # a point is anomalous (1) only if all four detectors flag it
    Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
    water_data = pd.concat([water_data, Y], axis=1)
    # water_data.loc[water_data['S2'].isna(), ['S2']] = 0
    # marks rows that were anomalous in S1 as 0 in S2
    result['统计异常'] = water_data['S2'].values

    # Find the most anomalous dimension
    from sklearn.neighbors import KernelDensity
    clean_data = water_data[water_data['S1'] == 0]
    dens = pd.DataFrame(index=clean_data.index,
                        columns=['temperature', 'pH', 'EC', 'ORP', 'DO',
                                 'turbidity', 'transparency', 'COD', 'P',
                                 'NH3N', 'flux'])
    for i in dens.columns:
        kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
            clean_data[i].values.reshape(-1, 1))
        dens[i] = np.exp(
            kde.score_samples(clean_data[i].values.reshape(-1, 1)))
    dens = dens.iloc[:, 0:11].rank()
    # the dimension with the lowest density rank is the anomalous one
    dens['S2_names'] = dens.idxmin(axis=1)
    water_data = pd.concat([water_data, dens['S2_names']], axis=1)
    self.water_data = water_data
    result['统计异常维度'] = water_data['S2_names'].values

    # Save the models
    joblib.dump(scaler, "./water_model/S2_scaler")
    joblib.dump(clf1, "./water_model/S2_Iforest")
def setUp(self):
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.5
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = PCA(contamination=self.contamination, random_state=42)
    self.clf.fit(self.X_train)
def models_init(self):
    """Initialise the models."""
    self.model = self.configuration.get('model', 'pca')
    if self.model == 'pca':
        self.models = {
            model: PCA(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'loda':
        self.models = {
            model: LODA(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'iforest':
        self.models = {
            model: IForest(n_estimators=50, bootstrap=True, behaviour='new',
                           contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'cblof':
        self.models = {
            model: CBLOF(n_clusters=3, contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'feature_bagging':
        self.models = {
            model: FeatureBagging(
                base_estimator=PCA(contamination=self.contamination),
                contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'copod':
        self.models = {
            model: COPOD(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'hbos':
        self.models = {
            model: HBOS(contamination=self.contamination)
            for model in self.models_in_scope
        }
    else:
        # default to HBOS for unknown model names
        self.models = {
            model: HBOS(contamination=self.contamination)
            for model in self.models_in_scope
        }
    self.custom_model_scalers = {
        model: MinMaxScaler() for model in self.models_in_scope
    }
def getOutlierPCA(dataset):
    '''
    @brief Function that runs the PCA algorithm on the dataset and obtains
    labels indicating whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return A list of labels: 0 means inlier, 1 means outlier
    '''
    # Initialise the model
    pca = PCA()
    # Fit the data and obtain labels
    pca.fit(dataset)
    # Return labels
    return pca.labels_
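A quick check of getOutlierPCA on synthetic data; the data below is illustrative only, and PCA is assumed to be pyod.models.pca.PCA as in the other snippets.

import numpy as np

rng = np.random.RandomState(0)
# 95 inliers around the origin plus 5 shifted outliers
dataset = np.vstack([rng.randn(95, 3), rng.randn(5, 3) * 4 + 10])
labels = getOutlierPCA(dataset)
print("outliers flagged:", int(labels.sum()))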
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
    }
    return classifiers
def train_model(station: Station) -> LSCP:
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    observations_select = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)

    obs_data = []
    for observation in observations_select:
        obs_data.append([
            observation.rms,
            observation.peak_to_peak,
            observation.kurtosis,
            observation.crest,
        ])

    log.info('Fitting LSCP model')
    lscp = LSCP([KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5, contamination=0.03)
    lscp.fit(X=obs_data)

    log.info(f'Trained model in {time.time() - t1}')
    return lscp
def train():
    dataset = get_data(1000, 10, 100)
    contamination = 0.01

    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination),
        ]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True, bps_flag=True,
                     approx_flag_global=False, contamination=contamination)

        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        true_labels = [0] * 1000 + [1] * 10

        # roc_auc_score expects the ground truth first, then the predictions
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulting ROC AUC score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model",
                                 conda_env="conda.yaml")
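The vote() helper used above is not shown in the snippet. A plausible majority-vote sketch, assuming model.predict returns one 0/1 label per base estimator per sample, i.e. an array of shape (n_samples, n_estimators):

import numpy as np

def vote(predicted_labels, threshold=0.5):
    # flag a sample as an outlier when more than `threshold` of the
    # base estimators flag it
    return (np.mean(predicted_labels, axis=1) > threshold).astype(int)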
def main(args):
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(
        data['X'], data['y'], test_size=args.train_split, random_state=2)
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)

    data_size = len(trainx[0])
    # use integer division so the layer sizes stay integers
    encoder_neurons = [data_size, data_size // 2, data_size // 4]

    clf = KNN()
    clf.fit(trainx)
    print("Results Validation KNN")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation KNN")
    print_metrics(evaly, clf.predict(evalx))

    clf = PCA(n_components=args.components)
    clf.fit(trainx)
    print("Results Validation PCA")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation PCA")
    print_metrics(evaly, clf.predict(evalx))

    clf = VAE(encoder_neurons=encoder_neurons,
              decoder_neurons=encoder_neurons[::-1], epochs=args.epochs,
              contamination=args.contamination, gamma=args.gamma,
              capacity=args.capacity)
    clf.fit(trainx)
    print("Results Validation VAE")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation VAE")
    print_metrics(evaly, clf.predict(evalx))
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector on x_train

    # anomaly labels and scores on the training data
    y_train_pred = clf.labels_  # binary labels (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # raw scores (larger = more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the trained clf to score unseen data
    y_test_pred = clf.predict(x_test)  # binary labels (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # raw anomaly scores
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
    total_roc.append(roc)
    total_prn.append(prn)
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define the outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state, behaviour="new"),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
    }
    return classifiers
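An illustrative way to consume load_classifiers, fitting every detector on the same synthetic data. The generate_data unpacking order follows the test snippets in this section and is an assumption here.

from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)
for name, clf in load_classifiers(0.1).items():
    clf.fit(X_train)
    print(name, int(clf.predict(X_test).sum()), "test points flagged")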
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def outlier_detection(x_raw, y_raw):
    """
    Filter out all outlier points.

    :param x_raw: features in ndarray
    :param y_raw: labels in ndarray
    :return x_clean, y_clean: cleaned features and labels in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidates are listed below
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised '
        'Representation Learning': XGBOD(contamination=outliers_fraction),
    }

    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outlier and 0 means inlier
    # (for sklearn, -1 means outlier and 1 means inlier)
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)

    print("After outlier detection: {}".format(x_clean.shape))
    assert x_clean.shape[0] == y_clean.shape[0]
    return x_clean, y_clean
def OD_detect(df, id_col=None, contamination=0.05, trans_cols=None):
    """Use the pyod lib to flag the top `contamination` fraction (5% by
    default) of outliers in a dataset."""
    df = df.copy()
    OD_clfs = {
        "HBOS": HBOS(contamination=contamination),
        "IForest": IForest(contamination=contamination),
        "CBLOF": CBLOF(contamination=contamination, n_clusters=5),
        # "OCSVM": OCSVM(contamination=contamination),
        "PCA": PCA(contamination=contamination),
    }
    results_list = []
    od_cols = ["id", "name", "result", "label"]
    if id_col is None:
        s_id = df.index
        X_cols = df.columns
    else:
        s_id = df[id_col]
        X_cols = df.columns.drop(id_col)

    if trans_cols is not None:
        for col in trans_cols:
            df[col] = PowerTransformer().fit_transform(
                df[col].values.reshape(-1, 1))

    for clf_name, clf in OD_clfs.items():
        od_result = pd.DataFrame(columns=od_cols)  # create an empty dataframe
        od_result["id"] = s_id
        od_result['name'] = clf_name
        print(f"{clf_name}, {clf}")
        clf.fit(df[X_cols])
        od_result['result'] = clf.decision_scores_
        od_result['label'] = clf.labels_
        results_list.append(od_result)

    od_results_df = pd.concat(results_list, axis=0, ignore_index=True)
    job_name = f'{pd.Timestamp.now():%H%M}'
    od_results_df['job_name'] = job_name
    od_results_df.to_sql('t_ml', engine, if_exists='append', schema='wh_v1',
                         method=psql_insert_copy)
    print(f"OD results {od_results_df.shape} exported to database {engine}, "
          f"job_name={job_name}")
    return od_results_df
def pred_PCA(self, n_comp=3, comp_with='openaq'):
    ## hyperparameters for PCA are tuned here
    # The number of samples must be greater than n_components (3 here);
    # n_components can also be set to a float such as 0.3 to make it work.
    self.comp_with = comp_with
    if comp_with == "openaq":
        if len(self.X_o) == 0:
            pred = []
        elif self.X_o.shape[0] > n_comp:
            self.clf = PCA(n_components=n_comp)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        elif self.X_o.shape[0] > 2:
            # shrink n_comp so it stays below the number of samples
            n_comp = self.X_o.shape[0] - 1
            self.clf = PCA(n_components=n_comp)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        else:
            pred = []
    elif comp_with == "cams":
        pred = []
        for each_X in self.X_c:
            self.clf = PCA(n_components=n_comp)
            self.clf.fit(each_X)
            pred.append(self.clf.labels_[-1])
    A_location, B_location, C_location = self.pred_location(pred)
    return A_location, B_location, C_location
def pca(self, X_train, n_components=None, contamination=None):
    """
    Train a PCA model from PyOD.

    Parameters
    ----------
    X_train: scaled training data
    contamination: percentage of anomalies in the data
    n_components: number of components to transform

    Returns
    -------
    Anomaly scores and binary labels
    """
    model = PCAOD(n_components=n_components, contamination=contamination)
    model.fit(X_train)

    # predict binary labels and raw anomaly scores
    labels = model.predict(X_train)  # outlier labels (0 or 1)
    pca_anomaly_scores = model.decision_function(X_train)  # outlier scores
    pca_anomaly_scores = self.min_max_scaler(pca_anomaly_scores)
    return pca_anomaly_scores, labels
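A standalone sketch of the same scoring flow, with an explicit sklearn MinMaxScaler standing in for the class's min_max_scaler helper (which is not shown above); the data is synthetic.

import numpy as np
from pyod.models.pca import PCA as PCAOD
from sklearn.preprocessing import MinMaxScaler

X_train = np.random.RandomState(1).randn(300, 5)
model = PCAOD(n_components=3, contamination=0.05)
model.fit(X_train)
labels = model.predict(X_train)  # 0/1 labels
scores = model.decision_function(X_train).reshape(-1, 1)
scaled_scores = MinMaxScaler().fit_transform(scores).ravel()  # scores in [0, 1]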
def pred_COPOD(self, comp_with="openaq"):
    self.comp_with = comp_with
    if comp_with == "openaq":
        if len(self.X_o) == 0:
            pred = []
        else:
            self.clf = COPOD()
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
    elif comp_with == "cams":
        pred = []
        for each_X in self.X_c:
            self.clf = COPOD()
            self.clf.fit(each_X)
            pred.append(self.clf.labels_[-1])
    A_location, B_location, C_location = self.pred_location(pred)
    return A_location, B_location, C_location
def fit_transform(self, df_train, df_corrupted):
    # n_components defaults to min(n_samples, n_features);
    # n_selected_components defaults to None
    pyod_model = PCA(contamination=0.25)

    df_outliers_num = self.num_out_detect(df_train, df_corrupted, pyod_model)
    df_outliers_cat = self.cat_out_detect(df_train, df_corrupted)

    df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')

    # blank out every cell that was flagged as an outlier
    for col in df_corrupted.columns:
        for i in df_outliers.index:
            if df_outliers.loc[i, col + "_outlier"] == 1:
                df_outliers.loc[i, col] = np.nan

    return df_outliers, self.predictors
def setUp(self):
    self.n_train = 1000
    self.n_test = 500
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.random_state = 42
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=self.random_state)

    self.base_estimators = [
        LOF(n_neighbors=5, contamination=self.contamination),
        LOF(n_neighbors=15, contamination=self.contamination),
        LOF(n_neighbors=25, contamination=self.contamination),
        LOF(n_neighbors=35, contamination=self.contamination),
        LOF(n_neighbors=45, contamination=self.contamination),
        HBOS(contamination=self.contamination),
        PCA(contamination=self.contamination),
        LSCP(detector_list=[
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination)],
            random_state=self.random_state),
    ]

    this_directory = os.path.abspath(os.path.dirname(__file__))
    self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                               'bps_train.joblib')
    self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                'bps_prediction.joblib')

    self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                      rp_flag_global=True, bps_flag=True,
                      contamination=self.contamination,
                      approx_flag_global=True,
                      cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                      cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                      verbose=True)
def fit_transform(self, dataset, y=None):
    data = dataset.copy()
    if 'iso' in self.methods:
        self.iso_forest = IForest(contamination=self.contamination,
                                  random_state=self.random_state,
                                  behaviour='new')
        self.iso_forest.fit(data.drop(self.target, axis=1))
        iso_predict = self.iso_forest.predict(data.drop(self.target, axis=1))
        data['iso'] = iso_predict

    if 'knn' in self.methods:
        self.knn_out = KNN(contamination=self.contamination)
        self.knn_out.fit(data.drop(self.target, axis=1))
        knn_predict = self.knn_out.predict(data.drop(self.target, axis=1))
        data['knn'] = knn_predict

    if 'pca' in self.methods:
        self.out_pca = PCA_RO(contamination=self.contamination,
                              random_state=self.random_state)
        self.out_pca.fit(data.drop(self.target, axis=1))
        pca_predict = self.out_pca.predict(data.drop(self.target, axis=1))
        data['pca'] = pca_predict

    # use for features that are gaussian distributed; note that sklearn's
    # EllipticEnvelope.predict returns -1/1 rather than pyod's 0/1
    if 'mcd' in self.methods:
        self.mcd = EllipticEnvelope(contamination=0.01)
        self.mcd.fit(data.drop(self.target, axis=1))
        mcd_predict = self.mcd.predict(data.drop(self.target, axis=1))
        data['mcd'] = mcd_predict

    # keep only the rows that every selected method flags as outliers
    data['vote_outlier'] = 0
    for i in self.methods:
        data['vote_outlier'] = data['vote_outlier'] + data[i]
    self.outliers = data[data['vote_outlier'] == len(self.methods)]

    return dataset[[i not in self.outliers.index for i in dataset.index]]
def __init__(self, window_size, step_size=1, contamination=0.1,
             n_components=None, n_selected_components=None, copy=True,
             whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto',
             random_state=None, weighted=True, standardization=True):
    super(PCA, self).__init__(contamination=contamination)
    self.window_size = window_size
    self.step_size = step_size

    # parameters for PCA
    self.n_components = n_components
    self.n_selected_components = n_selected_components
    self.copy = copy
    self.whiten = whiten
    self.svd_solver = svd_solver
    self.tol = tol
    self.iterated_power = iterated_power
    self.random_state = random_state
    self.weighted = weighted
    self.standardization = standardization

    # initialize a PCA model
    self.model_ = PCA_PYOD(
        n_components=self.n_components,
        n_selected_components=self.n_selected_components,
        contamination=self.contamination,
        copy=self.copy,
        whiten=self.whiten,
        svd_solver=self.svd_solver,
        tol=self.tol,
        iterated_power=self.iterated_power,
        random_state=self.random_state,
        weighted=self.weighted,
        standardization=self.standardization)
def choose_model(model, nnet):
    """Choose among the detectors implemented in PyOD."""
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD(),
    }
    return clfs[model]
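An example call for choose_model; the nnet layout below is an assumption, chosen so the VAE's encoder (nnet[:5]) and decoder (nnet[4:]) slices are symmetric around the bottleneck.

nnet = [64, 32, 16, 8, 4, 8, 16, 32, 64]
clf = choose_model('PCA', nnet)
# clf is an unfitted PyOD detector; fit it as usual, e.g. clf.fit(X_train)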
def train(doc_list, dataset_name, clf_name):
    model_roc = []
    model_prc = []
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        if (i + 1) % 200 == 0:  # note: never triggers with only 10 files
            print("Results for file {}:".format(i + 1))
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("Model {} on dataset {}: mean roc_auc = {}, mean prc = {}.".format(
        clf_name, dataset_name, round(model_roc_avg, 4),
        round(model_prc_avg, 4)))
    return model_roc_avg, model_prc_avg
def print_accuracy(train_arr, test_arr, trader_id):
    if len(train_arr) == 0 or len(test_arr) == 0:
        return
    for i in range(len(train_arr)):
        l1 = len(train_arr[i])
        l2 = len(test_arr[i])
        if l1 == 0 or l2 == 0:
            continue
        train_data = np.array([train_arr[i]]).T
        test_data = np.array([test_arr[i]]).T
        # clf = OCSVM(kernel='rbf', gamma=0.5)
        # fit and score each single-feature column; the default n_components
        # is used since n_components must not exceed the number of features
        clf = PCA()
        clf.fit(train_data)
        y_pred = clf.predict(train_data)
        print("TRAINING ACCURACY for TRADER", trader_id, ":",
              100 - (sum(y_pred) * 100 / l1))
        y_pred = clf.predict(test_data)
        print("TESTING ACCURACY: ", sum(y_pred) * 100 / l2)
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)':
    #     SOS(contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
        LSCP(detector_list, contamination=outliers_fraction,
             random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    '(KNN) K Nearest Neighbors': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    '(LOF) Local Outlier Factor':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    '(MCD) Minimum Covariance Determinant':
        MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    '(PCA) Principal Component Analysis':
        PCA(contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)':
    #     SOS(contamination=outliers_fraction),
    '(LSCP) Locally Selective Combination':
        LSCP(detector_list, contamination=outliers_fraction,
             random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

st.subheader('SELECT AN ALGORITHM:')
classifier_name = st.selectbox('THE ALGORITHM', [*classifiers])
# Show all detectors
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train PCA detector
    clf_name = 'PCA'
    clf = PCA()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
class TestPCA(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'selected_components_') and
                    self.clf.selected_components_ is not None)
        assert_true(hasattr(self.clf, 'selected_w_components_') and
                    self.clf.selected_w_components_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass