def verify_maldist_equivalence(dataset):
    """Check that the Mahalanobis distance and its variant rank samples alike.

    Both ``mahal_dist`` and ``mahal_dist_variant`` are evaluated on
    ``dataset``; the sample indices are then ordered by decreasing distance
    for each of them and the two orderings are compared.

    Args:
        dataset: Data passed unchanged to ``mahal_dist`` and
            ``mahal_dist_variant``.  Both are presumably expected to return
            one distance per sample as a NumPy array -- confirm against
            their definitions.

    Returns:
        bool: True when both distances produce exactly the same descending
        index ordering, False otherwise.

    Raises:
        AssertionError: If the two distance arrays are numerically close to
            each other (the variant is expected to yield different values).
    """
    # Original definition of the Mahalanobis distance.
    dist_original = mahal_dist(dataset)
    # Sample indices ordered by decreasing distance.
    indices_desc_original = np.argsort(-dist_original)

    # Variant of the Mahalanobis distance.
    dist_variant = mahal_dist_variant(dataset)
    # Sample indices ordered by decreasing distance.
    indices_desc_variant = np.argsort(-dist_variant)

    # Message: "the Mahalanobis distance and its variant generally do not
    # return equal values".
    assert not np.allclose(dist_original, dist_variant), '马氏距离及其变体返回的数值一般不相等'

    # Indices are integers, so compare them exactly.  A tolerance-based
    # comparison (np.allclose) would accept large indices that differ by a
    # few positions because of its relative tolerance.
    indices_verify_result = np.array_equal(indices_desc_original, indices_desc_variant)
    return indices_verify_result
def predict_anomaly_indices(X, contamination):
    """Collect the anomaly indices predicted for ``X`` by six detectors.

    The number of samples flagged by the Isolation Forest is used as the
    cut-off ``anomaly_num`` for the Isolation Forest, LOF, RobustPCC and
    Mahalanobis-distance rankings.  The two PCA reconstruction-error
    detectors receive ``contamination`` directly and report their own
    indices.

    Args:
        X: Samples handed to every detector.
        contamination: Contamination setting forwarded to the Isolation
            Forest, LOF and both PCA reconstruction-error detectors.

    Returns:
        ``np.array`` built from the index collections in this order:
        Isolation Forest, LOF, RobustPCC, Mahalanobis distance,
        KernelPCA reconstruction, LinearPCA reconstruction.
    """
    # Isolation Forest.
    forest = IsolationForest(
        n_estimators=125,
        contamination=contamination,
        behaviour='new',
        random_state=2018,
        n_jobs=-1,
    )
    forest_labels = forest.fit_predict(X)
    flagged_positions = np.where(forest_labels == -1)[0]
    anomaly_num = len(flagged_positions)
    # The further a score falls below 0, the more likely the sample is an
    # anomaly, so the smallest scores come first.
    forest_scores = forest.decision_function(X)
    if_idx = np.argsort(forest_scores)[:anomaly_num]

    # Local Outlier Factor: ascending order of negative_outlier_factor_ is
    # the same as descending order of the (positive) outlier factor.
    lof_model = LocalOutlierFactor(
        contamination=contamination,
        p=2,
        novelty=False,
        n_jobs=-1,
    )
    lof_model.fit(X)
    lof_idx = np.argsort(lof_model.negative_outlier_factor_)[:anomaly_num]

    # RobustPCC.
    robust_pcc = rp.RobustPCC(X, X, gamma=0.01, quantile=99)
    rpcc_idx = robust_pcc.test_anomaly_idx()[:anomaly_num]

    # Mahalanobis distance: largest distances first.
    mahal_distances = md.mahal_dist(X)
    md_idx = np.argsort(-mahal_distances)[:anomaly_num]

    # Reconstruction error with linear PCA.
    linear_recon = rep.PCA_Recon_Error(X, contamination=contamination)
    pre_idx = linear_recon.anomaly_idx()

    # Reconstruction error with kernel PCA.
    kernel_recon = rek.KPCA_Recon_Error(X, contamination=contamination)
    kre_idx = kernel_recon.anomaly_idx()

    # Predicted anomaly indices, one entry per detector.
    return np.array([if_idx, lof_idx, rpcc_idx, md_idx, kre_idx, pre_idx])
def predict_anomaly_indices(X, contamination):
    """Return 0/1 anomaly flags for ``X`` from six detectors.

    Each detector yields one flag per sample, where 1 marks a predicted
    anomaly.  For the Isolation Forest, LOF and Mahalanobis-distance
    detectors the flags are built here; the RobustPCC and PCA
    reconstruction-error detectors return their own ``predict()`` output
    (presumably in the same 0/1 convention -- confirm against those
    classes).

    Progress and the elapsed time of the KernelPCA detector are printed to
    standard output.

    Args:
        X: Samples handed to every detector.
        contamination: Contamination setting forwarded to the detectors.
            It is also multiplied by ``len(X)`` to size the Mahalanobis
            cut-off, so it must be numeric here.

    Returns:
        ``np.array`` built from the per-detector results in this order:
        Isolation Forest, LOF, Mahalanobis distance, LinearPCA
        reconstruction, KernelPCA reconstruction, RobustPCC.
    """
    # Isolation Forest.
    iforest = IsolationForest(
        n_estimators=125,
        contamination=contamination,
        behaviour='new',
        random_state=2018,
        n_jobs=-1,
    )
    # fit_predict returns -1 for outliers and 1 for inliers; map to 1/0.
    iforest_pred = iforest.fit_predict(X)
    iforest_result = np.where(iforest_pred == -1, 1, 0)

    # Local Outlier Factor.
    lof = LocalOutlierFactor(
        contamination=contamination,
        p=2,
        novelty=False,
        n_jobs=-1,
    )
    # fit_predict returns -1 for outliers and 1 for inliers; map to 1/0.
    lof_pred = lof.fit_predict(X)
    lof_result = np.where(lof_pred == -1, 1, 0)

    # Mahalanobis distance: flag the ceil(contamination * n) samples with
    # the largest distance.
    dist = md.mahal_dist(X)
    anomaly_num = int(np.ceil(contamination * len(X)))
    md_idx = np.argsort(-dist)[:anomaly_num]
    # One vectorized membership test instead of scanning md_idx once per
    # sample in a Python loop.
    mahal_result = np.where(np.isin(np.arange(len(X)), md_idx), 1, 0)

    # RobustPCC.
    rpcc = rp.RobustPCC(X, X, gamma=0.01, quantile=99, contamination=contamination)
    rpcc_result = rpcc.predict()

    # Reconstruction error with linear PCA.
    pre = rep.PCA_Recon_Error(X, contamination=contamination)
    pre_result = pre.predict()

    # Reconstruction error with kernel PCA (linear kernel), timed.
    kre = rek.KPCA_Recon_Error(X, contamination=contamination, kernel='linear')
    print('KernelPCA starts.')
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments.
    start = time.perf_counter()
    kre_result = kre.predict()
    end = time.perf_counter()
    print("KernelPCA cost time: {:.2f}s".format(end - start))

    anomaly_pred = [
        iforest_result,
        lof_result,
        mahal_result,
        pre_result,
        kre_result,
        rpcc_result,
    ]
    return np.array(anomaly_pred)