def __init__(self, anomalies, unlabel, classifer, return_proba=True, n_clusters='auto',
             cluster_algo='kmeans', contamination=0.02, theta=0.85, alpha='auto',
             beta='auto', random_state=2018):
    """Standardise the input data, store the hyper-parameters and cluster the anomalies.

    :param anomalies: Known positive (anomalous) samples, i.e. the P set.
    :param unlabel: Unlabeled samples, i.e. the U set.
    :param classifer: The classifier selected for the final prediction step.
    :param return_proba: bool, default=True
        Whether to return the posterior probability of each sample being positive.
    :param n_clusters: int or 'auto', default='auto'
        Number of clusters. It can be given up front, or left for
        ``get_cluster_centers`` to determine automatically.
    :param cluster_algo: str, {'spectral', 'birch', 'dbscan', 'kmeans'}, default='kmeans'
        Clustering algorithm applied to the anomalies.
    :param contamination: float, default=0.02
        Estimated proportion of anomalous (positive) samples inside the U set.
    :param theta: float, default=0.85
        isolation_score and similarity_score are weighted by theta and
        1 - theta respectively.
    :param alpha: float or 'auto', default='auto'
        Threshold above which an unlabeled sample counts as a potential anomaly.
        The reference paper defaults to the mean total_score of the known
        anomalies (P set).
    :param beta: float or 'auto', default='auto'
        Threshold for deciding that an unlabeled sample is a reliable normal.
    :param random_state: Seed value, stored unchanged on the instance.
    """
    # Function-scope import so this block does not rely on a module-level alias.
    import numpy as np

    # Fit ONE scaler on the stacked P and U sets. Fitting separately on each set
    # (as was done before) standardises them with different means/variances, so
    # the cluster centres derived from the anomalies would not live in the same
    # feature space as the unlabeled samples they are later compared against.
    n_anomalies = len(anomalies)
    scaled = StandardScaler().fit_transform(np.r_[anomalies, unlabel])
    self.anomalies = scaled[:n_anomalies, :]
    self.unlabel = scaled[n_anomalies:, :]

    self.n_clusters = n_clusters
    self.classifer = classifer
    self.return_proba = return_proba
    self.cluster_algo = cluster_algo
    self.contamination = contamination
    self.theta = theta
    self.alpha = alpha
    self.beta = beta
    self.random_state = random_state

    # Cluster centres of the (scaled) known anomalies.
    self.centers = get_cluster_centers(self.anomalies, self.n_clusters, self.cluster_algo)
def __init__(self, anomalies, unlabel, classifer, cluster_algo='kmeans', n_clusters='auto',
             contamination=0.01, theta=0.85, alpha='auto', beta='auto', return_proba=False,
             random_state=2018):
    """Jointly standardise both sample sets, keep the settings, cluster the anomalies.

    :param anomalies: Observed anomaly samples.
    :param unlabel: Unlabeled samples.
    :param classifer: Classifier that is fitted on weighted samples and labels in
        order to predict the unlabeled samples.
    :param cluster_algo: str, {'kmeans', 'spectral', 'birch', 'dbscan'}, default='kmeans'
        Algorithm used to cluster the anomaly samples.
    :param n_clusters: int or 'auto', default='auto'
        Number of clusters / centroids; forwarded unchanged to
        ``get_cluster_centers``.
    :param contamination: float, expected in (0, 0.5), default=0.01
        Proportion of outliers in the data set.
    :param theta: float, expected in [0, 1], default=0.85
        isolation_score is weighted by theta, similarity_score by 1 - theta.
    :param alpha: positive float or 'auto', default='auto'
        Threshold for flagging an unlabeled sample as a potential anomaly
        (the automatic choice is described as the mean score of the anomalies).
    :param beta: positive float or 'auto', default='auto'
        Threshold for flagging an unlabeled sample as a reliable normal sample.
    :param return_proba: bool, default=False
        Whether to return the predicted probability of the positive (anomaly)
        class for each sample. Requires ``classifer`` to offer ``predict_proba``.
    :param random_state: Seed value, stored unchanged on the instance.
    """
    # Plain hyper-parameters first; none of them depends on the data.
    self.classifer = classifer
    self.cluster_algo = cluster_algo
    self.n_clusters = n_clusters
    self.contamination = contamination
    self.theta = theta
    self.alpha = alpha
    self.beta = beta
    self.return_proba = return_proba
    self.random_state = random_state

    # One scaler fitted on anomalies stacked above the unlabeled rows, so both
    # sets share the same standardisation; then cut the result back apart.
    boundary = len(anomalies)
    stacked = np.r_[anomalies, unlabel]
    standardised = StandardScaler().fit_transform(stacked)
    self.anomalies = standardised[:boundary, :]
    self.unlabel = standardised[boundary:, :]

    # The helper yields a pair: the cluster centres and a clustering score.
    clustering = get_cluster_centers(self.anomalies, self.n_clusters, self.cluster_algo)
    self.centers, self.cluster_score = clustering
def __init__(self, anomalies, unlabel, classifer, cluster_algo='kmeans', n_clusters='auto',
             kernel='rbf', verbose=3, contamination=0.01, theta=0.85, alpha='auto',
             beta='auto', return_proba=False, random_state=2018):
    """Jointly standardise both sample sets, keep the settings, cluster the anomalies.

    :param anomalies: Observed anomaly samples.
    :param unlabel: Unlabeled samples.
    :param classifer: Classifier that is fitted on weighted samples and labels in
        order to predict the unlabeled samples.
    :param cluster_algo: str, {'kmeans', 'spectral', 'birch', 'dbscan'}, default='kmeans'
        Algorithm used to cluster the anomaly samples.
    :param n_clusters: int or 'auto', default='auto'
        Number of clusters / centroids; forwarded unchanged to
        ``get_cluster_centers``.
    :param kernel: str, default='rbf'
        One of 'linear', 'poly', 'rbf', 'sigmoid', 'cosine' or 'precomputed'.
    :param verbose: int, default=3
        Verbosity setting: the higher the value, the fewer messages. KernelPCA is
        time-consuming, so this helps to follow the progress of the
        reconstruction; with verbose = m, information is printed every m rounds.
    :param contamination: float, expected in (0, 0.5), default=0.01
        Proportion of outliers in the data set.
    :param theta: float, expected in [0, 1], default=0.85
        isolation_score is weighted by theta, similarity_score by 1 - theta.
    :param alpha: positive float or 'auto', default='auto'
        Threshold for flagging an unlabeled sample as a potential anomaly
        (the automatic choice is described as the mean score of the anomalies).
    :param beta: positive float or 'auto', default='auto'
        Threshold for flagging an unlabeled sample as a reliable normal sample.
    :param return_proba: bool, default=False
        Whether to return the predicted probability of the positive (anomaly)
        class for each sample. Requires ``classifer`` to offer ``predict_proba``.
    :param random_state: Seed value, stored unchanged on the instance.
    """
    # Plain hyper-parameters first; none of them depends on the data.
    self.classifer = classifer
    self.cluster_algo = cluster_algo
    self.n_clusters = n_clusters
    self.kernel = kernel
    self.verbose = verbose
    self.contamination = contamination
    self.theta = theta
    self.alpha = alpha
    self.beta = beta
    self.return_proba = return_proba
    self.random_state = random_state

    # One scaler fitted on anomalies stacked above the unlabeled rows; the full
    # standardised matrix is kept, and the two subsets are slices of it.
    boundary = len(anomalies)
    stacked = np.r_[anomalies, unlabel]
    self.dataset = StandardScaler().fit_transform(stacked)
    self.anomalies = self.dataset[:boundary, :]
    self.unlabel = self.dataset[boundary:, :]

    # The helper yields a pair: the cluster centres and a clustering score.
    clustering = get_cluster_centers(self.anomalies, self.n_clusters, self.cluster_algo)
    self.centers, self.cluster_score = clustering