# Cluster the grey levels of a background image and a foreground image.
#
# Each image is read as greyscale, every pixel becomes a one-dimensional
# sample, and a separate Bayesian Gaussian mixture assigns each pixel to one
# of ``k`` components. The per-pixel component labels are reshaped back to
# the image shape and shown with matplotlib.
import pandas
import cv2
from matplotlib import pyplot
# BayesianGaussianMixture was used below without being imported, which made
# the script fail with a NameError as soon as the first model was created.
from sklearn.mixture import BayesianGaussianMixture

bg_input_path = "/home/k_mathin/PycharmProjects/DataMiningClass_v4/background_removal/imgs/bg.jpeg"
foreground_input_path = "/home/k_mathin/PycharmProjects/DataMiningClass_v4/background_removal/imgs/foreground.jpeg"

# Flag 0 asks OpenCV for a single-channel greyscale image.
bg = cv2.imread(bg_input_path, 0)
fg = cv2.imread(foreground_input_path, 0)

# cv2.imread reports an unreadable file by returning None rather than
# raising, so fail here with a clear message instead of at ``.shape`` below.
if bg is None:
    raise IOError("Could not read background image: " + bg_input_path)
if fg is None:
    raise IOError("Could not read foreground image: " + foreground_input_path)

bg_o_shape = bg.shape
fg_o_shape = fg.shape

# Number of mixture components fitted to each image.
k = 5

# One row per pixel, one feature (the grey level) per row.
bg_new_data = bg.reshape(-1, 1)
fg_new_data = fg.reshape(-1, 1)

bg_vgmm = BayesianGaussianMixture(n_components=k)
fg_vgmm = BayesianGaussianMixture(n_components=k)
# vgmm = GaussianMixture(n_components=k)

bg_vgmm = bg_vgmm.fit(bg_new_data)
fg_vgmm = fg_vgmm.fit(fg_new_data)

bg_cluater = bg_vgmm.predict(bg_new_data)
fg_cluater = fg_vgmm.predict(fg_new_data)

# Reshape the per-pixel labels back to the original image shape.
bg_img_cluater = bg_cluater.reshape(bg_o_shape[0], bg_o_shape[1])
fg_img_cluater = fg_cluater.reshape(fg_o_shape[0], fg_o_shape[1])

pyplot.subplot(2, 1, 1)
pyplot.imshow(bg_img_cluater)
pyplot.subplot(2, 1, 2)
# Title shared by every curve drawn in this section.
_CLUSTER_CURVE_TITLE = "Neural Network Learning Curve - Dataset 1 + Clustering Result"


def _report_best_cv_point(sizes, train_avg, cv_avg):
    """Print the training size, CV score and training score at the best CV score."""
    best = np.argmax(cv_avg)
    print((sizes[best]))
    print(cv_avg[best])
    print(train_avg[best])


# Summary of the learning curve computed immediately before this section.
_report_best_cv_point(train_sizes, train_mean, test_mean)

# --- Learning curve on the features augmented with the K-means result -------
mlp_learner = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='sgd',
                            learning_rate='adaptive', learning_rate_init=0.07)
train_sizes, train_scores, test_scores = learning_curve(
    mlp_learner, x_train_scaled_km, y_train, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.01, 1.0, 100))
# Average the scores over the cross-validation folds.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean, title=_CLUSTER_CURVE_TITLE,
          x_label="Training Size", y_label="Accuracy Score", color="orange",
          label='CV (+K-means Result)', linestyle='dashed')
# The legend label was missing its closing parenthesis.
plot_data(train_sizes, train_mean, title=_CLUSTER_CURVE_TITLE,
          x_label="Training Size", y_label="Accuracy Score", color="orange",
          label='Training (+K-means Result)')
print(train_sizes)
_report_best_cv_point(train_sizes, train_mean, test_mean)

# --- Learning curve on the features augmented with the mixture-model result -
gm = BayesianGaussianMixture(n_components=14, random_state=random_state, reg_covar=1e-01)
y_pred = gm.fit_predict(x_projected_pca)
# Append the predicted component index as one extra feature column.
x_train_scaled_em = np.column_stack((x_train_scaled, y_pred))
mlp_learner = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='sgd',
                            learning_rate='adaptive', learning_rate_init=0.07)
train_sizes, train_scores, test_scores = learning_curve(
    mlp_learner, x_train_scaled_em, y_train, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean, title=_CLUSTER_CURVE_TITLE,
          x_label="Training Size", y_label="Accuracy Score", color="blue",
          label='CV (+EM Result)', linestyle='dashed')
plot_data(train_sizes, train_mean, title=_CLUSTER_CURVE_TITLE,
          x_label="Training Size", y_label="Accuracy Score", color="blue",
          label='Training (+EM Result)')
print(train_sizes)
_report_best_cv_point(train_sizes, train_mean, test_mean)
def bin_test(X_train, y_train, X_test, y_test):
    """Fit a fixed panel of binary models and report their test scores.

    For every model the function fits on the training data, predicts the
    test data, records precision / macro recall / F1 and the fit+predict
    runtime, and saves a confusion-matrix plot through ``plot_confusion``.
    The misclassified test rows ("outliers") are then used to refit the same
    estimator on an 80/20 split of those rows; the scores of that second fit
    are reported separately, or ``-1`` when the misclassified rows contain a
    single label only.

    Parameters
    ----------
    X_train, y_train
        Training samples and labels, passed unchanged to ``fit``.
    X_test : pandas.DataFrame
        Test samples. Must contain a ``'TeamID'`` column, which is used as
        the merge key when collecting the misclassified rows.
    y_test : pandas.Series
        Test labels (``.values`` is read from it).

    Returns
    -------
    pandas.DataFrame
        One row per model: name, precision, recall, F1 score, runtime.
    """
    # Local import: DataFrame.from_items was removed from pandas, and an
    # OrderedDict keeps the column order on every Python version.
    from collections import OrderedDict

    starttime = time.time()

    # binary models
    models = [
        "BNB", "GNB", "LDA", "SVM_G", "5NN", "LR2", "P2", "SGD", "ADA", "DT",
        "RF", "DPGMM", "ET", "GMM", "MLP",
    ]
    clfs = [
        BernoulliNB(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        svm.SVC(kernel='rbf', probability=True),
        neighbors.KNeighborsClassifier(n_neighbors=5),
        LogisticRegression(),
        Perceptron(penalty='l2', tol=None, max_iter=1000),
        SGDClassifier(tol=0.0001, power_t=0.4, average=True),
        # base_estimator=None was the default and is left out so the call
        # does not depend on that keyword still existing.
        AdaBoostClassifier(n_estimators=100),
        DecisionTreeClassifier(),
        RandomForestClassifier(oob_score=True),
        BayesianGaussianMixture(
            n_components=2, max_iter=1000,
            weight_concentration_prior_type='dirichlet_process', tol=0.0001),
        ExtraTreesClassifier(bootstrap=True, oob_score=True, n_estimators=4),
        GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2),
        MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000),
    ]

    results = []
    outlier_results = []

    for name, estimator in zip(models, clfs):
        print("model being tested: {0}".format(name))

        time_start = time.time()
        clf = estimator.fit(X_train, y_train)
        predict = clf.predict(X_test)
        runtime = time.time() - time_start

        p = metrics.precision_score(y_test, predict)
        r = metrics.recall_score(y_test, predict, average="macro")
        f = metrics.f1_score(y_test, predict)

        # find outliers: test rows whose prediction differs from the label
        labels_and_predicted = pd.DataFrame(OrderedDict([
            ('TeamID', X_test['TeamID'].values),
            ('predicted', predict),
            ('label', y_test.values),
        ]))
        outliers = X_test.merge(labels_and_predicted, on='TeamID')
        outliers = outliers[outliers['label'] != outliers['predicted']]

        # -1 marks "not evaluated" (fewer than two labels among the outliers).
        p_new = r_new = f_new = -1
        if outliers['label'].unique().size > 1:
            # train separate model on outliers
            mislabeled_labels = outliers['label']
            mislabeled_samples = outliers.drop(['label', 'predicted'], axis=1)
            (train_vars, validate_vars,
             train_outcomes, validate_outcomes) = train_test_split(
                mislabeled_samples, mislabeled_labels, test_size=0.2)
            # NOTE: this refits the same estimator object; ``predict`` above
            # was computed before the refit and is unaffected.
            clf_new = estimator.fit(train_vars, train_outcomes)
            validate_predicted = clf_new.predict(validate_vars)

            # evaluate
            p_new = metrics.precision_score(validate_outcomes, validate_predicted)
            r_new = metrics.recall_score(
                validate_outcomes, validate_predicted, average="macro")
            f_new = metrics.f1_score(validate_outcomes, validate_predicted)

        outlier_results.append([name, p_new, r_new, f_new])
        results.append([name, p, r, f, runtime])

        # create confusion matrix
        cm = metrics.confusion_matrix(y_test, predict)
        plot_confusion(cm, y_test, filename='{0}_confusion.png'.format(name))

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("")
    print("All data models")
    print("")
    print(tabulate.tabulate(
        results,
        headers=['Classif', 'Precision', 'Recall', 'F1 Score', 'Runtime']))
    print("")
    print("Outlier models")
    print("")
    # Outlier rows carry no runtime, so the header has four columns only.
    print(tabulate.tabulate(
        outlier_results,
        headers=['Classif', 'Precision', 'Recall', 'F1 Score']))
    print("Binary test took {0} secs".format(time.time() - starttime))

    return pd.DataFrame(data=results)
iter(skf.split(data.values, sedclass[sedclass.columns[0]]))) X_train = data.iloc[train_index].dropna().values y_train = sedclass.iloc[train_index].dropna().values X_test = data.iloc[test_index].dropna().values y_test = sedclass.iloc[test_index].dropna().values n_classes = len(np.unique(y_train)) # Try GMMs using different types of covariances. estimators = dict( (cov_type, BayesianGaussianMixture( n_components=n_classes, covariance_type=cov_type, max_iter=100, random_state=0, weight_concentration_prior_type='dirichlet_distribution')) for cov_type in ['spherical', 'diag', 'tied', 'full']) n_estimators = len(estimators) plt.figure(figsize=(3 * n_estimators // 2, 6)) plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99) for index, (name, estimator) in enumerate(estimators.items()): # Since we have class labels for the training data, we can
def BGMreport(path,visualize=1,cut_n=6):
    """Fit 3-component Bayesian Gaussian mixtures to five density profiles and flag sharp peaks.

    The profiles come from ``finddensefromcut(path, cut_n)``; the five
    profiles with indices ``cut_n-5 .. cut_n-1`` are sampled with
    ``tosample`` and each sample set is fitted with a spherical
    ``BayesianGaussianMixture``. Components are sorted by mean.

    Parameters
    ----------
    path
        Passed unchanged to ``finddensefromcut``.
    visualize : int
        When equal to 1, the five profiles and their fitted components are
        plotted with matplotlib and ``plt.show()`` is called.
    cut_n : int
        Index bound selecting which five profiles are analysed.

    Returns
    -------
    ans : numpy.ndarray, shape (12,)
        Flags set to 1 by the rules below: ``ans[0]`` is set whenever any
        rule fires, ``ans[1..6]`` are set for a matching pair (i in 0..2,
        j in 3..4) at index ``i*2+j-2``, and ``ans[7..11]`` are set per
        profile at index ``7+i``.
    BGM45 : numpy.ndarray, shape (45,)
        Nine values per profile: three weights scaled by the sample count,
        three means, three covariances (in that order).

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm the attachment of the ``else`` / ``continue`` statements
    in the overlap pre-processing against the original file.
    """
    # t2: divisor of the profile length giving the maximum mean distance for
    # two components of different profiles to count as the same position.
    t2=15
    # t3: upper bound on cov/weight/n_samples (the "sharpness" figure).
    t3=0.07
    n_components=3
    denses,_=finddensefromcut(path,cut_n)
    # Peak height of each analysed profile.
    maxd=[]
    for dense in denses[(cut_n-5):]:
        maxd.append(max(dense))
    lofd=len(denses[0])
    samples=list()
    # sampling for BGM
    for i in range((cut_n-5),cut_n):
        samples.append(np.array(tosample(denses[i])).reshape(-1,1))
    allmeans=[]
    allcovs=[]
    allweights=[]
    BGM45=np.zeros((45))
    for i in range(5):
        BGM=BayesianGaussianMixture(n_components=n_components,covariance_type='spherical',weight_concentration_prior=0.000000000001,max_iter=500)
        BGM.fit(samples[i])
        means=np.reshape(BGM.means_,(-1,))
        # Sort the components by mean and apply the same order to the
        # covariances and weights.
        permu=np.argsort(means)
        means=means[permu]
        BGM45[i*9+3:i*9+6]=means
        allmeans.append(means)
        covs=BGM.covariances_
        covs=covs[permu]
        BGM45[i*9+6:i*9+9]=covs
        allcovs.append(covs)
        weights=BGM.weights_
        weights=weights[permu]
        # Weights are stored scaled by the number of samples of the profile.
        BGM45[i*9:i*9+3]=weights*len(samples[i])
        allweights.append(weights)
    if visualize==1:
        l=0
        # visualization
        for i in range(cut_n-5,cut_n):
            l+=1
            plt.subplot(2,n_components,l),plt.plot(denses[i])
            X=np.linspace(0,lofd,num=200,endpoint=False)
            Ys=toGM(X,n_components,allmeans[l-1],allcovs[l-1],allweights[l-1])
            for j in range(n_components):
                #plt.subplot(1,5,l),plt.plot([allmeans[l-1][j],allmeans[l-1][j]],[0,255])
                plt.subplot(2,n_components,l),plt.plot(X,len(samples[l-1])*Ys[j])
                #plt.subplot(2,n_components,l),plt.plot(X,Ys[j])
            plt.ylim(0,255)
        plt.show()
    ans=np.zeros((12,))
    # pre[i][j] == 1 marks component j of profile i as suppressed.
    pre=np.zeros((5,n_components))
    # Preprocess the data so that overlapping peaks (far overlap and near
    # overlap) do not distort the result: identify far/near overlap cases,
    # suppress far-overlap peaks and amplify near-overlap peaks.
    # Ideally two far-overlap peaks would be merged into a single peak at the
    # midpoint mean, but for now both are simply suppressed, since they are
    # unlikely to be a monoclonal peak anyway. Far overlap means the two
    # peaks are really one peak in the plot that the mixture split in two to
    # fit the Gaussians better; they are suppressed because a peak that can
    # be split like that generally has a large covariance and is not sharp.
    for i in range(5):
        for j in range(n_components):
            for l in range(n_components):
                if j<l:
                    # ignore when weight difference is too large
                    if allweights[i][j]/allweights[i][l]>3 or allweights[i][j]/allweights[i][l]<0.3333:
                        continue
                    # If the cov difference is large the pair is excluded
                    # from far overlap, because there should be two peaks in
                    # the original density plot.
                    # NOTE(review): `mean` is called with two positional
                    # arguments and is not defined in this block -- confirm
                    # it is a two-value average helper.
                    if allcovs[i][j]/allweights[i][j]/allcovs[i][l]*allweights[i][l]/abs(allmeans[i][j]-allmeans[i][l])*mean(np.sqrt(allcovs[i][j]),np.sqrt(allcovs[i][l]))>2 or allcovs[i][l]/allweights[i][l]/allcovs[i][j]*allweights[i][j]/abs(allmeans[i][j]-allmeans[i][l])*mean(np.sqrt(allcovs[i][j]),np.sqrt(allcovs[i][l]))>2:
                        # Near overlap is a sharp peak sitting on a mild one.
                        # It happens when a monoclonal peak has a background
                        # polyclonal peak. The sharp peak's weight is
                        # amplified when the cov difference is large enough
                        # or the distance is close enough, so that it is
                        # detected as abnormal in the classification step.
                        if abs(allmeans[i][j]-allmeans[i][l])<3.5*np.sqrt(max(allcovs[i][j],allcovs[i][l])):
                            neww=allweights[i][j]+allweights[i][l]
                            if allcovs[i][l]/allweights[i][l]/allcovs[i][j]*allweights[i][j]>1 and allweights[i][j]>0.15:
                                if allcovs[i][j]<400:
                                    allweights[i][j]=neww
                            else:
                                if allcovs[i][l]<400:
                                    allweights[i][l]=neww
                        continue
                    # If one of the considered peaks has a very small
                    # variance it cannot be the far-overlap situation, where
                    # the original peak is mild.
                    if allcovs[i][j]/allweights[i][j]/len(samples[i])<t3/2.5 or allcovs[i][l]/allweights[i][l]/len(samples[i])<t3/2.5:
                        continue
                    if allcovs[i][j]<70 or allcovs[i][l]<70:
                        continue
                    # Far overlap: there is only one mild peak in the
                    # original density plot and the mixture broke it into two
                    # sharper peaks to fit the Gaussian curves more
                    # accurately. Both peaks are suppressed, so the profile
                    # cannot be flagged as abnormal because of these two
                    # components.
                    elif abs(allmeans[i][j]-allmeans[i][l])<3.5*np.sqrt(max(allcovs[i][j],allcovs[i][l])):
                        pre[i][j]=pre[i][l]=1
    # Pair rule: compare every component of profiles 0..2 with every
    # component of profiles 3..4.
    for i in [0,1,2]:
        for j in [3,4]:
            if maxd[i]<50 or maxd[j]<50:
                continue
            else:
                for k in range(len(allmeans[i])):
                    for l in range(len(allmeans[j])):
                        if pre[i][k]==1 or pre[j][l]==1:
                            continue
                        if abs(allmeans[i][k]-allmeans[j][l])>lofd/t2:
                            continue
                        else:
                            if allweights[i][k]<0.1 or allweights[j][l]<0.1:
                                continue
                            else:
                                # The t figure represents the sharpness of
                                # the peak. Variance alone is not enough;
                                # n_samples and weights are considered too.
                                if allcovs[i][k]/allweights[i][k]/len(samples[i])>t3 or allcovs[j][l]/allweights[j][l]/len(samples[j])>t3:
                                    continue
                                else:
                                    ans[i*2+j-2]=1
                                    ans[7+i]=1
                                    ans[7+j]=1
                                    ans[0]=1
    # Single-profile rule: flag a profile that has one sufficiently tall,
    # heavy and sharp unsuppressed component.
    for i in range(5):
        for j in range(n_components):
            if pre[i][j]==1:
                continue
            if maxd[i]<80:
                continue
            elif allweights[i][j]<0.05:
                continue
            # t-figure
            if allcovs[i][j]/allweights[i][j]/len(samples[i])>t3:
                continue
            else:
                ans[7+i]=1
                ans[0]=1
    return ans,BGM45
from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.mixture import GMM, DPGMM, BayesianGaussianMixture, VBGMM from sklearn.svm import NuSVC, SVC # Useful for seeing all sklearn estimators that have `predict_prob` attribute estimators = all_estimators() for name, class_ in estimators: if hasattr(class_, 'predict_proba'): print(name) # Now pick and choose the ones you like estimators = { AdaBoostClassifier(): 'AdaBoost', BayesianGaussianMixture(): 'BayesianGaussianMixture', BernoulliNB(): 'BernoulliNB', DPGMM(): 'DPGMM', ExtraTreesClassifier(): 'ExtraTreesClassifier', GMM(): 'GMM', GaussianNB(): 'GaussianNB', GaussianProcessClassifier(): 'GaussianProcessClassifier', GradientBoostingClassifier(): 'GradientBoostingClassifier', KNeighborsClassifier(): 'KNeighborsClassifier', LabelPropagation(): 'LabelPropagation', LabelSpreading(): 'LabelSpreading', LinearDiscriminantAnalysis(): 'LinearDiscriminantAnalysis', LogisticRegression(): 'LogisticRegression', MLPClassifier(): 'MLPClassifier', NuSVC(): 'NuSVC', QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis',
# from numpy import where
# from sklearn.datasets import make_classification
# from sklearn.cluster import Birch
# from matplotlib import pyplot
# # define the model
# model = Birch(threshold=0.01, n_clusters=10)
# # fit the model
# model.fit(X)
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# training gaussian mixture model
from sklearn.mixture import BayesianGaussianMixture

n_components = 5
# n_components is passed by keyword: the estimator's constructor parameters
# are keyword-only in current scikit-learn, so the positional form fails.
gmm = BayesianGaussianMixture(n_components=n_components,
                              max_iter=1000000,
                              weight_concentration_prior=1)

# Densify the sparse matrix once and reuse it for both fit and predict.
_X_dense = X.toarray()
gmm.fit(_X_dense)

# predictions from gmm
labels = gmm.predict(_X_dense)

# Feature indices of every component mean, largest value first.
order_centroids = gmm.means_.argsort()[:, ::-1]
# NOTE(review): newer scikit-learn vectorizers expose
# get_feature_names_out() instead -- confirm against the installed version.
terms = vectorizer.get_feature_names()
centroids = gmm.means_

# Cosine distances between documents, and from documents to component means.
dist = 1 - cosine_similarity(X)
centdist = 1 - cosine_similarity(X, centroids)
# frame = pd.DataFrame(X)
# frame['cluster'] = labels
# # frame.columns = ['Weight', 'cluster']
def colour(img, scale=1.0, samples=10000):
    '''Model the distribution of colours in an image.

    The method models the distribution of the hue and saturation channels of
    an image after it has been converted from RGB to HSV.  Dropping the value
    channel decouples intensity from the colour information, making it easier
    to visualize how the colours themselves appear.  The density estimate is
    computed on a copy of the image rescaled by ``1 / scale`` and the
    visualization is rescaled by ``scale`` afterwards.

    Parameters
    ----------
    img : numpy.ndarray
        input RGB image, shape (height, width, 3)
    scale : float
        image scaling factor; the image is rescaled by ``1 / scale`` before
        processing and the output by ``scale`` after it
    samples : int
        number of samples to draw when generating the density estimate

    Returns
    -------
    numpy.ndarray
        a new RGB image visualizing the colour distribution

    Raises
    ------
    ValueError
        if the input image is not an RGB image
    '''
    # Reject anything that is not (height, width, 3) before touching it; the
    # HSV conversion below is only defined for three channels.
    if img.ndim != 3 or img.shape[2] != 3:
        raise ValueError('Require RGB image to compute colour distribution.')

    img = skimage.transform.rescale(img, 1.0 / scale, anti_aliasing=True,
                                    mode='constant', multichannel=True)
    img = skimage.color.rgb2hsv(img)
    height, width = img.shape[0:2]

    # Extract the colour vectors (hue, saturation) and sample from them.
    ind = generate_samples(width, height, samples)
    X = np.squeeze(img[ind[1, :], ind[0, :], 0:2])

    # Convert from polar to cartesian coordinates (this makes the
    # visualization easier): saturation is the radius, hue the angle.
    mag = X[:, 1]
    ang = 2 * np.pi * X[:, 0]
    # Both right-hand sides are evaluated before X is overwritten, and the
    # first assignment only touches column 0, so ``mag`` (a view of
    # column 1) is still intact for the second one.
    X[:, 0] = mag * np.cos(ang)
    X[:, 1] = mag * np.sin(ang)

    # Perform a density estimation using a GMM.
    gmm = BayesianGaussianMixture(
        n_components=25,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1e-3)
    gmm.fit(X)

    # Generate the output array: evaluate the density on a regular grid
    # covering [-1, 1] x [-1, 1].
    x, y = np.meshgrid(np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    X = np.c_[x.flatten(), y.flatten()]
    scores = np.exp(gmm.score_samples(X))
    max_score = np.max(scores)

    # Apply a gamma correction to make the image look a bit nicer.
    val = np.reshape(scores, (height, width)) / max_score
    val = skimage.exposure.adjust_gamma(val, gamma=0.3)

    # Convert back from HSV to RGB.  The saturation needs to be clamped so
    # that it doesn't produce invalid values during the HSV->RGB conversion.
    mag = x**2 + y**2
    sat = np.sqrt(mag)
    sat[sat > 1] = 1

    # The hue also needs to be adjusted since atan2() returns a value between
    # -pi and pi, but the hue needs to be between 0 and 1.
    hue = np.arctan2(y, x)
    hue[hue < 0] = hue[hue < 0] + 2 * np.pi
    hue /= 2 * np.pi

    output = np.dstack((hue, sat, val))
    output = skimage.color.hsv2rgb(output)
    output = skimage.transform.rescale(output, scale, anti_aliasing=True,
                                       mode='constant', multichannel=True)
    return output
def test_bayesian_mixture_precisions_prior_initialisation():
    """Check validation and initialisation of the precision-related priors.

    Covers ``degrees_of_freedom_prior`` (rejected value, explicit value,
    default-equivalent value) and ``covariance_prior`` (explicit value per
    covariance type, rejected spherical value, default per covariance type).
    """
    cov_types = ['full', 'tied', 'diag', 'spherical']
    random_gen = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    data = random_gen.rand(n_samples, n_features)

    # A degrees_of_freedom_prior equal to n_features - 1 must be rejected.
    rejected_dof = n_features - 1.
    mixture = BayesianGaussianMixture(degrees_of_freedom_prior=rejected_dof,
                                      random_state=random_gen)
    dof_message = ("The parameter 'degrees_of_freedom_prior' should be "
                   "greater than %d, but got %.3f."
                   % (n_features - 1, rejected_dof))
    assert_raise_message(ValueError, dof_message, mixture.fit, data)

    # An explicit, valid degrees_of_freedom_prior is stored unchanged.
    chosen_dof = random_gen.rand() + n_features - 1.
    mixture = BayesianGaussianMixture(degrees_of_freedom_prior=chosen_dof,
                                      random_state=random_gen)
    mixture = mixture.fit(data)
    assert_almost_equal(chosen_dof, mixture.degrees_of_freedom_prior_)

    # The value matching the default (n_features) is stored unchanged too.
    default_dof = n_features
    mixture = BayesianGaussianMixture(degrees_of_freedom_prior=default_dof,
                                      random_state=random_gen)
    mixture = mixture.fit(data)
    assert_almost_equal(default_dof, mixture.degrees_of_freedom_prior_)

    # An explicit covariance_prior is stored unchanged for every type.
    chosen_cov_prior = {
        'full': np.cov(data.T, bias=1) + 10,
        'tied': np.cov(data.T, bias=1) + 5,
        'diag': np.diag(np.atleast_2d(np.cov(data.T, bias=1))) + 3,
        'spherical': random_gen.rand()
    }
    mixture = BayesianGaussianMixture(random_state=random_gen)
    for kind in cov_types:
        mixture.set_params(covariance_type=kind,
                           covariance_prior=chosen_cov_prior[kind])
        mixture.fit(data)
        assert_almost_equal(chosen_cov_prior[kind],
                            mixture.covariance_prior_)

    # A negative spherical covariance_prior must be rejected.
    rejected_cov_prior = -1.
    mixture = BayesianGaussianMixture(covariance_type='spherical',
                                      covariance_prior=rejected_cov_prior,
                                      random_state=random_gen)
    cov_message = ("The parameter 'spherical covariance_prior' "
                   "should be greater than 0., but got %.3f."
                   % rejected_cov_prior)
    assert_raise_message(ValueError, cov_message, mixture.fit, data)

    # Without an explicit value the prior is derived from the data.
    per_feature_var = np.var(data, axis=0, ddof=1)
    default_cov_prior = {
        'full': np.atleast_2d(np.cov(data.T)),
        'tied': np.atleast_2d(np.cov(data.T)),
        'diag': per_feature_var,
        'spherical': per_feature_var.mean()
    }
    mixture = BayesianGaussianMixture(random_state=0)
    for kind in cov_types:
        mixture.set_params(covariance_type=kind)
        mixture.fit(data)
        assert_almost_equal(default_cov_prior[kind],
                            mixture.covariance_prior_)
X_pen_scaled = pen_kmeans.fit_predict(X_pen_scaled) X_pen_scaled = X_pen_scaled.reshape(-1, 1) X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split( X_pen_scaled, ypen, test_size=0.20) pen_classifier.fit(X_train_pen, y_train_pen) pen_pred = pen_classifier.predict(X_test_pen) pen_error_kmean.append(1 - metrics.accuracy_score(pen_pred, y_test_pen)) #=========================================================== #===========================EM============================= from sklearn.decomposition import FastICA for i in range(1, 31): X_pen_scaled = pen_scaler.fit_transform(Xpen) pen_bgm = BayesianGaussianMixture(n_components=i) X_pen_scaled = pen_bgm.fit_predict(X_pen_scaled) X_pen_scaled = X_pen_scaled.reshape(-1, 1) X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split( X_pen_scaled, ypen, test_size=0.20) pen_classifier.fit(X_train_pen, y_train_pen) pen_pred = pen_classifier.predict(X_test_pen) pen_error_em.append(1 - metrics.accuracy_score(pen_pred, y_test_pen)) #=========================================================== plt.figure(figsize=(12, 6)) plt.plot(range(1, 31), pen_error, label='No Clustering',
# ------------------------ # # from sklearn.covariance import EllipticEnvelope from sklearn.ensemble import IsolationForest from sklearn.mixture import GaussianMixture, BayesianGaussianMixture anomaly_algorithms = [ ("Elliptic Envelope", EllipticEnvelope(contamination=outliers_fraction)), ("GMM (2, full)", GaussianMixture(n_components=2, covariance_type='full')), ("GMM (4, full)", GaussianMixture(n_components=4, covariance_type='full')), #("Gaussian Mixture model (32, full)", GaussianMixture(n_components=4, covariance_type='diag', random_state=1)), ("Baysian GMM ", BayesianGaussianMixture(n_components=12, covariance_type='diag', random_state=1, n_init=4, degrees_of_freedom_prior=1.1, max_iter=20)), # Not yet supported #( "Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)), #("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)), ] # %% # Plotting tools # ------------------------ # # Plots the anomaly score landscape def plot_results(ax, model, X):
# Author: Guillaume Lemaitre <*****@*****.**>
# License: BSD 3 clause

import pytest
import numpy as np

from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture


@pytest.mark.parametrize(
    "estimator",
    [GaussianMixture(), BayesianGaussianMixture()]
)
def test_gaussian_mixture_n_iter(estimator):
    """After fitting, ``n_iter_`` must equal the number of iterations run."""
    expected_iterations = 1
    samples = np.random.RandomState(0).rand(10, 5)
    # Capping max_iter at 1 forces exactly one iteration to be performed.
    estimator.set_params(max_iter=expected_iterations).fit(samples)
    assert estimator.n_iter_ == expected_iterations
def detectoutliers(type, nodesattr_pre, nodesattr_new):
    """Find nodes whose attributes changed abnormally between two snapshots.

    Parameters
    ----------
    type : int
        ``0`` selects the attribute-difference detector; any other value
        selects the cluster-split detector.
    nodesattr_pre, nodesattr_new : dict
        Map each node to its attribute sequence in the previous / new
        snapshot. Every attribute sequence is normalised with
        ``scaledattrs`` using the extrema returned by ``maxminvalue``.
        NOTE(review): element 0 of an attribute sequence is compared with
        ``1 / len(nodesattr_pre)`` -- presumably a normalised degree; confirm.

    Returns
    -------
    dict
        Maps each outlier node to a score. For ``type == 0`` the score is
        the norm of the attribute change; otherwise it is one minus the
        fraction of the node's former cluster that moved together with it.

    Notes
    -----
    ``type == 0``: the norm of the change of every node seen in either
    snapshot is fed to ``EllipticEnvelope(contamination=0.1)``; flagged nodes
    are kept only if their change exceeds 1.5 times the change produced by
    the first node found whose first attribute is below ``1.1 / n_previous``.

    Otherwise: nodes present in both snapshots are clustered independently
    in each snapshot with a 6-component ``BayesianGaussianMixture``. When a
    previous cluster splits, members of any fragment holding at most 30 % of
    the previous cluster are reported.
    """
    outlierrecord = {}
    time_start = time.time()

    # Extrema of the k-core number, used for normalisation.
    mink_pre, maxk_pre = maxminvalue(nodesattr_pre)
    mink_new, maxk_new = maxminvalue(nodesattr_new)
    nodes_pre = nodesattr_pre.keys()
    nodes_new = nodesattr_new.keys()
    nodes = list(set(nodes_pre) | set(nodes_new))

    if type == 0:
        # Norm of the difference of the high-dimensional attributes between
        # the two snapshots. To avoid errors the first coordinate is a
        # constant 1, i.e. the minimum point is [1, 0].
        difference = []
        for n in nodes:
            differ = [1, 0]
            in_pre = n in nodesattr_pre
            in_new = n in nodesattr_new
            if in_pre and in_new:
                tmpattrs_pre = scaledattrs(mink_pre, maxk_pre,
                                           nodesattr_pre[n])
                tmpattrs_new = scaledattrs(mink_new, maxk_new,
                                           nodesattr_new[n])
                differ[1] = LA.norm(
                    np.array(tmpattrs_new) - np.array(tmpattrs_pre))
            elif in_pre:
                tmpattrs_pre = scaledattrs(mink_pre, maxk_pre,
                                           nodesattr_pre[n])
                differ[1] = LA.norm(np.array(tmpattrs_pre))
            elif in_new:
                tmpattrs_new = scaledattrs(mink_new, maxk_new,
                                           nodesattr_new[n])
                differ[1] = LA.norm(np.array(tmpattrs_new))
            if differ[1] == 0:
                differ[1] = 0.0001
            difference.append(differ)
        difference = np.array(difference)

        # Threshold: take the attribute difference caused by the
        # disappearance of one degree-1 node and multiply it by 1.5;
        # anomalies below the threshold are not counted. With no previous
        # nodes there is nothing to derive it from, so it stays 0 (this also
        # avoids dividing by zero).
        threshold = 0
        if nodesattr_pre:
            mindegree = float(1) / len(nodesattr_pre)
            for n in nodes_pre:
                if nodesattr_pre[n][0] < mindegree * 1.1:
                    tmpattrs_pre = scaledattrs(mink_pre, maxk_pre,
                                               nodesattr_pre[n])
                    threshold = LA.norm(np.array(tmpattrs_pre))
                    break
        threshold = threshold * 1.5

        # Detect outliers.
        dectector = EllipticEnvelope(contamination=0.1)
        results = dectector.fit(difference).predict(difference)

        # Update the outlier record: -1 is the detector's outlier label.
        for node, flag, differ in zip(nodes, results, difference):
            if flag == -1 and differ[1] > threshold:
                outlierrecord[node] = differ[1]
    else:
        # Cluster only the nodes that appear in both snapshots.
        nodes_together = list(set(nodes_pre) & set(nodes_new))

        # Normalise the k-core number and extract the attribute values.
        nodesval_pre = []
        nodesval_new = []
        for n in nodes_together:
            nodesval_pre.append(
                scaledattrs(mink_pre, maxk_pre, nodesattr_pre[n]))
            nodesval_new.append(
                scaledattrs(mink_new, maxk_new, nodesattr_new[n]))
        nodesval_pre = np.array(nodesval_pre)
        nodesval_new = np.array(nodesval_new)

        # Clustering.
        class_pre = BayesianGaussianMixture(
            n_components=6, n_init=3).fit(nodesval_pre).predict(nodesval_pre)
        class_new = BayesianGaussianMixture(
            n_components=6, n_init=3).fit(nodesval_new).predict(nodesval_new)

        # Cluster split: for every previous cluster, look at the clusters
        # its members fall into in the new snapshot. (A symmetric
        # cluster-merge check existed in the original but was disabled.)
        visited_class = set()
        for c in class_pre:
            if c in visited_class:
                continue
            visited_class.add(c)

            nodesindex = np.where(class_pre == c)[0]
            newclass = class_new[nodesindex]
            uniqnewclass = list(set(newclass))
            # Still a single cluster: nothing split off.
            if len(uniqnewclass) == 1:
                continue

            for cnew in uniqnewclass:
                tmpnodesid = np.where(newclass == cnew)[0]
                # If more than 30 % of the nodes moved to another cluster
                # together it is not an anomaly (e.g. 8 nodes of one cluster
                # becoming two clusters of 4); otherwise it is.
                if len(tmpnodesid) > 0.3 * len(nodesindex):
                    continue
                score = 1 - float(len(tmpnodesid)) / len(nodesindex)
                for member in tmpnodesid:
                    outlierrecord[nodes_together[nodesindex[member]]] = score

    time_end = time.time()
    print('detect outliers', time_end - time_start)
    return outlierrecord
def fit_gmm(
    max_components,
    n_distances,
    atoms,
    distances,
    regularization_type="bic",
    covariance_type="diag",
):
    """
    Fit a GMM to a set of distances.

    This routine will fit a Gaussian mixture model from a set of input
    distances using sklearn_. The resulting set of parameters can be used to
    initialize a `GMMDistanceRestraint` in a MELD simulation.

    .. _sklearn: http://scikit-learn.org/stable/modules/mixture.html

    Parameters
    ----------
    max_components: int
        Maximum number of components to use in fitting GMM.
    n_distances: int
        Number of distances involved in GMM
    atoms: list of (int, str, int, str) tuples.
        The atoms that are involved in each distance are specified as a list
        of `n_distances` tuples, each of the form (r1, n1, r2, n2), where
        r1, r2 are the integer residue indices starting from one, and n1, n2
        are the atom names.
    distances: array_like(n_dim=2)
        An (n_samples, n_distances) array of distances (in nm) to fit. It is
        converted with ``numpy.asarray`` before use.
    regularization_type: str
        The type of regularization to use, options are "bic" and "dirichlet".
    covariance_type: str
        The form of the covariance matrix, options are "diag" and "full".

    Returns
    -------
    GMMParams
        The fit parameters, which can be used to initialize a
        `meld.system.restraints.GMMDistanceRestraint` using
        ``GMMDistanceRestraint.from_params``.

    Raises
    ------
    ValueError
        If `distances` is not two-dimensional with `n_distances` columns, if
        `atoms` does not have `n_distances` entries, if
        `regularization_type` or `covariance_type` is not a supported option,
        or if `max_components` is outside ``1..32``.

    Notes
    -----
    There are two ways to regularize in order to prevent over fitting.

    ``regularization_type="bic"`` will use the Bayesian information criterion
    to penalize models that have more parameters. When using ``bic``, The
    final number of components in the model will be less than or equal to
    `max_components`. ``regularization_type=dirichlet`` will use a Dirichlet
    process prior on the weight distributions. The final number of components
    in the model will always be equal to `max_components`, but most of the
    weights will be small.

    There are two forms for the covariance matrix, which differ in the number
    of parameters and expressiveness.

    ``covariance_type="diag"`` will fit using a diagonal covariance matrix.
    This has few parameters, but does not capture correlations between input
    distances. Typically, choosing ``"diag"`` will result in a model with
    more components. ``covariance_type="full"`` will fit using a full
    representation of the covariance matrix. This captures correlations
    between input distances, but has far more parameters and is potentially
    prone to over fitting.
    """
    #
    # Constants
    #
    N_INIT = 25
    MAX_ITER = 1000
    KFOLD_SPLITS = 5
    REG_COVAR = 1e-4
    RANDOMSEARCH_TRIALS = 32

    #
    # Check the inputs
    #
    # The parameter is documented as array_like, so accept nested lists too.
    distances = np.asarray(distances)
    if distances.ndim != 2 or distances.shape[1] != n_distances:
        raise ValueError("distances must have shape (n_samples, n_distances)")

    if len(atoms) != n_distances:
        raise ValueError(
            "atoms must be a list of (ind1, name1, ind2, name2) of "
            "length n_distances"
        )

    if regularization_type not in ["bic", "dirichlet"]:
        raise ValueError('regularization_type must be one of ["bic", "dirichlet"]')

    if covariance_type not in ["diag", "full"]:
        raise ValueError('covariance_type must be one of ["diag", "full"]')

    if max_components < 1:
        raise ValueError("max_components must be >= 1")

    if max_components > 32:
        raise ValueError("MELD supports a maximum of 32 GMM components")

    #
    # Create and fit the model
    #
    if regularization_type == "bic":
        # BIC fit
        # Search different values of n_components to find the minimal
        # BIC.
        models = []
        for i in range(1, max_components + 1):
            g = GaussianMixture(
                n_components=i,
                n_init=N_INIT,
                max_iter=MAX_ITER,
                covariance_type=covariance_type,
                reg_covar=REG_COVAR,
            )
            g.fit(distances)
            models.append((g.bic(distances), g))
        # min() returns the first model with the lowest BIC, i.e. the
        # smallest component count among ties.
        gmm = min(models, key=lambda scored: scored[0])[1]
    else:
        # Dirichlet process fit
        # use RandomSearchCV to optimize hyperparameters
        params = {
            "weight_concentration_prior": LogUniformSampler(1e-6, 10),
            "mean_precision_prior": LogUniformSampler(1, 10),
        }
        # n_components must be given by keyword: the estimator's constructor
        # parameters are keyword-only in current scikit-learn.
        model = BayesianGaussianMixture(
            n_components=max_components,
            n_init=N_INIT,
            max_iter=MAX_ITER,
            covariance_type=covariance_type,
            reg_covar=REG_COVAR,
        )
        rs = RandomizedSearchCV(
            model,
            param_distributions=params,
            n_iter=RANDOMSEARCH_TRIALS,
            cv=KFold(n_splits=KFOLD_SPLITS, shuffle=True),
        )
        rs.fit(distances)
        gmm = rs.best_estimator_

    # turn the vector representation of the diagonal into a full
    # precision matrix
    precisions = gmm.precisions_
    if covariance_type == "diag":
        assert len(precisions.shape) == 2
        precisions = np.array([np.diag(row) for row in precisions])

    # convert the list of atoms into the correct form: one (residue, name)
    # pair per atom, two consecutive entries per distance
    new_atoms = []
    for r1, n1, r2, n2 in atoms:
        new_atoms.append((r1, n1))
        new_atoms.append((r2, n2))

    # Return the parameters for a GMM
    return GMMParams(
        n_components=gmm.weights_.shape[0],
        n_distances=n_distances,
        atoms=new_atoms,
        weights=gmm.weights_,
        means=gmm.means_,
        precisions=precisions,
    )
def test_bayesian_mixture_precisions_prior_initialisation():
    """Check validation and initialisation of the precision prior parameters.

    Exercises ``degrees_of_freedom_prior`` and ``covariance_prior``: invalid
    values must make ``fit`` raise ``ValueError``, explicit valid values must
    be stored as given in the fitted ``*_prior_`` attributes, and the default
    ``covariance_prior_`` must match the empirical covariance of the data.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)

    # degrees_of_freedom_prior equal to n_features - 1 is rejected by fit.
    invalid_dof = n_features - 1.0
    model = BayesianGaussianMixture(
        degrees_of_freedom_prior=invalid_dof, random_state=rng
    )
    expected_msg = (
        "The parameter 'degrees_of_freedom_prior' should be greater than"
        f" {n_features - 1}, but got {invalid_dof:.3f}."
    )
    with pytest.raises(ValueError, match=expected_msg):
        model.fit(X)

    # An explicit valid degrees_of_freedom_prior is kept as given.
    valid_dof = rng.rand() + n_features - 1.0
    model = BayesianGaussianMixture(
        degrees_of_freedom_prior=valid_dof, random_state=rng
    ).fit(X)
    assert_almost_equal(valid_dof, model.degrees_of_freedom_prior_)

    # The value n_features (treated here as the default) is kept as given.
    default_dof = n_features
    model = BayesianGaussianMixture(
        degrees_of_freedom_prior=default_dof, random_state=rng
    ).fit(X)
    assert_almost_equal(default_dof, model.degrees_of_freedom_prior_)

    # An explicit covariance_prior is kept as given for every covariance type.
    # The dict is built in the order full/tied/diag/spherical so that the
    # shared RNG is consumed exactly once, for the spherical entry.
    explicit_priors = {
        "full": np.cov(X.T, bias=1) + 10,
        "tied": np.cov(X.T, bias=1) + 5,
        "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3,
        "spherical": rng.rand(),
    }
    model = BayesianGaussianMixture(random_state=rng)
    for cov_type, prior in explicit_priors.items():
        model.covariance_type = cov_type
        model.covariance_prior = prior
        model.fit(X)
        assert_almost_equal(prior, model.covariance_prior_)

    # A negative spherical covariance_prior is rejected by fit.
    invalid_cov_prior = -1.0
    model = BayesianGaussianMixture(
        covariance_type="spherical",
        covariance_prior=invalid_cov_prior,
        random_state=rng,
    )
    expected_msg = (
        "The parameter 'spherical covariance_prior' "
        f"should be greater than 0., but got {invalid_cov_prior:.3f}."
    )
    with pytest.raises(ValueError, match=expected_msg):
        model.fit(X)

    # Without an explicit covariance_prior the fitted prior equals the
    # empirical (co)variance of X in the shape matching the covariance type.
    default_priors = {
        "full": np.atleast_2d(np.cov(X.T)),
        "tied": np.atleast_2d(np.cov(X.T)),
        "diag": np.var(X, axis=0, ddof=1),
        "spherical": np.var(X, axis=0, ddof=1).mean(),
    }
    model = BayesianGaussianMixture(random_state=0)
    for cov_type, expected_prior in default_priors.items():
        model.covariance_type = cov_type
        model.fit(X)
        assert_almost_equal(expected_prior, model.covariance_prior_)
color=clrs[i], alpha=0.5, clip_box=ax.bbox) ax.add_artist(e) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title('GMM', fontsize=15) plt.grid(b=True, ls=':', color='#606060') # DPGMM dpgmm = BayesianGaussianMixture( n_components=n_components, covariance_type='full', max_iter=1000, n_init=5, weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1) dpgmm.fit(x) centers = dpgmm.means_ covs = dpgmm.covariances_ print u'DPGMM均值 = \n', centers print u'DPGMM方差 = \n', covs y_hat = dpgmm.predict(x) print y_hat ax = plt.subplot(212) grid_hat = dpgmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
def test_compare_covar_type():
    """Compare the initial covariances obtained for each covariance type.

    After parameter initialisation alone (no further fitting), the covariances
    scaled by the degrees of freedom of the 'tied', 'diag' and 'spherical'
    models must be the corresponding reductions of the 'full' ones.
    """
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    X = rand_data.X["full"]
    n_components = rand_data.n_components

    def initialised_model(prior_type, covariance_type):
        # Build a model and run only its parameter initialisation, always
        # with a fresh RandomState(0) so every covariance type starts from
        # the same initial state.
        model = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type=covariance_type,
            max_iter=1,
            random_state=0,
            tol=1e-7,
        )
        model._check_initial_parameters(X)
        model._initialize_parameters(X, np.random.RandomState(0))
        return model

    for prior_type in PRIOR_TYPE:
        # Reference: full covariance matrices, one per component.
        full = initialised_model(prior_type, "full")
        full_covariances = (
            full.covariances_
            * full.degrees_of_freedom_[:, np.newaxis, np.newaxis]
        )

        # tied covariance == mean of the full covariances over components
        tied = initialised_model(prior_type, "tied")
        tied_covariance = tied.covariances_ * tied.degrees_of_freedom_
        assert_almost_equal(tied_covariance, np.mean(full_covariances, 0))

        # diag covariances == diagonals of the full covariances
        diag = initialised_model(prior_type, "diag")
        diag_covariances = (
            diag.covariances_ * diag.degrees_of_freedom_[:, np.newaxis]
        )
        expected_diagonals = np.array(
            [np.diag(cov) for cov in full_covariances]
        )
        assert_almost_equal(diag_covariances, expected_diagonals)

        # spherical covariances == per-component mean of the diag covariances
        spherical = initialised_model(prior_type, "spherical")
        spherical_covariances = (
            spherical.covariances_ * spherical.degrees_of_freedom_
        )
        assert_almost_equal(
            spherical_covariances, np.mean(diag_covariances, 1)
        )
['#0072B2', '#F0E442', '#D55E00', '#EE82EE', '#A0522D', '#2E8B57']) covars = np.array([[[.1, .02], [.02, .15]], [[.3, -.01], [-.01, .3]], [[.7, .4], [.3, .6]], [[.3, .03], [.09, .3]], [[.6, -.07], [-.05, .6]], [[.6, .13], [.12, .86]]]) samples = np.array([300, 500, 400, 400, 400, 300]) means = np.array([[.8, -2.0], [-2.5, -.05], [-2, 2.0], [1.2, 2.5], [2, 0.7], [-1, -2.0]]) # mean_precision_prior= 0.8 to minimize the influence of the prior estimators = [("Variational Inference in Finite mixture with Dirichlet Prior ", BayesianGaussianMixture( weight_concentration_prior_type="dirichlet_distribution", n_components=3 * n_components, reg_covar=0, init_params='random', max_iter=5, mean_precision_prior=.8, random_state=random_state, tol=1e-5), [1])] #Discard small proportions SMALL_PROBS = 0 # Generate data rng = np.random.RandomState(random_state) X = np.vstack([ rng.multivariate_normal(means[j], covars[j], samples[j]) for j in range(n_components) ]) y = np.concatenate(
def execute(self, namespace):
    """Label 3D points with a Gaussian mixture model and store the result.

    Reads the point source ``namespace[self.input_points]`` (columns 'x',
    'y' and 'z'), fits a mixture according to ``self.mode`` and writes a
    mapping filter to ``namespace[self.output_labeled]`` carrying three extra
    columns: ``self.label_key`` (1-based component label, 0 everywhere if the
    fit did not converge), ``self.label_key + '_log_prob'`` (per-point log
    probability) and ``self.label_key + '_avg_log_prob'`` (mean log
    probability of the points sharing a label).

    Modes:
        'n'        -- GaussianMixture with exactly ``self.n`` components.
        'bic'      -- GaussianMixture with 1..``self.n`` components; the model
                      with the lowest BIC is used.
        'bayesian' -- BayesianGaussianMixture with ``self.n`` components.

    Raises:
        ValueError: if ``self.mode`` is not one of the modes above, or if
            'bic' mode is requested with ``self.n`` < 1.
    """
    from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
    from PYME.IO import MetaDataHandler

    points = namespace[self.input_points]
    X = np.stack([points['x'], points['y'], points['z']], axis=1)

    # Settings shared by every mixture built below.
    mixture_kwargs = dict(covariance_type=self.covariance,
                          max_iter=self.max_iter,
                          init_params=self.init_params)

    if self.mode == 'n':
        model = GaussianMixture(n_components=self.n, **mixture_kwargs)
        predictions = model.fit_predict(X) + 1  # PYME labeling scheme
    elif self.mode == 'bic':
        model, best_bic = None, np.inf
        for n_components in range(1, self.n + 1):
            candidate = GaussianMixture(n_components=n_components,
                                        **mixture_kwargs)
            candidate.fit(X)
            bic = candidate.bic(X)
            logger.debug('%d BIC: %f' % (n_components, bic))
            if model is None or bic < best_bic:
                model, best_bic = candidate, bic
        if model is None:
            raise ValueError("'bic' mode requires n >= 1, got %r" % (self.n,))

        best = model.n_components
        if best == self.n or (self.n > 10 and best > 0.9 * self.n):
            logger.warning(
                'BIC optimization selected n components near n max')

        # Reuse the fitted model that achieved the minimal BIC. Refitting a
        # new, unseeded model could converge to a different solution than the
        # one the BIC comparison was based on.
        predictions = model.predict(X) + 1  # PYME labeling scheme
    elif self.mode == 'bayesian':
        model = BayesianGaussianMixture(n_components=self.n, **mixture_kwargs)
        predictions = model.fit_predict(X) + 1  # PYME labeling scheme
    else:
        raise ValueError('unsupported mode: %r' % (self.mode,))

    log_prob = model.score_samples(X)
    if not model.converged_:
        # Mark every point as unlabeled with zero probability.
        logger.error('GMM fitting did not converge')
        predictions = np.zeros(len(points), int)
        log_prob = -np.inf * np.ones(len(points))

    out = tabular.MappingFilter(points)
    try:
        out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
    except AttributeError:
        # The input carries no metadata handler; nothing to copy.
        pass

    out.addColumn(self.label_key, predictions)
    out.addColumn(self.label_key + '_log_prob', log_prob)

    avg_log_prob = np.empty_like(log_prob)
    for label in np.unique(predictions):
        mask = label == predictions
        avg_log_prob[mask] = np.mean(log_prob[mask])
    out.addColumn(self.label_key + '_avg_log_prob', avg_log_prob)

    namespace[self.output_labeled] = out
batch_size=batch_size, shuffle=True) model = VAE(n_genes, latent_dim=args.latent_dim).to(device) fit(train_loader, model, args.epochs, n_genes) #%% Visualization ------------------------------ from sklearn.manifold import TSNE from sklearn.mixture import BayesianGaussianMixture import matplotlib.pyplot as plt #from sklearn.cluster import KMeans params = {'edgecolor': 'white'} clustering = BayesianGaussianMixture( n_components=args.nclusters, covariance_type='diag', max_iter=1000, weight_concentration_prior_type='dirichlet_process') dimred = TSNE(n_components=2) fig2, ax2 = plt.subplots(1, 1) cmap = iter([plt.cm.tab20(x) for x in range(0, 20)]) with torch.no_grad(): diter = iter(train_loader) y, lab = diter.next() mu, lvar = model.encode(y.view(-1, n_genes)) y2 = model.reparam(mu, lvar) clustering.fit(y2.numpy()) idx = clustering.fit_predict(y2.numpy()) # idx = km.fit_predict(y2.numpy())
def train(data: np.ndarray,
          obs_len: int,
          filter_name: str,
          model_dir: str,
          result_dir: str,
          save_model: bool = True) -> None:
    """Cluster ``data`` with a 3-component Bayesian Gaussian mixture.

    Fits the mixture, prints the fitted parameters and the number of samples
    per cluster, and writes two files to ``result_dir``: a CSV with the input
    features plus the predicted cluster ('driving_style') and a JSON file
    with the fitted means, covariances and weights. When ``save_model`` is
    true the fitted estimator is also pickled into ``model_dir``.

    Args:
        data: feature matrix; the CSV header names four feature columns
            (mean velocity, mean acceleration, mean deceleration, std of
            lateral jerk), so four columns are expected.
        obs_len: value embedded in the output file names.
        filter_name: value embedded in the output file names.
        model_dir: directory receiving the pickled model.
        result_dir: directory receiving the CSV and JSON results.
        save_model: whether to pickle the fitted estimator.
    """
    print('[Bayesian Gaussian Mixture Clustering][train] creating model...')
    bgm = BayesianGaussianMixture(
        n_components=3,
        covariance_type="full",
        max_iter=1000,
        tol=1e-5,
        n_init=10,
        random_state=7,
        weight_concentration_prior_type='dirichlet_process',
        init_params="kmeans",
    )

    print('[Bayesian Gaussian Mixture Clustering][train] training...')
    _y = bgm.fit_predict(X=data)
    # Make the labels a column so they can be appended to ``data`` below.
    _y = np.expand_dims(_y, axis=1)
    print(f'[Bayesian Gaussian Mixture Clustering][train] converged?:{bgm.converged_}')

    print('[Bayesian Gaussian Mixture Clustering][train] params (center and covariance):')
    # enumerate() reports every fitted component instead of assuming three.
    components = zip(bgm.means_, bgm.covariances_, bgm.weights_)
    for i, (m, c, w) in enumerate(components, start=1):
        print(f'\tc_{i}-> mean: {m}')
        print(f'\t\tcov: {c}')
        print(f'\t\tweight: {w}')

    print('[Bayesian Gaussian Mixture Clustering][train] results:')
    cluster_ids, cluster_sizes = np.unique(_y, return_counts=True)
    for i, c in zip(cluster_ids, cluster_sizes):
        print(f'\tc_{i}: {c}')

    if save_model:
        model_file = f'bgm_{obs_len}s_{filter_name}.pkl'
        print(f'[Bayesian Gaussian Mixture Clustering][train] saving model ({model_file})...')
        with open(os.path.join(model_dir, model_file), 'wb') as f:
            pickle.dump(bgm, f)

    result_file = f'results_bgm_train_{obs_len}s_{filter_name}.csv'
    print(f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
    labels = ['mean_velocity',
              'mean_acceleration',
              'mean_deceleration',
              'std_lateral_jerk',
              'driving_style']
    result = np.concatenate((data, _y), axis=1)
    df = pd.DataFrame(data=result, columns=labels)
    df.to_csv(os.path.join(result_dir, result_file))

    # Same base name as the CSV, with 'params' and a .json extension.
    result_file = result_file.replace('results', 'params').replace('csv', 'json')
    print(f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
    _d = {
        'means': bgm.means_.tolist(),
        'covariances': bgm.covariances_.tolist(),
        'weights': bgm.weights_.tolist(),
    }
    with open(os.path.join(result_dir, result_file), 'w') as f:
        json.dump(_d, f)
expected_mean = X[:, :, 1].mean(axis=0) expected_std = X[:, :, 1].std(axis=0) n_demonstrations, n_steps, n_task_dims = X.shape X_train = np.empty((n_demonstrations, n_steps, n_task_dims + 1)) X_train[:, :, 1:] = X t = np.linspace(0, 1, n_steps) X_train[:, :, 0] = t X_train = X_train.reshape(n_demonstrations * n_steps, n_task_dims + 1) random_state = check_random_state(0) n_components = 4 initial_means = kmeansplusplus_initialization(X_train, n_components, random_state) initial_covs = covariance_initialization(X_train, n_components) bgmm = BayesianGaussianMixture(n_components=n_components, max_iter=100).fit(X_train) gmm = GMM(n_components=n_components, priors=bgmm.weights_, means=bgmm.means_, covariances=bgmm.covariances_, random_state=random_state) plt.figure(figsize=(10, 5)) plt.subplot(121) plt.title("Model and data adaptation") plt.plot(X[:, :, 0].T, X[:, :, 1].T, c="k", alpha=0.1) means_over_time = [] y_stds = [] for step in t:
def main(data_directory_path, init_file):
    """Run the testing pipeline on the raw recordings of a directory.

    Steps: read the parameters and the best setting from 'settings.ini',
    load the raw data, cut equally spaced windows, compute time and/or
    frequency domain features, optionally reduce them to two dimensions,
    cluster each feature set, dump the raws, windows and predictions to
    pickle files and add the predicted events to the raw objects.

    Args:
        data_directory_path: directory holding the raw recordings.
        init_file: configuration file name; currently only printed.

    Raises:
        ValueError: if the configured features domain or classifier type is
            not supported.
    """
    print('DATA_DIRECTORY:{}'.format(data_directory_path))
    print('CONFIGURATION_FILE: {}'.format(init_file))

    # settings recovering via settings.ini file
    print('Parameters recovering..')
    config = ConfigParser()
    # NOTE(review): ``init_file`` is echoed above but the settings are always
    # read from 'settings.ini' -- confirm whether it should be read instead.
    config.read('settings.ini')

    # features domain
    fdom = config.get('section_b', 'fdom')
    sampling_freq = config.getfloat('section_b', 'sampling_freq')
    # epoch half size as int
    epk_half_sizei = config.getint('section_a', 'epk_half_size')
    # frequencies banks
    # SECURITY NOTE(review): eval() executes whatever expression the settings
    # file contains. If the value is always a plain literal, consider
    # ast.literal_eval; left unchanged here to keep accepted inputs the same.
    frequency_bands = eval(config.get('section_a', 'frequency_bands'))

    # best setting recovering
    best_setting = config.get('section_c', 'best_setting').split(',')
    if best_setting[0] == 'None':
        print('please run training_pipeline script before testing!!')
        return

    fdom = best_setting[0]        # features domain
    redux_proc = best_setting[1]  # reduction procedure
    clf_type = best_setting[2]    # classifier

    # Raw data recovering
    print('Data loading..')
    r_data = load_raws_within_dir(data_directory_path)

    # BUILDING ARTIFICIAL EQUALLY SPACED WINDOWS OVER PSEUDO EVENTS
    half_window = epk_half_sizei / sampling_freq
    windows = [windower(raw, 0, -half_window, half_window) for raw in r_data]

    # FEATURES COMPUTATION
    if fdom not in ('time', 'frequency', 'time_frequency'):
        raise ValueError('unsupported features domain: {!r}'.format(fdom))

    features_set = None
    if fdom in ('time', 'time_frequency'):
        print('######################## Time Domain Features - computations -')
        tdf = extract_td_features_from_epks(windows)
        # data formatting/reshaping
        rtdf = reshape_numpy(tdf)
        # standardization
        features_set = [standardize_data(data) for data in rtdf]

    if fdom in ('frequency', 'time_frequency'):
        # frequency domain coefficients computing
        print(
            '########################Frequency domain coefficients computation..'
        )
        print(type(frequency_bands))
        fd_coeffs = band_filter(windows, frequency_bands)
        print(
            '######################## Frequency Domain Features - computations -'
        )
        fdf = [svm_features(dec) for dec in fd_coeffs]
        # data formatting (reshaping)
        rfdf = reshape_numpy(fdf)
        # standardization
        features_set = [standardize_data(data) for data in rfdf]

    if fdom == 'time_frequency':
        # time and frequency domain features concatenation
        rtfdf = []
        for tf, ff in zip(rtdf, rfdf):
            print(tf.shape, ff.shape)
            rtfdf.append(np.concatenate((tf, ff), axis=1))
        # standardization
        features_set = [standardize_data(features) for features in rtfdf]

    # DIMENSION REDUCTION
    redux_set = []
    for features in features_set:
        if redux_proc == 'pca':
            redux_set.append(pca(features, 2))
        elif redux_proc == 'ica':
            redux_set.append(ica(features, 2))
        else:
            # no reduction -> ident
            redux_set.append(ident(features))

    # CLASSIFICATION
    n_classes = 2
    if clf_type == 'kmeans':
        clf = KMeans(n_clusters=n_classes)
    elif clf_type == 'hc':
        # hierarchical clustering
        clf = AgglomerativeClustering(n_clusters=n_classes,
                                      affinity='euclidean',
                                      linkage='ward')
    elif clf_type == 'if':
        # isolation forest
        clf = IsolationForest()
    elif clf_type == 'em':
        # n_components shall be chosen via bic criterion
        # cv_type: full(default)/spherical/tied/diag
        clf = GaussianMixture(n_components=n_classes,
                              covariance_type='full')
    elif clf_type == 'ap':
        # affinity propagation; convergence issues might need tuning
        clf = AffinityPropagation(random_state=5, max_iter=1000)
    elif clf_type == 'bgm':
        # BayesianGaussianMixture
        clf = BayesianGaussianMixture(n_components=n_classes, max_iter=200)
    else:
        raise ValueError('unsupported clf_type: {!r}'.format(clf_type))

    # PREDICTION
    # fit_predict is used because AgglomerativeClustering provides no
    # separate predict(); every estimator selected above implements it.
    predicted = [clf.fit_predict(features[0]) for features in redux_set]

    # RAW OBJECT: EVENT ADDITION
    pkldump(r_data, 'r_data.txt')
    pkldump(windows, 'windows.txt')
    pkldump(predicted, 'predidcted.txt')
    add_events_to_raws(predicted, windows, r_data)
def get(self, request, name):
    """Return scatter-plot data for the students of one of the teacher's classes.

    Each student that has at least one learner is represented by the vector
    of its learners' ``mu`` values; the vectors are embedded in 2D with t-SNE
    and returned as two datasets split on ``userType`` (0: 'Blind students',
    1: 'Partially blind students').

    Returns:
        401 if the requesting user is not a teacher or ``name`` is
        'undefined'; 500 with the error text if anything fails while
        building the data; otherwise a ``Response`` with the datasets.
    """
    if not request.user.info.is_teacher or name == 'undefined':
        return HttpResponse('Unauthorized', status=401)
    try:
        user = User.objects.get(username=request.user)
        classroom = Classes.objects.get(name=name, creator=user)
        users = [x.info for x in classroom.students.all()]
        learners = [Serial.parm_to_skill(info.params[0]) for info in users]

        # Keep each skill vector paired with its student. Students without
        # learners are skipped in BOTH lists so that embedding rows and
        # user metadata stay aligned when zipped below.
        plotted_users = []
        reprs = []
        for info, skill in zip(users, learners):
            rep = [skill.learners[j].mu for j in skill.learners]
            if rep:
                plotted_users.append(info)
                reprs.append(rep)
        reprs = np.array(reprs)

        tsne = TSNE(n_components=2)
        tsne = tsne.fit(reprs)
        repX = tsne.embedding_[:, 0].tolist()
        repY = tsne.embedding_[:, 1].tolist()

        # NOTE(review): the cluster labels are computed but not part of the
        # response; kept so failures of the fit still surface as before.
        GM = BayesianGaussianMixture(n_components=2, max_iter=200)
        GM = GM.fit(reprs)
        clusters = GM.predict(reprs).tolist()

        usernames = [x.user.username for x in plotted_users]
        types = [x.userType for x in plotted_users]
        usersinfo = [{
            'x': x,
            'y': y,
            'type': user_type,
            'r': 10,
            'user': username
        } for x, y, user_type, username in zip(repX, repY, types, usernames)]

        formatted = [{
            'label': 'Blind students',
            'data': [x for x in usersinfo if x['type'] == 0],
            'backgroundColor': '#EA9AAD85'
        }, {
            'label': 'Partially blind students',
            'data': [x for x in usersinfo if x['type'] == 1],
            'backgroundColor': '#7EB7DF75'
        }]
        return Response(formatted)
    except Exception as e:
        # View boundary: report any failure to the client as a 500.
        print(e)
        return HttpResponse(e, status=500)
# Tuple of [position, rgb string] pairs with positions running from 0 to 1
# (presumably the colour scale used by the plot -- confirm against create_plot).
scl = (
    [0, "rgb(150,0,90)"],
    [0.125, "rgb(0, 0, 200)"],
    [0.25, "rgb(0, 25, 255)"],
    [0.375, "rgb(0, 152, 255)"],
    [0.5, "rgb(44, 255, 150)"],
    [0.625, "rgb(151, 255, 0)"],
    [0.75, "rgb(255, 234, 0)"],
    [0.875, "rgb(255, 111, 0)"],
    [1, "rgb(255, 0, 0)"],
)

if __name__ == '__main__':
    # Cluster the neighbourhood of one site and plot the resulting labels.
    scoords = SitesCoords()

    sites_i = 31265
    sites_f = 12100
    nc = 50
    mutual = True

    lsites = scoords.get_direct_neighbors(sites_i, 0.35)
    # lsites = range(sites_i, sites_f)

    # Pairwise distances between the per-site clusterings, then a
    # low-dimensional embedding of that distance matrix.
    lclust = compute_clusterings(lsites, nc, mutual=mutual)
    mdist = compute_distance_matrix(lclust, mutual=mutual)
    #plot_md_scaling(mdist)
    tdata = md_scaling(mdist)

    #cs = adjust_nc(tdata)
    #kmeans = KMeans(n_clusters=cs)
    #labels = kmeans.fit_predict(tdata)
    gmm = BayesianGaussianMixture(
        n_components=10,
        covariance_type='full',
        max_iter=1000,
        n_init=10,
        tol=0.00001,
    )
    labels = gmm.fit_predict(tdata)

    create_plot(data_plot(lsites, labels), str(sites_i))
# Parameters of the dataset
random_state = 2
n_components = 3
n_features = 2
colors = np.array(['#0072B2', '#F0E442', '#D55E00'])

# One 2x2 diagonal covariance, sample count and mean per generated component.
covars = np.array([
    [[.7, .0], [.0, .1]],
    [[.5, .0], [.0, .1]],
    [[.5, .0], [.0, .1]],
])
samples = np.array([200, 500, 200])
means = np.array([
    [.0, -.70],
    [.0, .0],
    [.0, .70],
])

# Each entry is (plot title prefix, estimator, concentration prior values).
# The two estimators differ only in the weight concentration prior type.
# mean_precision_prior= 0.8 to minimize the influence of the prior
estimators = [
    (
        title,
        BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            reg_covar=0,
            init_params='random',
            max_iter=1500,
            mean_precision_prior=.8,
            random_state=random_state,
        ),
        concentration_priors,
    )
    for title, prior_type, concentration_priors in (
        (
            "Finite mixture with a Dirichlet distribution\nprior and "
            r"$\gamma_0=$",
            "dirichlet_distribution",
            [0.001, 1, 1000],
        ),
        (
            "Infinite mixture with a Dirichlet process\n prior and"
            r"$\gamma_0=$",
            "dirichlet_process",
            [1, 1000, 100000],
        ),
    )
]

# Generate data
def km_em(x_train_scaled, dataset_name="", true_vals=y_train, reg_covar=1e-01):
    """Compare k-means and a Bayesian Gaussian mixture for k = 2..22 clusters.

    For every cluster count both models are fitted on ``x_train_scaled`` and
    scored against ``true_vals`` (homogeneity, completeness, adjusted mutual
    information, adjusted Rand index) plus the silhouette score and the
    fitting time. The curves are plotted, saved as PNG files prefixed with
    ``dataset_name`` and shown.

    Args:
        x_train_scaled: feature matrix to cluster.
        dataset_name: label used in plot titles and output file names.
        true_vals: ground-truth labels for the external scores.
        reg_covar: ``reg_covar`` passed to ``BayesianGaussianMixture``.

    Returns:
        dict of lists, one value per cluster count, keyed by 'sil',
        'kmeans_times', 'em_times', 'homogeneity', 'completeness',
        'mutual_info', 'adj_rand_score' and the '*_em' counterparts of the
        last four.
    """
    n = 22
    # Single source of truth for the evaluated cluster counts. It is used as
    # the x axis of EVERY plot so the axis matches the data (the score plots
    # previously used range(1, n), i.e. shifted by one cluster).
    cluster_counts = list(range(2, n + 1))
    name = str(dataset_name)

    distortions = []
    homogeneity = []
    completeness = []
    mutual_info = []
    adj_rand_score = []
    sil = []
    kmeans_times = []

    homogeneity_em = []
    completeness_em = []
    mutual_info_em = []
    adj_rand_score_em = []
    sil_em = []
    em_times = []
    em_likelihood = []

    for i in cluster_counts:
        # k-means
        start_time = time.time()
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(x_train_scaled)
        distortions.append(kmeans.inertia_)
        y_pred = kmeans.predict(x_train_scaled)
        kmeans_times.append(time.time() - start_time)

        labels = y_pred.tolist()
        homogeneity.append(homogeneity_score(true_vals, labels))
        completeness.append(completeness_score(true_vals, labels))
        mutual_info.append(adjusted_mutual_info_score(true_vals, labels))
        adj_rand_score.append(adjusted_rand_score(true_vals, labels))
        sil.append(silhouette_score(x_train_scaled, kmeans.labels_,
                                    metric='euclidean'))

        # Bayesian Gaussian mixture ("EM")
        start_time = time.time()
        gm = BayesianGaussianMixture(n_components=i,
                                     random_state=random_state,
                                     reg_covar=reg_covar)
        y_pred = gm.fit_predict(x_train_scaled)
        em_times.append(time.time() - start_time)

        labels = y_pred.tolist()
        homogeneity_em.append(homogeneity_score(true_vals, labels))
        completeness_em.append(completeness_score(true_vals, labels))
        mutual_info_em.append(adjusted_mutual_info_score(true_vals, labels))
        adj_rand_score_em.append(adjusted_rand_score(true_vals, labels))
        if len(set(y_pred)) > 1:
            sil_em.append(silhouette_score(x_train_scaled, y_pred,
                                           metric='euclidean'))
        else:
            # The silhouette score needs at least two distinct labels; the
            # mixture may collapse to one component, recorded here as 1.
            sil_em.append(1)
        em_likelihood.append(gm.score(x_train_scaled))

    # Single-curve plots: (values, title, y label, output file suffix)
    curve_plots = [
        (distortions, "K-means Elbow (" + name + ")",
         'Sum of Squared Distances', ' km elbow.png'),
        (sil, 'K-means Silhouette Scores (' + name + ')',
         'Silhouette Score', ' km silho.png'),
        (em_likelihood, 'EM likelihood (' + name + ')',
         'Likelihood', ' em likelihood.png'),
        (sil_em, 'EM Silhouette Scores (' + name + ')',
         'Silhouette Score', ' em silho.png'),
    ]
    for values, title, y_label, suffix in curve_plots:
        plt.plot(cluster_counts, values, marker='o')
        plt.title(title)
        plt.xlabel('Number of clusters')
        plt.ylabel(y_label)
        plt.savefig(name + suffix)
        plt.show()
    plt.close()

    # External-score plots, one figure per model.
    score_plots = [
        ("Performance Evaluation k-means (" + name + ")", ' km perfo.png',
         (homogeneity, completeness, mutual_info, adj_rand_score)),
        ("Performance Evaluation EM (" + name + ")", ' em perfo.png',
         (homogeneity_em, completeness_em, mutual_info_em,
          adj_rand_score_em)),
    ]
    score_styles = [
        ("blue", 'Homogeneity'),
        ("orange", 'Completeness'),
        ("red", 'Adgusted Mutual Info'),
        ("green", 'Adjusted random index'),
    ]
    for title, suffix, score_lists in score_plots:
        for values, (color, label) in zip(score_lists, score_styles):
            plot_data(cluster_counts, values, title=title,
                      x_label="Number of Clusters", y_label="Score",
                      color=color, label=label)
        plt.savefig(name + suffix)
        plt.show()
        plt.close()

    # Running time of both models.
    time_title = "k-means/EM Running Time (" + name + ")"
    plot_data(cluster_counts, kmeans_times, title=time_title,
              x_label="Number of Clusters", y_label="Time",
              color="red", label='k-means')
    plot_data(cluster_counts, em_times, title=time_title,
              x_label="Number of Clusters", y_label="Time",
              color="blue", label='EM')
    plt.savefig(name + ' km-em time.png')
    plt.show()

    print('kmeans_times')
    print(kmeans_times)
    print('em_times')
    print(em_times)

    return {
        'sil': sil,
        'kmeans_times': kmeans_times,
        'em_times': em_times,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'mutual_info': mutual_info,
        'adj_rand_score': adj_rand_score,
        'homogeneity_em': homogeneity_em,
        'completeness_em': completeness_em,
        'mutual_info_em': mutual_info_em,
        'adj_rand_score_em': adj_rand_score_em,
    }
def adtk_init(model, colname=None):
    """Return the detector instance selected by the short name ``model``.

    Every branch imports only the classes it needs, so a dependency is
    loaded only when its model is requested. ``colname`` is passed as the
    ``target`` of the regression-based detectors. Unknown names fall back to
    ``ADTKDefault()``.
    """
    # Detectors built directly from an adtk class.
    if model == 'iqr':
        from adtk.detector import InterQuartileRangeAD
        return InterQuartileRangeAD()
    if model == 'ar':
        from adtk.detector import AutoregressionAD
        return AutoregressionAD()
    if model == 'esd':
        from adtk.detector import GeneralizedESDTestAD
        return GeneralizedESDTestAD()
    if model == 'level':
        from adtk.detector import LevelShiftAD
        return LevelShiftAD(15)
    if model == 'persist':
        from adtk.detector import PersistAD
        return PersistAD(15)
    if model == 'quantile':
        from adtk.detector import QuantileAD
        return QuantileAD()
    if model == 'seasonal':
        from adtk.detector import SeasonalAD
        return SeasonalAD()
    if model == 'volatility':
        from adtk.detector import VolatilityShiftAD
        return VolatilityShiftAD(15)

    # Clustering models wrapped in MinClusterDetector.
    if model == 'kmeans':
        from adtk.detector import MinClusterDetector
        from sklearn.cluster import KMeans
        return MinClusterDetector(KMeans(n_clusters=2))
    if model == 'birch':
        from adtk.detector import MinClusterDetector
        from sklearn.cluster import Birch
        return MinClusterDetector(Birch(threshold=0.25, branching_factor=25))
    if model == 'gmm':
        from adtk.detector import MinClusterDetector
        from sklearn.mixture import GaussianMixture
        return MinClusterDetector(GaussianMixture(n_components=2, max_iter=50))
    if model == 'vbgmm':
        from adtk.detector import MinClusterDetector
        from sklearn.mixture import BayesianGaussianMixture
        return MinClusterDetector(
            BayesianGaussianMixture(n_components=2, max_iter=50))

    # Outlier models wrapped in OutlierDetector.
    if model == 'eliptic':
        from adtk.detector import OutlierDetector
        from sklearn.covariance import EllipticEnvelope
        return OutlierDetector(EllipticEnvelope())
    if model == 'mcdad':
        from adtk.detector import OutlierDetector
        from sklearn.covariance import MinCovDet
        return OutlierDetector(MinCovDet())
    if model == 'isof':
        from adtk.detector import OutlierDetector
        from sklearn.ensemble import IsolationForest
        return OutlierDetector(IsolationForest())
    if model == 'lofad':
        from adtk.detector import OutlierDetector
        from sklearn.neighbors import LocalOutlierFactor
        return OutlierDetector(LocalOutlierFactor())

    if model == 'pcaad':
        from adtk.detector import PcaAD
        return PcaAD()

    # Regressors wrapped in RegressionAD, predicting the ``colname`` column.
    if model == 'linear':
        from adtk.detector import RegressionAD
        from sklearn.linear_model import LinearRegression
        return RegressionAD(LinearRegression(), target=colname)
    if model == 'rf':
        from adtk.detector import RegressionAD
        from sklearn.ensemble import RandomForestRegressor
        return RegressionAD(RandomForestRegressor(), target=colname)
    if model == 'huber':
        from adtk.detector import RegressionAD
        from sklearn.linear_model import HuberRegressor
        return RegressionAD(HuberRegressor(), target=colname)
    if model == 'knnad':
        from adtk.detector import RegressionAD
        from sklearn.neighbors import KNeighborsRegressor
        return RegressionAD(KNeighborsRegressor(), target=colname)
    if model == 'kernridge':
        from adtk.detector import RegressionAD
        from sklearn.kernel_ridge import KernelRidge
        return RegressionAD(KernelRidge(), target=colname)

    return ADTKDefault()
def train(loader, epoch, model_list, method='ocsvm'):
    """Fit one anomaly-detection model per class and derive its threshold.

    A score above the threshold means the sample is considered normal.

    Args:
        loader: data source handed to ``get_features``.
        epoch: current training round, forwarded to the 'ocnn' helpers.
        model_list: ``(models, optimizers)`` from the previous round; only
            relevant for models trained over several rounds (e.g. 'ocnn').
        method: name of the anomaly-detection method to use.

    Returns:
        ``((models, optimizers), thresholds)`` where ``models`` holds the
        models fitted in this call and ``optimizers`` is the list received
        in ``model_list``.
    """
    features, labels = get_features(loader)
    threshold_list = []
    update_models = []
    # NOTE(review): filled below but never returned -- the incoming
    # ``optimizers`` list is returned instead. Confirm this is intended.
    update_optimizer = []
    clf_list, optimizers = model_list

    for label in range(args.class_num):
        # Training data belonging to the current class.
        class_rows = np.where(labels == label)[0]
        class_data = features[class_rows]
        optimizer = optimizers[label]

        # Pick the model; for 'sp' no model is created.
        if method == 'ocsvm':
            clf = OneClassSVM()
        elif method == 'isofore':
            clf = IsolationForest()
        elif method == 'gmm':
            clf = BayesianGaussianMixture()
        elif method == 'svdd':
            clf = SVDD(parameters)
        elif method == 'lof':
            clf = LocalOutlierFactor(novelty=True,
                                     n_neighbors=int(class_data.size * 0.1))
        elif method == 'cnn':
            clf = ''
        elif method != 'sp':
            # Any other method continues from the previous round's model.
            clf = clf_list[label]

        # Train the anomaly-detection model and score its training data.
        if method == 'ocnn':
            clf, optimizer = fit(clf, class_data, optimizer, epoch)
            scores = score_samples(clf, class_data, epoch)
        elif method == 'lof':
            clf.fit(class_data)
            scores = clf.decision_function(class_data)
        elif method not in ('sp', 'cnn'):
            clf.fit(class_data)
            scores = clf.score_samples(class_data)

        # Compute the threshold of the anomaly-detection model.
        if method == 'sp':
            from cnn import get_c_v
            threshold_list = get_c_v(p_s=features, labels=labels)
        elif method == 'cnn':
            threshold_list = ''
        else:
            if method == 'gmm':
                # 'gmm' uses the plain mean of the training scores.
                threshold = np.mean(scores)
            else:
                threshold = np.mean(scores) - \
                    args.threshold_std_times * np.std(scores)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)

    model_list = (update_models, optimizers)
    return model_list, threshold_list
# ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
# ell.set_clip_box(splot.bbox)
# ell.set_alpha(0.5)
# splot.add_artist(ell)
#
# plt.title(title)
# plt.show()

# Try every community count in [min_community, max_community] and remember
# the one whose clustering has the highest normalized mutual information
# with the true labels.
maxscore = 0
now_c = 0
for now_community in range(min_community, max_community + 1):
    model = BayesianGaussianMixture(
        n_components=now_community,  # 4
        covariance_type="full",
        # reg_covar=0,  (removing it improved the score by 0.01)
        max_iter=expected_iter_times,  # 100
        init_params="random",
        random_state=5,
        weight_concentration_prior_type="dirichlet_distribution",
        # weight_concentration_prior=1e-2,
        # mean_precision_prior=.8,
        verbose=0,
        verbose_interval=10,
        warm_start=True,
    )  # 0.259
    model.fit(data)
    label_pred = model.predict(data)  # pre-initialising label_pred=[] is unnecessary

    # Pair each predicted label with its node number and order the pairs by
    # node number before comparing with labels_true.
    pred_community = np.column_stack((num_list, label_pred))
    pred_community = pred_community[np.argsort(pred_community[:, 0])]
    labels_pred = pred_community[:, 1]

    score = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    if score > maxscore:
        maxscore = score
        now_c = now_community