Example #1
import pandas
import cv2
from sklearn.mixture import BayesianGaussianMixture

bg_input_path = "/home/k_mathin/PycharmProjects/DataMiningClass_v4/background_removal/imgs/bg.jpeg"
foreground_input_path = "/home/k_mathin/PycharmProjects/DataMiningClass_v4/background_removal/imgs/foreground.jpeg"
bg = cv2.imread(bg_input_path, 0)  # 0 -> load as single-channel grayscale
fg = cv2.imread(foreground_input_path, 0)

bg_o_shape = bg.shape
fg_o_shape = fg.shape
k = 5
bg_new_data = bg.reshape(-1, 1)
fg_new_data = fg.reshape(-1, 1)

bg_vgmm = BayesianGaussianMixture(n_components=k)
fg_vgmm = BayesianGaussianMixture(n_components=k)

# vgmm = GaussianMixture(n_components=k)
bg_vgmm = bg_vgmm.fit(bg_new_data)
fg_vgmm = fg_vgmm.fit(fg_new_data)
bg_cluster = bg_vgmm.predict(bg_new_data)
fg_cluster = fg_vgmm.predict(fg_new_data)

# Reshape the cluster labels back to the original image shape
bg_img_cluster = bg_cluster.reshape(bg_o_shape[0], bg_o_shape[1])
fg_img_cluster = fg_cluster.reshape(fg_o_shape[0], fg_o_shape[1])
from matplotlib import pyplot
pyplot.subplot(2, 1, 1)
pyplot.imshow(bg_img_cluster)
pyplot.subplot(2, 1, 2)
pyplot.imshow(fg_img_cluster)
pyplot.show()
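The snippet ends after plotting the two cluster maps; to actually remove a background you still need to decide which cluster ids are background. A minimal sketch of that step, assuming the most frequent cluster in the foreground image is background (that heuristic is not part of the original):

import numpy as np

# Assumption: the dominant cluster in the foreground image is background.
bg_label = np.bincount(fg_cluster).argmax()
mask = (fg_img_cluster != bg_label).astype(np.uint8) * 255  # 255 = keep pixel
result = cv2.bitwise_and(fg, fg, mask=mask)
cv2.imwrite("foreground_masked.png", result)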
Example #2
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import learning_curve
from sklearn.mixture import BayesianGaussianMixture

# Report the best cross-validation score from the previous learning-curve run
# (those variables come from an earlier part of the script that is not shown).
print(train_sizes[np.argmax(test_mean)])
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

mlp_learner = MLPClassifier(hidden_layer_sizes=(100,),activation='relu',solver='sgd', learning_rate = 'adaptive', learning_rate_init = 0.07)
train_sizes, train_scores, test_scores = learning_curve(mlp_learner, x_train_scaled_km, y_train, cv=5, n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="orange", label='CV (+K-means Result)', linestyle='dashed')
plot_data(train_sizes, train_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="orange", label='Training (+K-means Result)')
print(train_sizes)
print(train_sizes[np.argmax(test_mean)])
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

gm = BayesianGaussianMixture(n_components = 14, random_state=random_state, reg_covar=1e-01)
y_pred = gm.fit_predict(x_projected_pca)
x_train_scaled_em = np.column_stack((x_train_scaled,y_pred))


mlp_learner = MLPClassifier(hidden_layer_sizes=(100,),activation='relu',solver='sgd', learning_rate = 'adaptive', learning_rate_init = 0.07)
train_sizes, train_scores, test_scores = learning_curve(mlp_learner, x_train_scaled_em, y_train, cv=5, n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="blue", label='CV (+EM Result)', linestyle='dashed')
plot_data(train_sizes, train_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="blue", label='Training (+EM Result)')
print(train_sizes)
print(train_sizes[np.argmax(test_mean)])
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])
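plot_data is a user-defined helper that does not appear in this snippet; a minimal sketch of what it presumably looks like (every detail below is an assumption):

import matplotlib.pyplot as plt

def plot_data(x, y, title="", x_label="", y_label="", color="blue",
              label="", linestyle="solid"):
    # Hypothetical reconstruction of the plotting helper used above.
    plt.plot(x, y, color=color, label=label, linestyle=linestyle)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()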
Example #3
def bin_test(X_train, y_train, X_test, y_test):
    starttime = time.time()

    # binary models
    models = [
        "BNB", "GNB", "LDA", "SVM_G", "5NN", "LR2", "P2", "SGD", "ADA", "DT",
        "RF", "DPGMM", "ET", "GMM", "MLP"
    ]  #"SVM_L", "SVM_G", "P2", "DT",  "ADA_R",
    clfs = [
        BernoulliNB(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        svm.SVC(kernel='rbf', probability=True),
        neighbors.KNeighborsClassifier(n_neighbors=5),
        LogisticRegression(),
        Perceptron(penalty='l2', tol=None, max_iter=1000),
        SGDClassifier(tol=0.0001, power_t=0.4, average=True),
        AdaBoostClassifier(base_estimator=None, n_estimators=100),
        DecisionTreeClassifier(),
        RandomForestClassifier(oob_score=True),
        # Note: BayesianGaussianMixture and GaussianMixture below are
        # unsupervised; fit() ignores y and predict() returns cluster ids,
        # so their scores are only meaningful if the cluster ids happen to
        # line up with the binary labels.
        BayesianGaussianMixture(n_components=2, max_iter=1000,
                                weight_concentration_prior_type='dirichlet_process',
                                tol=0.0001),
        ExtraTreesClassifier(bootstrap=True, oob_score=True, n_estimators=4),
        GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2),
        MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000),
    ]

    results = []
    outlier_results = []

    for i in range(len(clfs)):
        print "model being tested: {0}".format(models[i])
        time_start = time.time()
        clf = clfs[i].fit(X_train, y_train)
        predict = clf.predict(X_test)
        runtime = time.time() - time_start
        p = metrics.precision_score(y_test, predict)
        r = metrics.recall_score(y_test, predict, average="macro")
        f = metrics.f1_score(y_test, predict)

        # find outliers
        data = [('TeamID', X_test['TeamID'].values), ('predicted', predict),
                ('label', y_test.values)]
        # pd.DataFrame.from_items was removed in pandas 1.0; a dict of the
        # same pairs preserves the column order on Python 3.7+.
        labels_and_predicted = pd.DataFrame(dict(data))
        outliers = X_test.merge(labels_and_predicted, on='TeamID')
        outliers = outliers[outliers['label'] != outliers['predicted']]

        num_mislabeled = outliers.shape[0]

        p_new = -1
        r_new = -1
        f_new = -1

        if (outliers['label'].unique().size > 1):
            # train separate model on outliers
            mislabeled_labels = outliers['label']
            mislabeled_samples = outliers.drop(['label', 'predicted'], axis=1)

            (train_vars, validate_vars, train_outcomes,
             validate_outcomes) = train_test_split(mislabeled_samples,
                                                   mislabeled_labels,
                                                   test_size=0.2)

            clf_new = clfs[i].fit(train_vars, train_outcomes)
            validate_predicted = clf_new.predict(validate_vars)

            # evaluate
            p_new = metrics.precision_score(validate_outcomes,
                                            validate_predicted)
            r_new = metrics.recall_score(validate_outcomes,
                                         validate_predicted,
                                         average="macro")
            f_new = metrics.f1_score(validate_outcomes, validate_predicted)
            outlier_results.append([models[i], p_new, r_new, f_new])

        results.append([models[i], p, r, f, runtime])
        # create confusion matrix
        cm = metrics.confusion_matrix(y_test, predict)
        plot_confusion(cm,
                       y_test,
                       filename='{0}_confusion.png'.format(models[i]))

    print()
    print("All data models")
    print()
    print(tabulate.tabulate(
        results,
        headers=['Classif', 'Precision', 'Recall', 'F1 Score', 'Runtime']))
    print()
    print("Outlier models")
    print()
    # Note: the outlier rows carry no runtime value, so the 'Runtime' column
    # stays empty in this table.
    print(tabulate.tabulate(
        outlier_results,
        headers=['Classif', 'Precision', 'Recall', 'F1 Score', 'Runtime']))
    print("Binary test took {0} secs".format(time.time() - starttime))
    return pd.DataFrame(data=results)
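plot_confusion is another local helper that is not shown; a minimal sketch under the assumption that it simply renders the confusion matrix to a file:

import matplotlib.pyplot as plt

def plot_confusion(cm, y_test, filename='confusion.png'):
    # Hypothetical reconstruction: draw the confusion matrix and save it.
    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap='Blues')
    fig.colorbar(im)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.savefig(filename)
    plt.close(fig)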
Example #4
# (snippet begins mid-statement; the assignment below is reconstructed)
train_index, test_index = next(
    iter(skf.split(data.values, sedclass[sedclass.columns[0]])))
X_train = data.iloc[train_index].dropna().values

y_train = sedclass.iloc[train_index].dropna().values

X_test = data.iloc[test_index].dropna().values
y_test = sedclass.iloc[test_index].dropna().values

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
estimators = dict(
    (cov_type,
     BayesianGaussianMixture(
         n_components=n_classes,
         covariance_type=cov_type,
         max_iter=100,
         random_state=0,
         weight_concentration_prior_type='dirichlet_distribution'))
    for cov_type in ['spherical', 'diag', 'tied', 'full'])
n_estimators = len(estimators)

plt.figure(figsize=(3 * n_estimators // 2, 6))
plt.subplots_adjust(bottom=.01,
                    top=0.95,
                    hspace=.15,
                    wspace=.05,
                    left=.01,
                    right=.99)

for index, (name, estimator) in enumerate(estimators.items()):
    # Since we have class labels for the training data, we can
    # initialize the mixture parameters in a supervised manner.
Example #5
def BGMreport(path,visualize=1,cut_n=6):
    t2=15
    t3=0.07
    n_components=3
    denses,_=finddensefromcut(path,cut_n)
    maxd=[]
    for dense in denses[(cut_n-5):]:
        maxd.append(max(dense))
    lofd=len(denses[0])
    samples=list()
    for i in range((cut_n-5),cut_n):#sampling for BGM
        samples.append(np.array(tosample(denses[i])).reshape(-1,1))
    allmeans=[]
    allcovs=[]
    allweights=[]
    BGM45=np.zeros((45))
    for i in range(5):
        BGM=BayesianGaussianMixture(n_components=n_components,covariance_type='spherical',weight_concentration_prior=0.000000000001,max_iter=500)
        BGM.fit(samples[i])
        means=np.reshape(BGM.means_,(-1,))
        permu=np.argsort(means)
        means=means[permu]
        BGM45[i*9+3:i*9+6]=means
        allmeans.append(means)
        covs=BGM.covariances_
        covs=covs[permu]
        BGM45[i*9+6:i*9+9]=covs
        allcovs.append(covs)
        weights=BGM.weights_
        weights=weights[permu]
        BGM45[i*9:i*9+3]=weights*len(samples[i])
        allweights.append(weights)
    if visualize==1:
        l=0
        for i in range(cut_n-5,cut_n):#visualization
            l+=1
            plt.subplot(2,n_components,l),plt.plot(denses[i])
            X=np.linspace(0,lofd,num=200,endpoint=False)
            Ys=toGM(X,n_components,allmeans[l-1],allcovs[l-1],allweights[l-1])
            for j in range(n_components):
                #plt.subplot(1,5,l),plt.plot([allmeans[l-1][j],allmeans[l-1][j]],[0,255])
                plt.subplot(2,n_components,l),plt.plot(X,len(samples[l-1])*Ys[j])
                #plt.subplot(2,n_components,l),plt.plot(X,Ys[j])
                plt.ylim(0,255)
        plt.show()
    ans=np.zeros((12,))
    pre=np.zeros((5,n_components))
    # Pre-process the data to reduce the influence of peak overlap ("far" and
    # "near" overlap): identify far/near overlap cases, then suppress
    # far-overlap peaks and amplify near-overlap peaks.
    # Ideally two far-overlapping peaks would be merged into one peak with an
    # intermediate mean, but for now both are simply suppressed, since a peak
    # the BGM splits in two is unlikely to be a monoclonal peak anyway. "Far
    # overlap" means both components describe what is visually a single peak:
    # the BGM splits it to fit the Gaussian model better, and such split
    # components usually have large covariances and are not sharp.
    for i in range(5):
        for j in range(n_components):
            for l in range(n_components):
                if j<l:
                    if allweights[i][j]/allweights[i][l]>3 or allweights[i][j]/allweights[i][l]<0.3333:#ignore when weight difference is too large
                        continue
                    # `mean` here is a two-argument helper from the
                    # surrounding module (not shown in this snippet).
                    dist = abs(allmeans[i][j] - allmeans[i][l])
                    spread = mean(np.sqrt(allcovs[i][j]), np.sqrt(allcovs[i][l]))
                    ratio_jl = allcovs[i][j] / allweights[i][j] / allcovs[i][l] * allweights[i][l]
                    ratio_lj = allcovs[i][l] / allweights[i][l] / allcovs[i][j] * allweights[i][j]
                    # If the cov difference is large, the pair is excluded from
                    # the far-overlap case, because then there really are two
                    # peaks in the original density plot.
                    if ratio_jl / dist * spread > 2 or ratio_lj / dist * spread > 2:
                        # Near overlap: a sharp peak sitting on a mild one,
                        # which happens when a monoclonal peak has a polyclonal
                        # background peak. Amplify the sharp peak's weight when
                        # the cov difference is large enough or the peaks are
                        # close enough, so it is detected as abnormal in the
                        # classification step.
                        if abs(allmeans[i][j]-allmeans[i][l])<3.5*np.sqrt(max(allcovs[i][j],allcovs[i][l])):
                            neww=allweights[i][j]+allweights[i][l]
                            if allcovs[i][l]/allweights[i][l]/allcovs[i][j]*allweights[i][j]>1 and allweights[i][j]>0.15:
                                if allcovs[i][j]<400:
                                    allweights[i][j]=neww
                            else:
                                if allcovs[i][l]<400:
                                    allweights[i][l]=neww
                        continue
                    # If one of the peaks has a very small variance, this
                    # cannot be the far-overlap case, where the original peak
                    # is mild.
                    if (allcovs[i][j] / allweights[i][j] / len(samples[i]) < t3 / 2.5
                            or allcovs[i][l] / allweights[i][l] / len(samples[i]) < t3 / 2.5):
                        continue
                    if allcovs[i][j]<70 or allcovs[i][l]<70:
                        continue
                    elif abs(allmeans[i][j] - allmeans[i][l]) < 3.5 * np.sqrt(max(allcovs[i][j], allcovs[i][l])):
                        # Far overlap: there is only one mild peak in the
                        # original density plot, which the GMM broke into two
                        # sharper peaks to fit the Gaussian curves more
                        # accurately. Suppress both peaks, so the column cannot
                        # be flagged abnormal because of these two components.
                        pre[i][j] = pre[i][l] = 1
    for i in [0,1,2]:
        for j in [3,4]:
            if maxd[i]<50 or maxd[j]<50:
                continue
            else:
                for k in range(len(allmeans[i])):
                    for l in range(len(allmeans[j])):
                        if pre[i][k]==1 or pre[j][l]==1:
                            continue
                        if abs(allmeans[i][k]-allmeans[j][l])>lofd/t2:
                            continue
                        else:
                            if allweights[i][k]<0.1 or allweights[j][l]<0.1:
                                continue
                            else:
                                # The "t figure" measures peak sharpness;
                                # variance alone is not enough, sample count
                                # and weights matter too.
                                if (allcovs[i][k] / allweights[i][k] / len(samples[i]) > t3
                                        or allcovs[j][l] / allweights[j][l] / len(samples[j]) > t3):
                                    continue
                                else:
                                    ans[i*2+j-2]=1 
                                    ans[7+i]=1
                                    ans[7+j]=1  
                                    ans[0]=1   
    for i in range(5):
        for j in range(n_components):
            if pre[i][j]==1:
                continue
            if maxd[i]<80:
                continue
            elif allweights[i][j]<0.05:
                continue
            if allcovs[i][j]/allweights[i][j]/len(samples[i])>t3:  # the "t figure" again
                continue
            else:
                ans[7+i]=1
                ans[0]=1
    return ans,BGM45
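finddensefromcut, tosample and toGM are project-specific helpers that are not shown here. From its call site, toGM presumably evaluates each fitted component's weighted density over X; a sketch under that assumption:

import numpy as np
from scipy.stats import norm

def toGM(X, n_components, means, covs, weights):
    # Hypothetical reconstruction: one weighted Gaussian density per component.
    return [weights[j] * norm.pdf(X, loc=means[j], scale=np.sqrt(covs[j]))
            for j in range(n_components)]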
Example #6
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
# GMM, DPGMM and VBGMM were deprecated in scikit-learn 0.18 and removed in
# 0.20, so this import only works on old versions (see the mapping below).
from sklearn.mixture import GMM, DPGMM, BayesianGaussianMixture, VBGMM
from sklearn.svm import NuSVC, SVC

# Useful for seeing all sklearn estimators that have a `predict_proba` attribute
from sklearn.utils import all_estimators

estimators = all_estimators()
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)

# Now pick and choose the ones you like
estimators = {
    AdaBoostClassifier(): 'AdaBoost',
    BayesianGaussianMixture(): 'BayesianGaussianMixture',
    BernoulliNB(): 'BernoulliNB',
    DPGMM(): 'DPGMM',
    ExtraTreesClassifier(): 'ExtraTreesClassifier',
    GMM(): 'GMM',
    GaussianNB(): 'GaussianNB',
    GaussianProcessClassifier(): 'GaussianProcessClassifier',
    GradientBoostingClassifier(): 'GradientBoostingClassifier',
    KNeighborsClassifier(): 'KNeighborsClassifier',
    LabelPropagation(): 'LabelPropagation',
    LabelSpreading(): 'LabelSpreading',
    LinearDiscriminantAnalysis(): 'LinearDiscriminantAnalysis',
    LogisticRegression(): 'LogisticRegression',
    MLPClassifier(): 'MLPClassifier',
    NuSVC(): 'NuSVC',
    QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis',
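On scikit-learn >= 0.20 the removed classes need replacements; a rough mapping (default hyperparameters shown, not tuned values):

from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

# GMM   -> GaussianMixture
# DPGMM -> BayesianGaussianMixture with a Dirichlet-process prior
# VBGMM -> BayesianGaussianMixture with a Dirichlet-distribution prior
gmm = GaussianMixture(n_components=2)
dpgmm = BayesianGaussianMixture(
    n_components=2, weight_concentration_prior_type='dirichlet_process')
vbgmm = BayesianGaussianMixture(
    n_components=2, weight_concentration_prior_type='dirichlet_distribution')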
Example #7
# from numpy import where
# from sklearn.datasets import make_classification
# from sklearn.cluster import Birch
# from matplotlib import pyplot
# # define the model
# model = Birch(threshold=0.01, n_clusters=10)
# # fit the model
# model.fit(X)

import pandas as pd

# training gaussian mixture model
from sklearn.mixture import BayesianGaussianMixture
n_components = 5
gmm = BayesianGaussianMixture(n_components=n_components,
                              max_iter=1000000,
                              weight_concentration_prior=1)
gmm.fit(X.toarray())

#predictions from gmm
labels = gmm.predict(X.toarray())
order_centroids = gmm.means_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; newer versions use
# get_feature_names_out().
terms = vectorizer.get_feature_names()
centroids = gmm.means_

from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(X)
centdist = 1 - cosine_similarity(X, centroids)
# frame = pd.DataFrame(X)
# frame['cluster'] = labels
# # frame.columns = ['Weight', 'cluster']
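X and vectorizer are defined elsewhere in the script; given the X.toarray() and get_feature_names() calls, X is presumably a sparse TF-IDF (or count) matrix. A minimal setup sketch with a placeholder corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first example document", "second example document"]  # placeholder corpus
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)  # sparse matrix, hence the .toarray() calls above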
Example #8
def colour(img, scale=1.0, samples=10000):
    '''Model the distribution of colours in an image.

    The method models the distribution of the values in the chroma channels of
    an image after it is converted from RGB to HSV (the code below uses the
    hue and saturation channels).  This decouples the luma (intensity) values
    from the chroma (colour) information, making it easier to visualize how
    the colours themselves appear.  The resulting visualization is the same
    size as the original image.

    Parameters
    ----------
    img : numpy.ndarray
        input image
    scale : float
        image scaling factor
    samples : int
        number of samples to draw when generating the density estimate

    Returns
    -------
    numpy.ndarray
        a new image, same dimensions as the input, visualizing the colour
        distribution

    Raises
    ------
    ValueError
        if the input image is not an RGB image
    '''
    if img.ndim != 3:
        raise ValueError('Require RGB image to compute the colour distribution.')

    img = skimage.transform.rescale(img,
                                    1.0 / scale,
                                    anti_aliasing=True,
                                    mode='constant',
                                    multichannel=True)
    img = skimage.color.rgb2hsv(img)
    height, width = img.shape[0:2]

    # Extract the colour vectors and sample from them.
    ind = generate_samples(width, height, samples)
    X = np.squeeze(img[ind[1, :], ind[0, :], 0:2])

    # Convert from polar to Cartesian coordinates (this makes the
    # visualization easier).
    mag = X[:, 1]
    ang = 2 * np.pi * X[:, 0]

    X[:, 0] = mag * np.cos(ang)
    X[:, 1] = mag * np.sin(ang)

    # Perform a density estimation using a GMM.
    gmm = BayesianGaussianMixture(
        n_components=25,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1e-3)
    gmm.fit(X)

    # Generate the output array.
    x, y = np.meshgrid(np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    X = np.c_[x.flatten(), y.flatten()]
    scores = np.exp(gmm.score_samples(X))
    max_score = np.max(scores)

    # Apply a gamma correction to make the image look a bit nicer.
    val = np.reshape(scores, (height, width)) / max_score
    val = skimage.exposure.adjust_gamma(val, gamma=0.3)

    # Convert back from HSV to RGB.  The saturation needs to be clamped so that
    # it doesn't produce invalid values during the HSV->RGB conversion.
    mag = x**2 + y**2
    sat = np.sqrt(mag)
    sat[sat > 1] = 1

    # The hue also needs to be adjusted, since atan2() returns a value between
    # -pi and pi, but the hue needs to be between 0 and 1.
    hue = np.arctan2(y, x)
    hue[hue < 0] = hue[hue < 0] + 2 * np.pi
    hue /= 2 * np.pi

    output = np.dstack((hue, sat, val))
    output = skimage.color.hsv2rgb(output)
    output = skimage.transform.rescale(output,
                                       scale,
                                       anti_aliasing=True,
                                       mode='constant',
                                       multichannel=True)

    return output
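generate_samples is a helper from the same module that is not shown; from its usage, it returns a 2 x samples array of pixel indices. A sketch under that assumption:

import numpy as np

def generate_samples(width, height, samples):
    # Hypothetical reconstruction: random pixel coordinates, shape (2, samples);
    # row 0 holds x (column) indices and row 1 holds y (row) indices.
    rng = np.random.default_rng()
    x = rng.integers(0, width, size=samples)
    y = rng.integers(0, height, size=samples)
    return np.vstack((x, y))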
Example #9
def test_bayesian_mixture_precisions_prior_initialisation():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)

    # Check raise message for a bad value of degrees_of_freedom_prior
    bad_degrees_of_freedom_prior_ = n_features - 1.
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=bad_degrees_of_freedom_prior_,
        random_state=rng)
    assert_raise_message(
        ValueError, "The parameter 'degrees_of_freedom_prior' should be "
        "greater than %d, but got %.3f." %
        (n_features - 1, bad_degrees_of_freedom_prior_), bgmm.fit, X)

    # Check correct init for a given value of degrees_of_freedom_prior
    degrees_of_freedom_prior = rng.rand() + n_features - 1.
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=degrees_of_freedom_prior,
        random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior,
                        bgmm.degrees_of_freedom_prior_)

    # Check correct init for the default value of degrees_of_freedom_prior
    degrees_of_freedom_prior_default = n_features
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=degrees_of_freedom_prior_default,
        random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior_default,
                        bgmm.degrees_of_freedom_prior_)

    # Check correct init for a given value of covariance_prior
    covariance_prior = {
        'full': np.cov(X.T, bias=1) + 10,
        'tied': np.cov(X.T, bias=1) + 5,
        'diag': np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3,
        'spherical': rng.rand()
    }

    bgmm = BayesianGaussianMixture(random_state=rng)
    for cov_type in ['full', 'tied', 'diag', 'spherical']:
        bgmm.covariance_type = cov_type
        bgmm.covariance_prior = covariance_prior[cov_type]
        bgmm.fit(X)
        assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_)

    # Check raise message for a bad spherical value of covariance_prior
    bad_covariance_prior_ = -1.
    bgmm = BayesianGaussianMixture(covariance_type='spherical',
                                   covariance_prior=bad_covariance_prior_,
                                   random_state=rng)
    assert_raise_message(
        ValueError, "The parameter 'spherical covariance_prior' "
        "should be greater than 0., but got %.3f." % bad_covariance_prior_,
        bgmm.fit, X)

    # Check correct init for the default value of covariance_prior
    covariance_prior_default = {
        'full': np.atleast_2d(np.cov(X.T)),
        'tied': np.atleast_2d(np.cov(X.T)),
        'diag': np.var(X, axis=0, ddof=1),
        'spherical': np.var(X, axis=0, ddof=1).mean()
    }

    bgmm = BayesianGaussianMixture(random_state=0)
    for cov_type in ['full', 'tied', 'diag', 'spherical']:
        bgmm.covariance_type = cov_type
        bgmm.fit(X)
        assert_almost_equal(covariance_prior_default[cov_type],
                            bgmm.covariance_prior_)
Example #10
    # (snippet begins mid-loop; pen_kmeans and the enclosing for-loop are cut off)
    X_pen_scaled = pen_kmeans.fit_predict(X_pen_scaled)
    X_pen_scaled = X_pen_scaled.reshape(-1, 1)

    X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split(
        X_pen_scaled, ypen, test_size=0.20)

    pen_classifier.fit(X_train_pen, y_train_pen)
    pen_pred = pen_classifier.predict(X_test_pen)
    pen_error_kmean.append(1 - metrics.accuracy_score(pen_pred, y_test_pen))
#===========================================================
#===========================EM=============================
from sklearn.decomposition import FastICA
for i in range(1, 31):
    X_pen_scaled = pen_scaler.fit_transform(Xpen)

    pen_bgm = BayesianGaussianMixture(n_components=i)
    X_pen_scaled = pen_bgm.fit_predict(X_pen_scaled)
    X_pen_scaled = X_pen_scaled.reshape(-1, 1)

    X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split(
        X_pen_scaled, ypen, test_size=0.20)

    pen_classifier.fit(X_train_pen, y_train_pen)
    pen_pred = pen_classifier.predict(X_test_pen)
    pen_error_em.append(1 - metrics.accuracy_score(pen_pred, y_test_pen))
#===========================================================

plt.figure(figsize=(12, 6))
plt.plot(range(1, 31),
         pen_error,
         label='No Clustering',
Example #11
# ------------------------
#
#
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

anomaly_algorithms = [
    ("Elliptic Envelope", EllipticEnvelope(contamination=outliers_fraction)),
    ("GMM (2, full)", GaussianMixture(n_components=2, covariance_type='full')),
    ("GMM (4, full)", GaussianMixture(n_components=4, covariance_type='full')),
    #("Gaussian Mixture model (32, full)", GaussianMixture(n_components=4, covariance_type='diag', random_state=1)),
    ("Baysian GMM ",
     BayesianGaussianMixture(n_components=12,
                             covariance_type='diag',
                             random_state=1,
                             n_init=4,
                             degrees_of_freedom_prior=1.1,
                             max_iter=20)),

    # Not yet supported
    #( "Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    #("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
]
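outliers_fraction is defined earlier in the script this was taken from; a plausible value, in line with the scikit-learn anomaly-detection comparison examples this list mirrors, would be:

outliers_fraction = 0.15  # assumed: fraction of points treated as outliers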


# %%
# Plotting tools
# ------------------------
#
# Plots the anomaly score landscape
def plot_results(ax, model, X):
Example #12
# Author: Guillaume Lemaitre <*****@*****.**>
# License: BSD 3 clause

import pytest
import numpy as np

from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture


@pytest.mark.parametrize(
    "estimator",
    [GaussianMixture(), BayesianGaussianMixture()])
def test_gaussian_mixture_n_iter(estimator):
    # check that n_iter is the number of iteration performed.
    rng = np.random.RandomState(0)
    X = rng.rand(10, 5)
    max_iter = 1
    estimator.set_params(max_iter=max_iter)
    estimator.fit(X)
    assert estimator.n_iter_ == max_iter
Example #13
def detectoutliers(type, nodesattr_pre, nodesattr_new):
    outlierrecord = {}
    time_start = time.time()
    # Get the min/max of the k-core numbers
    mink_pre, maxk_pre = maxminvalue(nodesattr_pre)
    mink_new, maxk_new = maxminvalue(nodesattr_new)
    nodes_pre = nodesattr_pre.keys()
    nodes_new = nodesattr_new.keys()
    nodes = list(set(nodes_pre) | set(nodes_new))
    if type == 0:
        # Measure the difference of the high-dimensional attributes between
        # the two snapshots
        #nodesattr_pre=scaledimensions(nodesattr_pre0)
        #nodesattr_new=scaledimensions(nodesattr_new0)
        difference = []
        #nodeschanged=[]
        # To avoid errors, add 1 everywhere, i.e. the midpoint (minimum) is [1, 0]
        for n in nodes:
            differ = [1, 0]
            if n in nodes_pre and n in nodes_new:
                tmpattrs_pre = scaledattrs(mink_pre, maxk_pre,
                                           nodesattr_pre[n])
                tmpattrs_new = scaledattrs(mink_new, maxk_new,
                                           nodesattr_new[n])
                differ[1] = LA.norm(
                    np.array(tmpattrs_new) - np.array(tmpattrs_pre))
            elif n in nodes_pre:
                tmpattrs_pre = scaledattrs(mink_pre, maxk_pre,
                                           nodesattr_pre[n])
                differ[1] = LA.norm(np.array(tmpattrs_pre))
            elif n in nodes_new:
                tmpattrs_new = scaledattrs(mink_new, maxk_new,
                                           nodesattr_new[n])
                differ[1] = LA.norm(np.array(tmpattrs_new))
            if differ[1] == 0:
                differ[1] = 0.0001
            difference.append(differ)
        difference = np.array(difference)
        # Threshold: let differ be the attribute change caused by a degree-1
        # node disappearing; set the threshold to differ * 1.5 and ignore
        # anomalies below it
        mindegree = float(1) / len(nodes_pre)
        threshold = 0
        for n in nodes_pre:
            if nodesattr_pre[n][0] < mindegree * 1.1:
                tmpattrs_pre = scaledattrs(mink_pre, maxk_pre,
                                           nodesattr_pre[n])
                threshold = LA.norm(np.array(tmpattrs_pre))
                break
        threshold = threshold * 1.5
        # Detect outliers
        detector = EllipticEnvelope(contamination=0.1)
        results = detector.fit(difference).predict(difference)
        # Update the outlier record
        recordnodes = outlierrecord.keys()
        for i in range(len(results)):
            if results[i] == -1:
                if difference[i][1] > threshold:
                    outlierrecord[nodes[i]] = difference[i][1]
                    '''
                    # Record how many times each node has been an outlier
                    if nodes[i] in recordnodes:
                        outlierrecord[nodes[i]]=outlierrecord[nodes[i]]+1
                    else:
                        outlierrecord[nodes[i]]=1
                    '''
    else:
        # Only cluster nodes that appear in both snapshots
        nodes_together = list(set(nodes_pre) & set(nodes_new))
        # Normalize the k-core numbers and extract the values
        nodesval_pre = []
        nodesval_new = []
        for i in range(len(nodes_together)):
            n = nodes_together[i]
            nodesval_pre.append(
                scaledattrs(mink_pre, maxk_pre, nodesattr_pre[n]))
            nodesval_new.append(
                scaledattrs(mink_new, maxk_new, nodesattr_new[n]))
        '''for i in range(len(nodes_pre)):
            n=nodes_pre[i]
            nodesval_pre[n] = scaledattrs(mink_pre, maxk_pre, nodesattr_pre[n])
        for i in range(len(nodes_new)):
            n = nodes_new[i]
            nodesval_new[n] = scaledattrs(mink_new, maxk_new, nodesattr_new[n])'''
        nodesval_pre = np.array(nodesval_pre)
        nodesval_new = np.array(nodesval_new)
        # Clustering
        class_pre = BayesianGaussianMixture(
            n_components=6, n_init=3).fit(nodesval_pre).predict(nodesval_pre)
        class_new = BayesianGaussianMixture(
            n_components=6, n_init=3).fit(nodesval_new).predict(nodesval_new)
        # Class splitting
        visited_class = []
        for c in class_pre:
            # For each class in the previous clustering, find which classes
            # its nodes fall into in the latest clustering
            if not c in visited_class:
                visited_class.append(c)
                nodesindex = np.where(class_pre == c)[0]
                newclass = []
                for id in nodesindex:
                    newclass.append(class_new[id])
                newclass = np.array(newclass)
                uniqnewclass = np.array(list(set(newclass)))
                # If the nodes are no longer all in the same class
                if len(uniqnewclass) != 1:
                    for cnew in uniqnewclass:
                        tmpnodesid = np.where(newclass == cnew)[0]
                        # If more than 30% of the nodes moved to the new class
                        # together, it is not an anomaly (e.g. 8 nodes that
                        # used to share one class splitting into groups of 4);
                        # otherwise treat the moved nodes as anomalies
                        if not (len(tmpnodesid) > 0.3 * len(nodesindex)):
                            for id2 in tmpnodesid:
                                tmpnode = nodes_together[nodesindex[id2]]
                                outlierrecord[tmpnode] = 1 - float(
                                    len(tmpnodesid)) / len(nodesindex)
        '''
        # Class merging
        visited_class = []
        for c in class_new:
            if not c in visited_class:
                visited_class.append(c)
                nodesindex = np.where(class_new == c)[0]
                preclass = []
                for id in nodesindex:
                    preclass.append(class_pre[id])
                preclass = np.array(preclass)
                uniqpreclass = np.array(list(set(preclass)))
                if len(uniqpreclass) != 1:
                    for cpre in uniqpreclass:
                        tmpnodesid = np.where(preclass == cpre)[0]
                        if not (len(tmpnodesid) > 0.3 * len(nodesindex)):
                            for id2 in tmpnodesid:
                                tmpnode = nodes_together[nodesindex[id2]]
                                tmpabnormal=1 - float(len(tmpnodesid)) / len(nodesindex)
                                if not(tmpnode in outlierrecord.keys()) or (tmpnode in outlierrecord.keys() and outlierrecord[tmpnode]<tmpabnormal):
                                    outlierrecord[tmpnode] = tmpabnormal
        '''
    time_end = time.time()
    print('detect outliers', time_end - time_start)
    return outlierrecord
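maxminvalue and scaledattrs are helpers from the same module that are not shown; from their usage they appear to min-max normalize a node's attribute vector. A sketch under that assumption:

import numpy as np

def maxminvalue(nodesattr):
    # Hypothetical reconstruction: per-dimension min and max over all nodes.
    vals = np.array(list(nodesattr.values()))
    return vals.min(axis=0), vals.max(axis=0)

def scaledattrs(mink, maxk, attrs):
    # Hypothetical reconstruction: min-max scale one attribute vector.
    rng = np.maximum(np.array(maxk) - np.array(mink), 1e-12)  # avoid division by zero
    return ((np.array(attrs) - np.array(mink)) / rng).tolist()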
Example #14
def fit_gmm(
    max_components,
    n_distances,
    atoms,
    distances,
    regularization_type="bic",
    covariance_type="diag",
):
    """
    Fit a GMM to a set of distances.

    This routine will fit a Gaussian mixture model from a set
    of input distances using sklearn_. The resulting set of parameters can
    be used to initialize a `GMMDistanceRestraint` in a MELD simulation.

    .. _sklearn: http://scikit-learn.org/stable/modules/mixture.html

    Parameters
    ----------
    max_components: int
        Maximum number of components to use in fitting GMM.
    n_distances: int
        Number of distances involved in GMM
    atoms: list of (int, str, int, str) tuples.
        The atoms that are involved in each distance are specified
        as a list of `n_distances` tuples, each of the form
        (r1, n1, r2, n2), where r1, r2 are the integer residue
        indices starting from one, and n1, n2 are the atom names.
    distances: array_like(n_dim=2)
        An (n_samples, n_distances) array of distances (in nm) to fit.
    regularization_type: str
        The type of regularization to use, options are "bic"
        and "dirichlet".
    covariance_type: str
        The form of the covariance matrix, options are "diag"
        and "full".

    Returns
    -------
    GMMParams
        The fit parameters, which can be used to initialize
        a `meld.system.restraints.GMMDistanceRestraint` using
        ``GMMDistanceRestraint.from_params``.

    Notes
    -----
    There are two ways to regularize in order to prevent overfitting.

    ``regularization_type="bic"`` will use the Bayesian information
    criterion to penalize models that have more parameters. When
    using ``bic``, The final number of components in the model
    will be less than or equal to `max_components`.

    ``regularization_type="dirichlet"`` will use a Dirichlet process
    prior on the weight distributions. The final number of components
    in the model will always be equal to `max_components`, but most
    of the weights will be small.

    There are two forms for the covariance matrix, which differ in
    the number of parameters and expressiveness.

    ``covariance_type="diag"`` will fit using a diagonal covariance
    matrix. This has few parameters, but does not capture correlations
    between input distances. Typically, choosing ``"diag"`` will
    result in a model with more components.

    ``covariance_type="full"`` will fit using a full representation
    of the covariance matrix. This captures correlations between
    input distances, but has far more parameters and is potentially
    prone to overfitting.
    """

    #
    # Constants
    #
    N_INIT = 25
    MAX_ITER = 1000
    KFOLD_SPLITS = 5
    REG_COVAR = 1e-4
    RANDOMSEARCH_TRIALS = 32

    #
    # Check the inputs
    #
    if distances.shape[1] != n_distances:
        raise ValueError("distances must have shape (n_samples, n_distances)")

    if len(atoms) != n_distances:
        raise ValueError(
            "atoms must be a list of (ind1, name1, ind2, name2) of "
            "length n_components"
        )

    if regularization_type not in ["bic", "dirichlet"]:
        raise ValueError('regularization_type must be one of ["bic", "dirichlet"]')

    if covariance_type not in ["diag", "full"]:
        raise ValueError('covariance_type must be one of ["diag", "full"]')

    if max_components < 1:
        raise ValueError("max_components must be >= 1")
    if max_components > 32:
        raise ValueError("MELD supports a maximum of 32 GMM components")

    #
    # Create and fit the model
    #
    if regularization_type == "bic":
        # BIC fit
        # Search different values of n_components to find the minimal
        # BIC.
        models = []
        for i in range(1, max_components + 1):
            g = GaussianMixture(
                n_components=i,
                n_init=N_INIT,
                max_iter=MAX_ITER,
                covariance_type=covariance_type,
                reg_covar=REG_COVAR,
            )
            g.fit(distances)
            models.append((g.bic(distances), g))

        gmm = sorted(models, key=lambda x: x[0])[0][1]

    else:
        # Dirichlet process fit
        # use RandomSearchCV to optimize hyperparameters
        params = {
            "weight_concentration_prior": LogUniformSampler(1e-6, 10),
            "mean_precision_prior": LogUniformSampler(1, 10),
        }
        model = BayesianGaussianMixture(
            n_components=max_components,
            n_init=N_INIT,
            max_iter=MAX_ITER,
            covariance_type=covariance_type,
            reg_covar=REG_COVAR,
        )
        rs = RandomizedSearchCV(
            model,
            param_distributions=params,
            n_iter=RANDOMSEARCH_TRIALS,
            cv=KFold(n_splits=KFOLD_SPLITS, shuffle=True),
        )
        rs.fit(distances)
        gmm = rs.best_estimator_

    # turn the vector representation of the diagonal into a full
    # precision matrix
    if covariance_type == "diag":
        precisions = gmm.precisions_
        assert len(precisions.shape) == 2
        new_precisions = []
        for i in range(precisions.shape[0]):
            new_precisions.append(np.diag(precisions[i, :]))
        precisions = np.array(new_precisions)
    else:
        precisions = gmm.precisions_

    # convert the list of atoms into the correct form
    new_atoms = []
    for r1, n1, r2, n2 in atoms:
        new_atoms.append((r1, n1))
        new_atoms.append((r2, n2))

    # Return the parameters for a GMM
    return GMMParams(
        n_components=gmm.weights_.shape[0],
        n_distances=n_distances,
        atoms=new_atoms,
        weights=gmm.weights_,
        means=gmm.means_,
        precisions=precisions,
    )
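LogUniformSampler is not a scikit-learn class; RandomizedSearchCV only requires each sampled distribution to expose an rvs() method, so it is presumably a small custom helper. A sketch under that assumption:

import numpy as np
from sklearn.utils import check_random_state

class LogUniformSampler:
    # Hypothetical reconstruction: samples log-uniformly from [lo, hi].
    def __init__(self, lo, hi):
        self.lo, self.hi = np.log(lo), np.log(hi)

    def rvs(self, random_state=None):
        rng = check_random_state(random_state)
        return float(np.exp(rng.uniform(self.lo, self.hi)))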
Example #15
def test_bayesian_mixture_precisions_prior_initialisation():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)

    # Check raise message for a bad value of degrees_of_freedom_prior
    bad_degrees_of_freedom_prior_ = n_features - 1.0
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=bad_degrees_of_freedom_prior_,
        random_state=rng)
    msg = ("The parameter 'degrees_of_freedom_prior' should be greater than"
           f" {n_features -1}, but got {bad_degrees_of_freedom_prior_:.3f}.")
    with pytest.raises(ValueError, match=msg):
        bgmm.fit(X)

    # Check correct init for a given value of degrees_of_freedom_prior
    degrees_of_freedom_prior = rng.rand() + n_features - 1.0
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=degrees_of_freedom_prior,
        random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior,
                        bgmm.degrees_of_freedom_prior_)

    # Check correct init for the default value of degrees_of_freedom_prior
    degrees_of_freedom_prior_default = n_features
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=degrees_of_freedom_prior_default,
        random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior_default,
                        bgmm.degrees_of_freedom_prior_)

    # Check correct init for a given value of covariance_prior
    covariance_prior = {
        "full": np.cov(X.T, bias=1) + 10,
        "tied": np.cov(X.T, bias=1) + 5,
        "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3,
        "spherical": rng.rand(),
    }

    bgmm = BayesianGaussianMixture(random_state=rng)
    for cov_type in ["full", "tied", "diag", "spherical"]:
        bgmm.covariance_type = cov_type
        bgmm.covariance_prior = covariance_prior[cov_type]
        bgmm.fit(X)
        assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_)

    # Check raise message for a bad spherical value of covariance_prior
    bad_covariance_prior_ = -1.0
    bgmm = BayesianGaussianMixture(
        covariance_type="spherical",
        covariance_prior=bad_covariance_prior_,
        random_state=rng,
    )
    msg = ("The parameter 'spherical covariance_prior' "
           f"should be greater than 0., but got {bad_covariance_prior_:.3f}.")
    with pytest.raises(ValueError, match=msg):
        bgmm.fit(X)

    # Check correct init for the default value of covariance_prior
    covariance_prior_default = {
        "full": np.atleast_2d(np.cov(X.T)),
        "tied": np.atleast_2d(np.cov(X.T)),
        "diag": np.var(X, axis=0, ddof=1),
        "spherical": np.var(X, axis=0, ddof=1).mean(),
    }

    bgmm = BayesianGaussianMixture(random_state=0)
    for cov_type in ["full", "tied", "diag", "spherical"]:
        bgmm.covariance_type = cov_type
        bgmm.fit(X)
        assert_almost_equal(covariance_prior_default[cov_type],
                            bgmm.covariance_prior_)
Example #16
                    color=clrs[i],
                    alpha=0.5,
                    clip_box=ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    plt.grid(b=True, ls=':', color='#606060')

    # DPGMM
    dpgmm = BayesianGaussianMixture(
        n_components=n_components,
        covariance_type='full',
        max_iter=1000,
        n_init=5,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=0.1)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM means =\n', centers)
    print('DPGMM covariances =\n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
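x1, x2, grid_test and cm come from the part of the script cut off above; they are presumably a prediction mesh and colormap, along the lines of this assumed reconstruction:

import numpy as np
import matplotlib as mpl

# assumed reconstruction of the grid and colormap used by pcolormesh above
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
cm = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])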
Example #17
def test_compare_covar_type():
    # We can compare the 'full' precision with the other cov_type if we apply
    # 1 iter of the M-step (done during _initialize_parameters).
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    X = rand_data.X["full"]
    n_components = rand_data.n_components

    for prior_type in PRIOR_TYPE:
        # Computation of the full_covariance
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type="full",
            max_iter=1,
            random_state=0,
            tol=1e-7,
        )
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))
        full_covariances = (
            bgmm.covariances_ *
            bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis])

        # Check tied_covariance = mean(full_covariances, 0)
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type="tied",
            max_iter=1,
            random_state=0,
            tol=1e-7,
        )
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))

        tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_
        assert_almost_equal(tied_covariance, np.mean(full_covariances, 0))

        # Check diag_covariance = diag(full_covariances)
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type="diag",
            max_iter=1,
            random_state=0,
            tol=1e-7,
        )
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))

        diag_covariances = (bgmm.covariances_ *
                            bgmm.degrees_of_freedom_[:, np.newaxis])
        assert_almost_equal(
            diag_covariances,
            np.array([np.diag(cov) for cov in full_covariances]))

        # Check spherical_covariance = np.mean(diag_covariances, 0)
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type="spherical",
            max_iter=1,
            random_state=0,
            tol=1e-7,
        )
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))

        spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_
        assert_almost_equal(spherical_covariances,
                            np.mean(diag_covariances, 1))
Example #18

# (snippet begins mid-assignment; the variable name `colors` is assumed)
colors = np.array(
    ['#0072B2', '#F0E442', '#D55E00', '#EE82EE', '#A0522D', '#2E8B57'])

covars = np.array([[[.1, .02], [.02, .15]], [[.3, -.01], [-.01, .3]],
                   [[.7, .4], [.3, .6]], [[.3, .03], [.09, .3]],
                   [[.6, -.07], [-.05, .6]], [[.6, .13], [.12, .86]]])
samples = np.array([300, 500, 400, 400, 400, 300])
means = np.array([[.8, -2.0], [-2.5, -.05], [-2, 2.0], [1.2, 2.5], [2, 0.7],
                  [-1, -2.0]])

# mean_precision_prior= 0.8 to minimize the influence of the prior
estimators = [("Variational Inference in Finite mixture with Dirichlet Prior ",
               BayesianGaussianMixture(
                   weight_concentration_prior_type="dirichlet_distribution",
                   n_components=3 * n_components,
                   reg_covar=0,
                   init_params='random',
                   max_iter=5,
                   mean_precision_prior=.8,
                   random_state=random_state,
                   tol=1e-5), [1])]

#Discard small proportions
SMALL_PROBS = 0

# Generate data
rng = np.random.RandomState(random_state)
X = np.vstack([
    rng.multivariate_normal(means[j], covars[j], samples[j])
    for j in range(n_components)
])
y = np.concatenate(
Example #19
    def execute(self, namespace):
        from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
        from PYME.IO import MetaDataHandler

        points = namespace[self.input_points]
        X = np.stack([points['x'], points['y'], points['z']], axis=1)

        if self.mode == 'n':
            gmm = GaussianMixture(n_components=self.n,
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = gmm.score_samples(X)
            if not gmm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        elif self.mode == 'bic':
            n_components = range(1, self.n + 1)
            bic = np.zeros(len(n_components))
            for ind in range(len(n_components)):
                gmm = GaussianMixture(n_components=n_components[ind],
                                      covariance_type=self.covariance,
                                      max_iter=self.max_iter,
                                      init_params=self.init_params)
                gmm.fit(X)
                bic[ind] = gmm.bic(X)
                logger.debug('%d BIC: %f' % (n_components[ind], bic[ind]))

            best = n_components[np.argmin(bic)]
            if best == self.n or (self.n > 10 and best > 0.9 * self.n):
                logger.warning(
                    'BIC optimization selected n components near n max')

            gmm = GaussianMixture(n_components=best,
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = gmm.score_samples(X)
            if not gmm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        elif self.mode == 'bayesian':
            bgm = BayesianGaussianMixture(n_components=self.n,
                                          covariance_type=self.covariance,
                                          max_iter=self.max_iter,
                                          init_params=self.init_params)
            predictions = bgm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = bgm.score_samples(X)
            if not bgm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        out = tabular.MappingFilter(points)
        try:
            out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
        except AttributeError:
            pass

        out.addColumn(self.label_key, predictions)
        out.addColumn(self.label_key + '_log_prob', log_prob)
        avg_log_prob = np.empty_like(log_prob)
        for label in np.unique(predictions):
            mask = label == predictions
            avg_log_prob[mask] = np.mean(log_prob[mask])
        out.addColumn(self.label_key + '_avg_log_prob', avg_log_prob)
        namespace[self.output_labeled] = out
Example #20
                                           batch_size=batch_size,
                                           shuffle=True)

model = VAE(n_genes, latent_dim=args.latent_dim).to(device)
fit(train_loader, model, args.epochs, n_genes)

#%% Visualization ------------------------------
from sklearn.manifold import TSNE
from sklearn.mixture import BayesianGaussianMixture
import matplotlib.pyplot as plt
#from sklearn.cluster import KMeans

params = {'edgecolor': 'white'}
clustering = BayesianGaussianMixture(
    n_components=args.nclusters,
    covariance_type='diag',
    max_iter=1000,
    weight_concentration_prior_type='dirichlet_process')

dimred = TSNE(n_components=2)
fig2, ax2 = plt.subplots(1, 1)
cmap = iter([plt.cm.tab20(x) for x in range(0, 20)])

with torch.no_grad():
    diter = iter(train_loader)
    y, lab = next(diter)  # .next() is Python 2 only; use next() on Python 3
    mu, lvar = model.encode(y.view(-1, n_genes))
    y2 = model.reparam(mu, lvar)
    clustering.fit(y2.numpy())
    idx = clustering.fit_predict(y2.numpy())
    #    idx = km.fit_predict(y2.numpy())
Example #21
def train(data: np.ndarray,
          obs_len: int,
          filter_name: str,
          model_dir: str,
          result_dir: str,
          save_model: bool = True) -> None:
	
	print('[Bayesian Gaussian Mixture Clustering][train] creating model...')

	bgm = BayesianGaussianMixture(n_components=3,
						  		  covariance_type="full",
						  		  max_iter=1000,
						  		  tol=1e-5,
						  		  n_init=10,
						  		  random_state=7,
						  		  weight_concentration_prior_type='dirichlet_process',
						  		  init_params="kmeans")

	print('[Bayesian Gaussian Mixture Clustering][train] training...')

	_y = bgm.fit_predict(X=data)
	_y = np.expand_dims(_y, axis=1)

	print(f'[Bayesian Gaussian Mixture Clustering][train] converged?:{bgm.converged_}')

	print('[Bayesian Gaussian Mixture Clustering][train] params (center and covariance):')
	for i, m, c, w in zip(range(1, 4), bgm.means_, bgm.covariances_, bgm.weights_):
		print(f'\tc_{i}-> mean: {m}')
		print(f'\t\tcov: {c}')
		print(f'\t\tweight: {w}')

	print('[Bayesian Gaussian Mixture Clustering][train] results:')
	_c, _l = np.unique(_y, return_counts=True)
	for i, c in zip(_c,_l):
		print (f'\tc_{i}: {c}')

	if save_model:
		model_file=f'bgm_{obs_len}s_{filter_name}.pkl'
		print (f'[Bayesian Gaussian Mixture Clustering][train] saving model ({model_file})...')
		with open(os.path.join(model_dir, model_file), 'wb') as f:
			pickle.dump(bgm, f)


	result_file = f'results_bgm_train_{obs_len}s_{filter_name}.csv'
	print (f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
	labels = ['mean_velocity', 
			  'mean_acceleration', 
			  'mean_deceleration', 
			  'std_lateral_jerk', 
			  'driving_style']

	result = np.concatenate((data, _y), axis=1)
	df = pd.DataFrame(data=result, columns=labels)
	df.to_csv(os.path.join(result_dir,result_file))

	result_file = result_file.replace('results', 'params').replace('csv', 'json')
	print (f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
	_d = {}
	_d['means'] = bgm.means_.tolist()
	_d['covariances'] = bgm.covariances_.tolist()
	_d['weights'] = bgm.weights_.tolist()
	with open(os.path.join(result_dir, result_file), 'w') as f:
		json.dump(_d, f)
Example #22
expected_mean = X[:, :, 1].mean(axis=0)
expected_std = X[:, :, 1].std(axis=0)

n_demonstrations, n_steps, n_task_dims = X.shape
X_train = np.empty((n_demonstrations, n_steps, n_task_dims + 1))
X_train[:, :, 1:] = X
t = np.linspace(0, 1, n_steps)
X_train[:, :, 0] = t
X_train = X_train.reshape(n_demonstrations * n_steps, n_task_dims + 1)

random_state = check_random_state(0)
n_components = 4
initial_means = kmeansplusplus_initialization(X_train, n_components,
                                              random_state)
initial_covs = covariance_initialization(X_train, n_components)
# Note: initial_means/initial_covs are computed here, but
# BayesianGaussianMixture does not accept initial parameters, so they
# remain unused below.
bgmm = BayesianGaussianMixture(n_components=n_components,
                               max_iter=100).fit(X_train)
gmm = GMM(n_components=n_components,
          priors=bgmm.weights_,
          means=bgmm.means_,
          covariances=bgmm.covariances_,
          random_state=random_state)
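kmeansplusplus_initialization, covariance_initialization and GMM are presumably from the gmr library (Gaussian mixture regression), which this snippet appears to be based on; its missing imports would look roughly like:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state
from sklearn.mixture import BayesianGaussianMixture
from gmr import GMM, kmeansplusplus_initialization, covariance_initialization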

plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.title("Model and data adaptation")

plt.plot(X[:, :, 0].T, X[:, :, 1].T, c="k", alpha=0.1)

means_over_time = []
y_stds = []
for step in t:
Example #23
def main(data_directory_path, init_file):
    pred = pklload('predicted.txt')
    wins = pklload('windows.txt')
    raws = pklload('r_data.txt')

    print('DATA_DIRECTORY:{}'.format(data_directory_path))
    print('CONFIGURATION_FILE: {}'.format(init_file))

    # settings recovering
    # via settings.ini file
    print('Parameters recovering..')
    config = ConfigParser()
    config.read('settings.ini')

    # parameters recovering
    # features domain
    fdom = config.get('section_b', 'fdom')
    sampling_freq = config.getfloat('section_b', 'sampling_freq')
    # epoch half size as int
    epk_half_sizei = config.getint('section_a', 'epk_half_size')

    # frequencies banks
    frequency_bands = eval(config.get('section_a', 'frequency_bands'))

    # best setting recovering
    best_setting = config.get('section_c', 'best_setting').split(',')

    if (best_setting[0] == 'None'):
        print('please run training_pipeline script before testing!!')
    else:
        # features domain
        fdom = best_setting[0]
        # reduction  procedure
        redux_proc = best_setting[1]
        # classifiers
        clf_type = best_setting[2]

    # Raw data recovering
    print('Data loading..')
    r_data = load_raws_within_dir(data_directory_path)

    # BUILDING ARTIFICIAL EQUALLY SPACED WINDOWS OVER PSEUDO EVENTS
    windows = []
    for raw in r_data:
        windows.append(
            windower(raw, 0, -epk_half_sizei / sampling_freq,
                     epk_half_sizei / sampling_freq))

    # FEATURES COMPUTATION
    features_set = None
    if (fdom == 'time') or (fdom == 'time_frequency'):
        print('######################## Time Domain Features - computations -')
        tdf = extract_td_features_from_epks(windows)

        # data formatting/reshaping
        rtdf = reshape_numpy(tdf)

        # standardization
        rtdf_std = []
        for data in rtdf:
            rtdf_std.append(standardize_data(data))
        features_set = rtdf_std

    if (fdom == 'frequency') or (fdom == 'time_frequency'):
        # frequency domain coefficients computing
        print(
            '########################Frequency domain coefficients computation..'
        )
        print(type(frequency_bands))
        fd_coeffs = band_filter(windows, frequency_bands)

        print(
            '######################## Frequency Domain Features - computations -'
        )
        fdf = []
        for dec in fd_coeffs:
            fdf.append(svm_features(dec))

        # data formatting (reshaping)
        rfdf = reshape_numpy(fdf)

        # standardization
        rfdf_std = []
        for data in rfdf:
            rfdf_std.append(standardize_data(data))

        features_set = rfdf_std

    if fdom == 'time_frequency':
        # time and frequency domain features concatenation
        rtfdf = []
        for tf, ff in zip(rtdf, rfdf):
            print(tf.shape, ff.shape)
            rtfdf.append(np.concatenate((tf, ff), axis=1))

        # standardization
        rtfdf_std = []
        for features in rtfdf:
            rtfdf_std.append(standardize_data(features))

        features_set = rtfdf_std

    # DIMENSION REDUCTION
    redux_set = []
    for features in features_set:
        if redux_proc == 'pca':
            redux_set.append(pca(features, 2))
        elif redux_proc == 'ica':
            redux_set.append(ica(features, 2))
        #elif redux_proc == 'lda':
        #    redux = eest.lda(fset, 2, labset)
        else:  # no reduction -> ident
            redux_set.append(ident(features))

    # CLASSIFICATION

    # classifier selection
    n_classes = 2
    if clf_type == 'kmeans':
        clf = KMeans(n_clusters=n_classes)
    #elif clf_type == 'svm':
    #    # SVM- support vector machine
    #    clf = svm.SVC()
    elif clf_type == 'hc':  # hierarchical clustering
        clf = AgglomerativeClustering(n_clusters=n_classes,
                                      affinity='euclidean',
                                      linkage='ward')
    elif clf_type == 'if':  # isolation forest
        clf = IsolationForest()
    elif clf_type == 'em':
        # n_components should be chosen via the BIC criterion
        # (see the sketch after this function)
        # covariance_type: full (default) / spherical / tied / diag
        clf = GaussianMixture(n_components=n_classes, covariance_type='full')
    elif clf_type == 'ap':  # affinity propagation
        clf = AffinityPropagation(
            random_state=5,
            max_iter=1000)  # convergence issues might need tuning
    elif clf_type == 'bgm':  # BayesianGaussianMixture
        clf = BayesianGaussianMixture(n_components=n_classes, max_iter=200)
    else:  # error handling (default behaviour) TODO
        print('unknown clf_type: {}'.format(clf_type))
        clf = None

    # PREDICTION
    predicted = []
    for features in redux_set:
        # fit_predict also covers clusterers without a separate
        # predict() method (e.g. AgglomerativeClustering)
        predicted.append(clf.fit_predict(features[0]))

    # RAW OBJECT: EVENT ADDITION
    # RAW OBJECT: EVENT ADDITION
    pkldump(r_data, 'r_data.txt')
    pkldump(windows, 'windows.txt')
    pkldump(predicted, 'predicted.txt')

    tagged = add_events_to_raws(predicted, windows, r_data)
    return tagged
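The comment in the 'em' branch above defers n_components to the BIC criterion. BayesianGaussianMixture does not expose a bic() method, but plain GaussianMixture does, so a selection sweep could look like the sketch below (synthetic data and illustrative names, not part of the original pipeline):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(4, 1, (100, 2))])

fits = [GaussianMixture(n_components=k, covariance_type='full',
                        random_state=0).fit(X) for k in range(1, 7)]
best = min(fits, key=lambda m: m.bic(X))  # lower BIC is better
print('chosen n_components:', best.n_components)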
Exemplo n.º 24
0
    def get(self, request, name):

        if not request.user.info.is_teacher or name == 'undefined':
            return HttpResponse('Unauthorized', status=401)
        try:
            user = User.objects.get(username=request.user)

            classroom = Classes.objects.get(name=name, creator=user)

            users = [x.info for x in classroom.students.all()]

            learners = [Serial.parm_to_skill(info.params[0]) for info in users]
            reprs = []

            for i in learners:
                rep = [i.learners[j].mu for j in i.learners]
                if rep:
                    reprs.append(rep)

            reprs = np.array(reprs)

            tsne = TSNE(n_components=2)
            tsne = tsne.fit(reprs)

            repX = tsne.embedding_[:, 0].tolist()
            repY = tsne.embedding_[:, 1].tolist()

            GM = BayesianGaussianMixture(n_components=2, max_iter=200)
            GM = GM.fit(reprs)
            clusters = GM.predict(reprs).tolist()  # computed but unused below

            usernames = [x.user.username for x in users]

            types = [x.userType for x in users]

            usersinfo = [{
                'x': obj[0],
                'y': obj[1],
                'type': obj[2],
                'r': 10,
                'user': obj[3]
            } for obj in zip(repX, repY, types, usernames)]

            formatted = [{
                'label': 'Blind students',
                'data': [x for x in usersinfo if x['type'] == 0],
                'backgroundColor': '#EA9AAD85'
            }, {
                'label': 'Partially blind students',
                'data': [x for x in usersinfo if x['type'] == 1],
                'backgroundColor': '#7EB7DF75'
            }]

            return Response(formatted)
        except Exception as e:
            print(e)
            return HttpResponse(e, status=500)
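The view above boils down to two independent steps: t-SNE for the 2-D display coordinates and a Bayesian mixture for the cluster labels, both fit on the same raw vectors. A minimal standalone sketch of that pattern (synthetic data; fit_transform is equivalent to the fit-then-embedding_ access used above):

import numpy as np
from sklearn.manifold import TSNE
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
reprs = rng.normal(size=(30, 5))  # stand-in for the per-student skill vectors

emb = TSNE(n_components=2, perplexity=5).fit_transform(reprs)
clusters = BayesianGaussianMixture(n_components=2,
                                   max_iter=200).fit_predict(reprs)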
Exemplo n.º 25
0
scl = [0, "rgb(150,0,90)"], [0.125, "rgb(0, 0, 200)"], [0.25, "rgb(0, 25, 255)"], \
      [0.375, "rgb(0, 152, 255)"], [0.5, "rgb(44, 255, 150)"], [0.625, "rgb(151, 255, 0)"], \
      [0.75, "rgb(255, 234, 0)"], [0.875, "rgb(255, 111, 0)"], [1, "rgb(255, 0, 0)"]

if __name__ == '__main__':

    scoords = SitesCoords()
    sites_i = 31265
    sites_f = 12100
    nc = 50
    mutual = True
    lsites = scoords.get_direct_neighbors(sites_i, 0.35)
    # lsites = range(sites_i, sites_f)
    lclust = compute_clusterings(lsites, nc, mutual=mutual)
    mdist = compute_distance_matrix(lclust, mutual=mutual)
    #plot_md_scaling(mdist)
    tdata = md_scaling(mdist)

    #cs = adjust_nc(tdata)
    #kmeans = KMeans(n_clusters=cs)
    #labels = kmeans.fit_predict(tdata)

    gmm = BayesianGaussianMixture(n_components=10,
                                  covariance_type='full',
                                  max_iter=1000,
                                  n_init=10,
                                  tol=0.00001)
    labels = gmm.fit_predict(tdata)
    create_plot(data_plot(lsites, labels), str(sites_i))
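Because the Dirichlet prior lets BayesianGaussianMixture shrink the weights of unneeded components toward zero, a quick post-fit check on how many of the 10 components were actually used might be (illustrative threshold):

effective = (gmm.weights_ > 1e-2).sum()
print('weights:', gmm.weights_.round(3), '| effective components:', int(effective))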
Exemplo n.º 26
0
# Parameters of the dataset
random_state, n_components, n_features = 2, 3, 2
colors = np.array(['#0072B2', '#F0E442', '#D55E00'])

covars = np.array([[[.7, .0], [.0, .1]], [[.5, .0], [.0, .1]],
                   [[.5, .0], [.0, .1]]])
samples = np.array([200, 500, 200])
means = np.array([[.0, -.70], [.0, .0], [.0, .70]])

# mean_precision_prior= 0.8 to minimize the influence of the prior
estimators = [("Finite mixture with a Dirichlet distribution\nprior and "
               r"$\gamma_0=$",
               BayesianGaussianMixture(
                   weight_concentration_prior_type="dirichlet_distribution",
                   n_components=2 * n_components,
                   reg_covar=0,
                   init_params='random',
                   max_iter=1500,
                   mean_precision_prior=.8,
                   random_state=random_state), [0.001, 1, 1000]),
              ("Infinite mixture with a Dirichlet process\n prior and"
               r"$\gamma_0=$",
               BayesianGaussianMixture(
                   weight_concentration_prior_type="dirichlet_process",
                   n_components=2 * n_components,
                   reg_covar=0,
                   init_params='random',
                   max_iter=1500,
                   mean_precision_prior=.8,
                   random_state=random_state), [1, 1000, 100000])]

# Generate data
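The snippet is cut off at the data-generation step. A plausible completion, consistent with the means, covars and samples arrays above (an assumption, not the original code):

rng = np.random.RandomState(random_state)
X = np.vstack([rng.multivariate_normal(means[j], covars[j], samples[j])
               for j in range(n_components)])
y = np.concatenate([np.full(samples[j], j, dtype=int)
                    for j in range(n_components)])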
Exemplo n.º 27
0
def km_em(x_train_scaled, dataset_name="", true_vals=y_train, reg_covar=1e-01):
    distortions = []
    sil = []
    n = 22
    # v_measure = []
    homogeneity = []
    completeness = []
    mutual_info = []
    adj_rand_score = []
    kmeans_times = []
    homogeneity_em = []
    completeness_em = []
    mutual_info_em = []
    adj_rand_score_em = []
    sil_em = []
    em_times = []
    em_likelihood = []
    for i in range(2,n+1):
#         print(i)
        start_time = time.time()
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(x_train_scaled)
        distortions.append(kmeans.inertia_)
        y_pred = kmeans.predict(x_train_scaled)
        kmeans_times.append(time.time()-start_time)
        homogeneity.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        sil.append(silhouette_score(x_train_scaled, kmeans.labels_, metric='euclidean'))
        start_time = time.time()
        gm = BayesianGaussianMixture(n_components = i, random_state=random_state, reg_covar=reg_covar)
        y_pred = gm.fit_predict(x_train_scaled)
        em_times.append(time.time()-start_time)
        homogeneity_em.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness_em.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info_em.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score_em.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        if len(set(y_pred)) > 1:
            sil_em.append(silhouette_score(x_train_scaled, y_pred, metric='euclidean'))
        else:
            # silhouette is undefined for a single cluster; append a placeholder
            sil_em.append(1)
        em_likelihood.append(gm.score(x_train_scaled))
    # plot
    plt.plot(range(2, n+1), distortions, marker='o')
    plt.title("K-means Elbow ("+(str(dataset_name))+")")
    plt.xlabel('Number of clusters')
    plt.ylabel('Sum of Squared Distances')
    plt.savefig((str(dataset_name))+' km elbow.png')
    plt.show()

    plt.plot(range(2, n+1), sil, marker='o')
    plt.title('K-means Silhouette Scores ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name))+' km silho.png')
    plt.show()

    plt.plot(range(2, n+1), em_likelihood, marker='o')
    plt.title('EM likelihood ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Likelihood')
    plt.savefig((str(dataset_name))+' em likelihood.png')
    plt.show()
    
    plt.plot(range(2, n+1), sil_em, marker='o')
    plt.title('EM Silhouette Scores ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name))+' em silho.png')
    plt.show()
    
    plt.close()
    plot_data(list(range(2, n + 1)), homogeneity, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(2, n + 1)), completeness, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(2, n + 1)), mutual_info, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="red", label='Adjusted Mutual Info')
    plot_data(list(range(2, n + 1)), adj_rand_score, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted Rand Index')
    # plot_data(list(range(2, n + 1)), v_measure, title="Performance Evaluation k-means", x_label="Number of Clusters", y_label="Score", color="brown", label='V-measure')
    plt.savefig((str(dataset_name))+' km perfo.png')
    plt.show()

    plt.close()
    plot_data(list(range(2, n + 1)), homogeneity_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(2, n + 1)), completeness_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(2, n + 1)), mutual_info_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="red", label='Adjusted Mutual Info')
    plot_data(list(range(2, n + 1)), adj_rand_score_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted Rand Index')
    # plot_data(list(range(2, n + 1)), v_measure, title="Performance Evaluation EM", x_label="Number of Clusters", y_label="Score", color="brown", label='V-measure')
    plt.savefig((str(dataset_name))+' em perfo.png')
    plt.show()

    plt.close()
    plot_data(list(range(2, n + 1)), kmeans_times, title="k-means/EM Running Time ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Time (s)", color="red", label='k-means')
    plot_data(list(range(2, n + 1)), em_times, title="k-means/EM Running Time ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Time (s)", color="blue", label='EM')
    plt.savefig((str(dataset_name))+' km-em time.png')
    plt.show()
    print('kmeans_times')
    print(kmeans_times)
    print('em_times')
    print(em_times)
    
    return {'sil': sil, 'kmeans_times':kmeans_times, 'em_times':em_times, 'homogeneity':homogeneity, 'completeness':completeness, 'mutual_info':mutual_info, 'adj_rand_score':adj_rand_score, 'homogeneity_em':homogeneity_em, 'completeness_em':completeness_em, 'mutual_info_em':mutual_info_em, 'adj_rand_score_em':adj_rand_score_em}
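A typical call, assuming the globals the function leans on (y_train, random_state, time, plot_data and the sklearn metric imports) are already in scope:

results = km_em(x_train_scaled, dataset_name='Dataset 1')
best_k = results['sil'].index(max(results['sil'])) + 2  # the sweep starts at k=2
print('best k-means k by silhouette:', best_k)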
Exemplo n.º 28
0
def adtk_init(model, colname=None):
    if model == 'iqr':
        from adtk.detector import InterQuartileRangeAD
        clf = InterQuartileRangeAD()
    elif model == 'ar':
        from adtk.detector import AutoregressionAD
        clf = AutoregressionAD()
    elif model == 'esd':
        from adtk.detector import GeneralizedESDTestAD
        clf = GeneralizedESDTestAD()
    elif model == 'level':
        from adtk.detector import LevelShiftAD
        clf = LevelShiftAD(15)
    elif model == 'persist':
        from adtk.detector import PersistAD
        clf = PersistAD(15)
    elif model == 'quantile':
        from adtk.detector import QuantileAD
        clf = QuantileAD()
    elif model == 'seasonal':
        from adtk.detector import SeasonalAD
        clf = SeasonalAD()
    elif model == 'volatility':
        from adtk.detector import VolatilityShiftAD
        clf = VolatilityShiftAD(15)
    elif model == 'kmeans':
        from adtk.detector import MinClusterDetector
        from sklearn.cluster import KMeans
        clf = MinClusterDetector(KMeans(n_clusters=2))
    elif model == 'birch':
        from adtk.detector import MinClusterDetector
        from sklearn.cluster import Birch
        clf = MinClusterDetector(Birch(threshold=0.25, branching_factor=25))
    elif model == 'gmm':
        from adtk.detector import MinClusterDetector
        from sklearn.mixture import GaussianMixture
        clf = MinClusterDetector(GaussianMixture(n_components=2, max_iter=50))
    elif model == 'vbgmm':
        from adtk.detector import MinClusterDetector
        from sklearn.mixture import BayesianGaussianMixture
        clf = MinClusterDetector(BayesianGaussianMixture(n_components=2, max_iter=50))
    elif model == 'eliptic':
        from adtk.detector import OutlierDetector
        from sklearn.covariance import EllipticEnvelope
        clf = OutlierDetector(EllipticEnvelope())
    elif model == 'mcdad':
        from adtk.detector import OutlierDetector
        from sklearn.covariance import MinCovDet
        clf = OutlierDetector(MinCovDet())
    elif model == 'isof':
        from adtk.detector import OutlierDetector
        from sklearn.ensemble import IsolationForest
        clf = OutlierDetector(IsolationForest())
    elif model == 'lofad':
        from adtk.detector import OutlierDetector
        from sklearn.neighbors import LocalOutlierFactor
        clf = OutlierDetector(LocalOutlierFactor())
    elif model == 'pcaad':
        from adtk.detector import PcaAD
        clf = PcaAD()
    elif model == 'linear':
        from adtk.detector import RegressionAD
        from sklearn.linear_model import LinearRegression
        clf = RegressionAD(LinearRegression(), target=colname)
    elif model == 'rf':
        from adtk.detector import RegressionAD
        from sklearn.ensemble import RandomForestRegressor
        clf = RegressionAD(RandomForestRegressor(), target=colname)
    elif model == 'huber':
        from adtk.detector import RegressionAD
        from sklearn.linear_model import HuberRegressor
        clf = RegressionAD(HuberRegressor(), target=colname)
    elif model == 'knnad':
        from adtk.detector import RegressionAD
        from sklearn.neighbors import KNeighborsRegressor
        clf = RegressionAD(KNeighborsRegressor(), target=colname)
    elif model == 'kernridge':
        from adtk.detector import RegressionAD
        from sklearn.kernel_ridge import KernelRidge
        clf = RegressionAD(KernelRidge(), target=colname)
    else:
        clf = ADTKDefault()
    return clf
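A hedged usage sketch for this factory: adtk detectors expose fit_detect on a validated pandas Series, so driving the 'iqr' branch end to end might look like this (synthetic series, illustrative only):

import numpy as np
import pandas as pd
from adtk.data import validate_series

idx = pd.date_range('2021-01-01', periods=200, freq='H')
s = pd.Series(np.sin(np.arange(200) / 10.0), index=idx)
s.iloc[120] += 5  # inject one spike

clf = adtk_init('iqr')
anomalies = clf.fit_detect(validate_series(s))
print('anomalous points:', int(anomalies.sum()))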
Exemplo n.º 29
0
def train(loader, epoch, model_list, method='ocsvm'):
    # scores above the threshold are treated as normal
    # model_list matters for models that need multiple training rounds:
    # pass in the models from the previous round, e.g. ocnn
    datas, labels = get_features(loader)

    threshold_list = []
    update_models = []
    update_optimizer = []
    clf_list, optimizers = model_list

    for label in range(args.class_num):  # fit one one-class model per class
        condition_index = np.where(labels == label)[0]
        fit_data = datas[condition_index]  # training data for this label
        optimizer = optimizers[label]

        if method == 'ocsvm':
            clf = OneClassSVM()
        elif method == 'isofore':
            clf = IsolationForest()
        elif method == 'gmm':
            clf = BayesianGaussianMixture()
        elif method == 'svdd':
            clf = SVDD(parameters)
        elif method == 'lof':
            clf = LocalOutlierFactor(novelty=True,
                                     n_neighbors=int(fit_data.size * 0.1))
        elif method == 'cnn':
            clf = ''
        elif method != 'sp':
            clf = clf_list[label]

        # train the anomaly detection model
        if method == 'ocnn':
            clf, optimizer = fit(clf, fit_data, optimizer, epoch)
            scores_temp = score_samples(clf, fit_data, epoch)
        elif method == 'lof':
            clf.fit(fit_data)
            scores_temp = clf.decision_function(fit_data)
        elif method == 'sp':
            pass
        elif method == 'cnn':
            pass
        else:
            clf.fit(fit_data)
            scores_temp = clf.score_samples(fit_data)

        # compute each anomaly detection model's threshold
        if method != 'sp' and method != 'gmm' and method != 'cnn':
            threshold = np.mean(scores_temp) - \
                args.threshold_std_times*np.std(scores_temp)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)
        elif method == 'gmm':
            threshold = np.mean(scores_temp)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)
        elif method == 'sp':
            from cnn import get_c_v
            threshold_list = get_c_v(p_s=datas, labels=labels)
        elif method == 'cnn':
            threshold_list = ''

    model_list = (update_models, update_optimizer)
    return model_list, threshold_list
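Per the convention noted at the top of the function (scores above the threshold mean normal), the stored thresholds would be applied at test time roughly as below; this is a sketch, not part of the original pipeline:

def is_normal(clf, threshold, x):
    # sklearn's score_samples returns higher values for more typical points
    return clf.score_samples(x) > threshold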
Exemplo n.º 30
0
#         ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
#         ell.set_clip_box(splot.bbox)
#         ell.set_alpha(0.5)
#         splot.add_artist(ell)
#
#     plt.title(title)
#     plt.show()


maxscore, now_c = 0, 0
for now_community in range(min_community, max_community + 1):
    model = BayesianGaussianMixture(n_components=now_community,
                                    covariance_type="full",
                                    # reg_covar=0,  # dropping this improved the score by ~0.01
                                    max_iter=expected_iter_times,  # 100
                                    init_params="random",
                                    random_state=5,
                                    weight_concentration_prior_type="dirichlet_distribution",
                                    # weight_concentration_prior=1e-2,
                                    # mean_precision_prior=.8,
                                    verbose=0, verbose_interval=10, warm_start=True)  # 0.259

    model.fit(data)
    label_pred = model.predict(data)
    pred_community = np.column_stack((num_list, label_pred))

    pred_community = pred_community[np.argsort(pred_community[:, 0])]
    labels_pred = pred_community[:, 1]

    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    if nmi > maxscore:
        maxscore = nmi
        now_c = now_community
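After the sweep, now_c holds the component count that maximized NMI against the ground-truth labels; a one-line report of the selection might be:

print('best n_components: %d (NMI = %.3f)' % (now_c, maxscore))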