Example #1
import csv

import numpy as np
from sklearn import mixture

with open(r'C:\Users\kenny\Desktop\CAAM 495 - Senior Design\LandLL\histograms\code\histogram_2.csv') as histfile:
    histdata = csv.reader(histfile)
    intensities = []
    frequencies = []
    histogram = []
    for row in histdata:
        #if int(row[0]) > -500:  <-- Use this line of code to block out noise
            #histogram.append([float(i) for i in row])
            intensities.append(float(row[0]))
            #frequencies.append(float(row[1]))
n=len(intensities)
x = np.asarray(intensities, order='F').reshape(n,1)
#plt.hist(myarray,bins=256)
#plt.show()
print x
gmm = mixture.GMM(n_components=2) # gmm for two components
gmm.fit(x) # train it

print gmm

# linspace = np.linspace(-10, 10, 1000)
#
# fig, ax1 = plt.subplots()
# ax2 = ax1.twinx()
#
# ax1.hist(x, 100) # draw samples
# ax2.plot(linspace, np.exp(gmm.score_samples(linspace)[0]), 'r') # draw GMM
# plt.show()
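# For reference, a minimal sketch of the same two-component fit with the newer
# scikit-learn API (GaussianMixture replaced mixture.GMM in 0.18+).  This is an
# illustration, not part of the original snippet; it reuses the `intensities`
# list built above.
from sklearn.mixture import GaussianMixture

x_new = np.asarray(intensities).reshape(-1, 1)
gm_new = GaussianMixture(n_components=2).fit(x_new)
print(gm_new.weights_)                   # mixing weights of the two components
print(gm_new.means_)                     # component means
print(gm_new.score_samples(x_new[:5]))   # per-sample log-likelihoods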


Example #2
# Generate random sample following a sine curve
np.random.seed(0)
X = np.zeros((n_samples, 2))
step = 4 * np.pi / n_samples

for i in xrange(X.shape[0]):
    x = i * step - 6
    X[i, 0] = x + np.random.normal(0, 0.1)
    X[i, 1] = 3 * (np.sin(x) + np.random.normal(0, .2))

color_iter = itertools.cycle(
    ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])

for i, (clf, title) in enumerate([
    (mixture.GMM(n_components=10, covariance_type='full',
                 n_iter=100), "Expectation-maximization"),
    (mixture.DPGMM(n_components=10,
                   covariance_type='full',
                   alpha=0.01,
                   n_iter=100), "Dirichlet Process,alpha=0.01"),
    (mixture.DPGMM(n_components=10,
                   covariance_type='diag',
                   alpha=100.,
                   n_iter=100), "Dirichlet Process,alpha=100.")
]):

    clf.fit(X)
    splot = plt.subplot(3, 1, 1 + i)
    Y_ = clf.predict(X)
    for i, (mean, covar,
            color) in enumerate(zip(clf.means_, clf._get_covars(),
Example #3
        corr_list.append(corr)
    file.close()
    corr_list = np.array(corr_list)

    ##============================= part#1: PDF of the original data ==============================
    plt.hist(corr_list, histtype='step', bins=500, normed=1, alpha=0.5)
    plt.plot([], [], color='blue', label="original pdf")
    #plt.title("Hidden batch distribution after one sample (with the initialization we calculated)")
    #plt.xlabel("Value of hidden batch")
    #plt.ylabel("Frequency")
    #plt.axis([-1, 1, 0, 7])  # un-normalized version: y ~ [0, 300]; normalized version: y ~ [0, 7]
    #plt.show()

    ##============================= part#2: get the two Gaussian mixture ==============================
    np.random.seed(1)
    g = mixture.GMM(n_components=2)

    ##=========== transform the corr_list into the required format (a weird format) ===========
    reshaped = []
    for corr in corr_list:
        reshaped.append([corr])
    corr_list = np.array(reshaped)

    obs = corr_list
    #obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
    g.fit(obs)
    #GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, n_components=2, n_init=1, n_iter=100, params='wmc', random_state=None, thresh=None, tol=0.001)

    ##=========== get model parameters ===========
    ## show the learned model:
    # weight:
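    # (Illustrative sketch; the original code is cut off here.)  With the old GMM API
    # the learned parameters would typically be read from these attributes:
    print g.weights_   # mixing weights of the two components
    print g.means_     # component means
    print g.covars_    # component (diagonal) covariances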
Example #4
import pylab as pl
from sklearn import mixture

n_samples = 300
c_types = ['full', 'diag', 'spherical']
np.random.seed(0)
C = np.array([[0., -0.7], [3.5, 1.7]])
X_train = np.dot(np.random.randn(n_samples, 2), C)

pl.figure(dpi=100, figsize=(3, 3))
pl.scatter(X_train[:, 0], X_train[:, 1], .8)
pl.axis('tight')
pl.savefig('GaussianFit-data.svg')
pl.close()
for c_type in c_types:
    clf = mixture.GMM(n_components=1, covariance_type=c_type)
    clf.fit(X_train)
    x = np.linspace(-15.0, 20.0, num=200)
    y = np.linspace(-10.0, 10.0, num=200)
    X, Y = np.meshgrid(x, y)
    XX = np.c_[X.ravel(), Y.ravel()]  # flatten
    Z = np.log(-clf.eval(XX)[0])
    Z = Z.reshape(X.shape)

    pl.figure(dpi=100, figsize=(3, 3))
    CS = pl.contour(X, Y, Z)
    pl.scatter(X_train[:, 0], X_train[:, 1], .8)
    pl.axis('tight')
    pl.savefig('GaussianFit-%s.svg' % c_type)
    pl.close()
Example #5
            meanDist = run_grmean_meanDist(samples[sample], dataDir, thr, hemi)
            stdev = run_stdev_meanDist(samples[sample], dataDir, thr, hemi)
            meanDist_norm = np.zeros((10242))
            meanDist_norm[cort] = (meanDist[np.nonzero(meanDist)] -
                                   meanDist[np.nonzero(meanDist)].mean()
                                   ) / meanDist[np.nonzero(meanDist)].std()
            trt_4 = run_icc_meanDist(samples[sample], dataDir, fsDir, thr,
                                     hemi, ['1a', '1b', '2a', '2b'])
            trt_2 = run_icc_meanDist(samples[sample], dataDir, fsDir, thr,
                                     hemi, ['1ab', '2ab'])
            nan_mask = run_nan_grmask(samples[sample], dataDir, hemi)

            data_gmm = {}
            for n_comp in num_gmm_comps:
                data = meanDist * 1000
                gmm = mixture.GMM(n_components=n_comp, n_iter=1000)
                gmm.fit(data[cort])
                bic = gmm.bic(data[cort])
                aic = gmm.aic(data[cort])
                res = np.zeros(10242)
                res[cort] = gmm.predict(data[cort])
                res[cort] = res[cort] + 1
                data_gmm[n_comp] = res
                homogeneity = homogeneity_score(res[cort], yeo7[0][cort])
                df_gmm_eval.loc[len(df_gmm_eval)] = [
                    str(thr), hemi, n_comp, bic, aic, homogeneity,
                    gmm.converged_
                ]

            for node in range(10242):
                df.loc[len(df)] = [
Example #6
d1_2=sc.spatial.distance.cdist(segedpsds1[lone_half:lone],basis_set,'sqeuclidean')

d2=sc.spatial.distance.cdist(segedpsds2[:ltwo_half],basis_set,'sqeuclidean')

d2_2=sc.spatial.distance.cdist(segedpsds2[ltwo_half:ltwo],basis_set,'sqeuclidean')

mx=np.max([np.max(d1),np.max(d2),np.max(d1_2),np.max(d2_2)])

#convert to similarity matrices:
s1=1-(d1/mx)
s1_2=1-(d1_2/mx)
s2=1-(d2/mx)
s2_2=1-(d2_2/mx)

#estimate GMMs:
mod1=mixture.GMM(n_components=k,n_iter=100000,n_init=5,covariance_type='full')
mod1.fit(s1)

mod2=mixture.GMM(n_components=k2,n_iter=100000,n_init=5,covariance_type='full')
mod2.fit(s2)

len2=len(s2)
len1=len(d1)

#calculate likelihoods for held out data:
score1_1=mod1.score(s1_2)
score2_1=mod2.score(s1_2)

score1_2=mod1.score(s2_2)
score2_2=mod2.score(s2_2)
Example #7
def cluster_gmm():
    snp_data_grouped_by_snp = hqa.generate_snp_data()
    for i, snp_data_for_all_samples in enumerate(snp_data_grouped_by_snp):
        print(snp_data_for_all_samples['snp_id'])
        snp_data = snp_data_for_all_samples['snp_data']
        x_vals = [
            float('nan') if x['x_norm'] is None else x['x_norm']
            for x in snp_data
        ]
        y_vals = [
            float('nan') if x['y_norm'] is None else x['y_norm']
            for x in snp_data
        ]

        X = np.transpose([x_vals, y_vals])

        # remove any non-finite values
        x_finite = np.isfinite(X)
        X = X[np.all(x_finite, axis=1), ]

        print('finding model')
        lowest_info_crit = None
        best_model = None
        best_n_components = 0
        cv_types = ['tied']  #['spherical', 'tied', 'diag', 'full']
        for cv_type in cv_types:
            print(cv_type)
            for n_components in range(1, 6):
                model = mixture.GMM(n_components=n_components,
                                    covariance_type=cv_type,
                                    n_iter=100)

                model.fit(X)
                info_crit = model.bic(X)
                print('SNP ID: {}, # Clusts: {}, BIC: {}'.format(
                    snp_data_for_all_samples['snp_id'], n_components,
                    info_crit))
                if lowest_info_crit is None or info_crit < lowest_info_crit:
                    best_model = model
                    lowest_info_crit = info_crit
                    best_n_components = n_components
                    best_cv_type = cv_type

        print('found best n_components: {}, cv_type: {}'.format(
            best_n_components, best_cv_type))

        Y_ = best_model.predict(X)

        #do_plot = len(set(Y_)) > 1
        do_plot = False
        if do_plot:
            color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])

            fig = pylab.figure()
            ax = fig.add_subplot(111, aspect='equal')

            for j, (mean, covar, color) in enumerate(
                    zip(best_model.means_, best_model._get_covars(),
                        color_iter)):
                v, w = linalg.eigh(covar)
                u = w[0] / linalg.norm(w[0])

                # as the DP will not use every component it has access to
                # unless it needs it, we shouldn't plot the redundant
                # components.
                if not np.any(Y_ == j):
                    continue

                if do_plot:
                    ax.scatter(X[Y_ == j, 0], X[Y_ == j, 1], .8, color=color)

                    # Plot an ellipse to show the Gaussian component
                    angle = np.arctan(u[1] / u[0])
                    angle = 180 * angle / np.pi  # convert to degrees
                    ell = mpl.patches.Ellipse(mean,
                                              v[0],
                                              v[1],
                                              180 + angle,
                                              color=color)
                    ell.set_clip_box(ax.bbox)
                    ell.set_alpha(0.5)
                    ax.add_artist(ell)

            fig.suptitle('SNP ID: {}, # Clusts: {}, BIC: {}'.format(
                snp_data_for_all_samples['snp_id'], best_n_components,
                lowest_info_crit))
            # pylab.show()
            fig.savefig('imgout/{}-best.png'.format(
                snp_data_for_all_samples['snp_id']),
                        bbox_inches='tight')
            plt.clf()
Example #8
# Generate random sample following a sine curve
np.random.seed(0)
X = np.zeros((n_samples, 2))
step = 4*np.pi/n_samples

for i in xrange(X.shape[0]):
    x = i*step-6
    X[i,0] = x+np.random.normal(0, 0.1)
    X[i,1] = 3*(np.sin(x)+np.random.normal(0, .2))


color_iter = itertools.cycle (['r', 'g', 'b', 'c', 'm'])


for i, (clf, title) in enumerate([
        (mixture.GMM(n_components=10, covariance_type='diag'), \
             "Expectation-maximization"),
        (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=0.01),
         "Dirichlet Process,alpha=0.01"),
        (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100.),
         "Dirichlet Process,alpha=100.")
        ]):

    clf.fit(X, n_iter=100)
    splot = pl.subplot(3, 1, 1+i)
    Y_ = clf.predict(X)
    for i, (mean, covar, color) in enumerate(zip(clf.means, clf.covars,
                                                 color_iter)):
        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
        # as the DP will not use every component it has access to
Example #9
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.imshow(np.rot90(Z),
          cmap=plt.cm.gist_earth_r,
          extent=[xmin, xmax, ymin, ymax])
#ax.plot(gps[:,0], gps[:,1], 'k.', markersize=1)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.show()

n_components = 10
covariance_type = 'spherical'  # 'full'#'diag'
gmm = mixture.GMM(n_components=n_components,
                  covariance_type=covariance_type,
                  min_covar=0.00001,
                  n_iter=1000)
#gmm._set_covars(np.ones((n_components, 2)) * 0.05)
gmm.fit(data)

import itertools
import matplotlib as mpl


def plot_mixture(gmm, ax, scale=1.0):
    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
    for n, color in zip(range(gmm.n_components), color_iter):
        v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2])
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan2(u[1], u[0])
        angle = 180 * angle / np.pi  # convert to degrees
Example #10
def fit_gaussians_etc(x, y, surrogate_repeat, gain, run_direc, file, savefig=True, fz=14):

    # window length is in bins (1 FR bin = 2cm) => 3*2cm = 6cm Kernel (Chen = 8.5cm, Ravassard = 5cm)
    y = signale.tools.smooth(y, window_len=3.)

    # generate one more x-point between all existing x-points:________________________________________________
    x_doublePointNum = numpy.arange(x[0], x[-1]+numpy.diff(x)[0]/2., numpy.diff(x)[0]/2.)

    # data = pl.hist(x, weights=y, bins=len(numpy.arange(min(x), max(x), x[1]-x[0])))

    # in case firing rates are too small (<5), multiply them by a factor so that the resulting
    # histogram represents the data shape better_____________________________________________________________
    if max(y) < 5:
        input_y = 5. * (y/max(y))
    else:
        input_y = multi*y

    # generate data histogram___________________________________________________________________________________
    data = numpy.repeat(x, numpy.around(input_y).astype(int))

    # generate surrogate data____________________________________________________________________________________
    for su in [0]:  # numpy.arange(surrogate_repeat+1):

        # if su != 0 and good == 1:  # for su=0 actual data will be plotted
        #
        #     # generate randomly shuffled data___________________________________________________________________
        #     surrogate_data = numpy.random.choice(list(x), len(data))
        #     data = surrogate_data
        #
        #     # generate histogram for shuffled data______________________________________________________________
        #     bin_num = len(numpy.arange(min(x), max(x), abs(x[1]-x[0])))
        #     new_y = numpy.histogram(surrogate_data, bins=bin_num, range=(min(x), max(x)))[0]
        #
        #     # undo multiplication from line 97 to get actual firing rates back__________________________________
        #     if max(y) < 5:
        #         y = (new_y * max(y))/5.
        #     else:
        #         y = new_y

        # fit two gaussians_____________________________________________________________________________________
        gmm = mixture.GMM(n_components=2, covariance_type='full', min_covar=0.0000001)  # gmm for two components
        gmm.fit(numpy.vstack(data))  #numpy.vstack(data))  #numpy.vstack(data))  # train it!

        # get functions for two fitted gaussians________________________________________________________________
        gauss1 = (gmm.weights_[0] * matplotlib.mlab.normpdf(x_doublePointNum, gmm.means_[0], numpy.sqrt(gmm.covars_[0])))[0]
        gauss2 = (gmm.weights_[1] * matplotlib.mlab.normpdf(x_doublePointNum, gmm.means_[1], numpy.sqrt(gmm.covars_[1])))[0]

        # calculate basic values for the FR distribution y_______________________________________________________
        std = numpy.std(y)
        mean = numpy.mean(y)

        # calculate basic values for the gaussians_______________________________________________________________
        mg1a = max(gauss1)
        mg2a = max(gauss2)

        stdg1 = numpy.sqrt(gmm.covars_[0])[0][0]
        stdg2 = numpy.sqrt(gmm.covars_[1])[0][0]

        # calculate x difference between two gaussian peaks_______________________________________________________
        xDiff = abs(x_doublePointNum[numpy.argmax(gauss1)]-x_doublePointNum[numpy.argmax(gauss2)])

        # define x-window depending on difference of gaussian peaks, in which gauss amplitudes will be normalised
        # ________________________________________________________________________________________________________
        if file.endswith('normalised.hkl'):
            xDiff_cutoff = 0.2/float(gain)
        else:
            xDiff_cutoff = 0.20

        # define the divisor of the standard deviation from the maximum FR that should be used____________________
        std_dev1 = 2.
        std_dev2 = 2.

        if xDiff > xDiff_cutoff < (1./4.)*max(x) and stdg1 > 0.3:   # for larger fields 1/8. of the std will be used
            std_dev1 = 8.
        if xDiff > xDiff_cutoff < (1./4.)*max(x) and stdg2 > 0.3:
            std_dev2 = 8.

        # amplitude maximum for gauss 1___________________________________________________________________________
        x1 = numpy.argmax(gauss1)+1

        if x1 >= len(x)-1:  # if gauss 1 fit maximum is outside the data use maximum close to track end
            x1 = len(x)-3

        # find the indices that the FR maximum should be taken from__________________________________
        g1_max0 = numpy.argmin(abs(x_doublePointNum[0:x1]-(x_doublePointNum[x1-1]-stdg1/std_dev1)))
        g1_max1 = numpy.argmin(abs(x_doublePointNum[x1:-1]-(x_doublePointNum[x1-1]+stdg1/std_dev1)))+x1

        # define new indices for special cases_______________________________
        if g1_max0 >= len(y)-1 and g1_max1+1 >= len(y)-1:  # maximum should be taken from outside the data
            g1_max0 = len(y)-5
            g1_max1 = len(y)-2
        if g1_max0 == g1_max1+1:  # indices are equal, so that there is no range from which the max can be taken
            g1_max0 -= 2
            g1_max1 += 2
        if g1_max0 < 0:  # maximum should be taken from outside the data
            g1_max0 = 0
        if g1_max1+1 > len(y)-1:  # larger index only is outside the data
            g1_max1 = len(y)-2

        # get gauss maximum in area of interest_____________
        g1_maxFR = max(y[g1_max0:g1_max1+1])

        # amplitude maximum for gauss 2___________________________________________________________________________
        x2 = numpy.argmax(gauss2)+1

        if x2 >= len(x)-1:  # if gauss 2 fit maximum is outside the data use maximum close to track end
            x2 = len(x)-3

        # find the indices that the FR maximum should be taken from__________________________________
        g2_max0 = numpy.argmin(abs(x_doublePointNum[0:x2]-(x_doublePointNum[x2-1]-stdg2/std_dev2)))
        g2_max1 = numpy.argmin(abs(x_doublePointNum[x2:-1]-(x_doublePointNum[x2-1]+stdg2/std_dev2)))+x2

        # define new indices for special cases (see above)_______________________________
        if g2_max0 >= len(y)-1 and g2_max1+1 >= len(y)-1:
            g2_max0 = len(y)-5
            g2_max1 = len(y)-2
        if g2_max0 == g2_max1+1:
            g2_max0 -= 2
            g2_max1 += 2
        if g2_max0 < 0:
            g2_max0 = 0
        if g2_max1+1 > len(y)-1:
            g2_max1 = len(y)-2

        # get gauss maximum in area of interest_____________
        g2_maxFR = max(y[g2_max0:g2_max1+1])

        # set gauss closest to y distribution maximum to its maximum___________________________________________
        nearest_yMax_gauss = signale.tools.findNearest(numpy.array([x_doublePointNum[numpy.argmax(gauss1)],
                                                                    x_doublePointNum[numpy.argmax(gauss2)]]),
                                                       x[numpy.argmax(y)])[0]
        if nearest_yMax_gauss == 0:
            g1_maxFR = max(y)
        else:
            g2_maxFR = max(y)

        # normalise gaussians to FR maximum of distribution y within gaussian maximum +/- 0.5 of its std:__________
        gauss1 = g1_maxFR*(gauss1/mg1a) # first normalise gauss to max=1 then multiply with new maximum
        gauss2 = g2_maxFR*(gauss2/mg2a)

        # get gauss amplitude and weights______________________
        amplitude_g1 = gmm.weights_[0] * g1_maxFR/mg1a
        amplitude_g2 = gmm.weights_[1] * g2_maxFR/mg2a

        weight_g1 = amplitude_g1/(amplitude_g1 + amplitude_g2)
        weight_g2 = amplitude_g2/(amplitude_g1 + amplitude_g2)

        # get maxima of gaussians with new amplitude__________
        mg1 = max(gauss1)
        mg2 = max(gauss2)
        # max_mean_diff_in_std1 = (mg1 - mean)/std
        # max_mean_diff_in_std2 = (mg2 - mean)/std

        # define plot colors based on which gauss is bigger______
        if mg1 >= mg2:
            colour = ['r', 'k']
            small_max = mg2
            small_max_index = numpy.argmax(gauss2)
        else:
            colour = ['k', 'r']
            small_max = mg1
            small_max_index = numpy.argmax(gauss1)

        # calculate values to get m = deltaF/Fmean:____________________________________________
        # derivative1 = numpy.diff(gauss1+gauss2) / numpy.diff(x_doublePointNum)
        #
        # # remove negative values in beginning of derivative
        # if run_direc == 'left':
        #     # for leftwards runs the array is starting from the end of the track!
        #     sc = -1
        #     pre_sign = 1
        #     sign_array = numpy.arange(len(derivative1))[::-1]  # backwards array
        # else:
        #     sc = 0
        #     pre_sign = -1
        #     sign_array = numpy.arange(len(derivative1))
        #
        # # set negative slopes at the beginning of the derivative to zero, as they are artifacts___
        # zero_crossings = numpy.where(numpy.diff(numpy.sign(derivative1)))[0]
        # if len(zero_crossings):
        #     first_sign_change = zero_crossings[sc]+1
        #
        #     if run_direc == 'left':
        #         derivative1[first_sign_change:len(derivative1)][derivative1[first_sign_change:len(derivative1)] < 0] = 0.
        #     else:
        #         derivative1[0:first_sign_change][derivative1[0:first_sign_change] < 0] = 0.
        # # ________________________________________________________________________________________
        #
        # # use sign change of derivative to detect zero crossings (for that replace zeros with neighbouring values)____
        # sign = numpy.sign(derivative1)
        #
        # # get rid of zeros and use sign value from the value before
        # for l in sign_array:
        #     if sign[l] == 0.:
        #         if run_direc == 'right' and l == 0:
        #             sign[l] = sign[l+1]
        #         elif run_direc == 'left' and l == len(sign)-1:
        #             sign[l] = sign[l-1]
        #         else:
        #             sign[l] = sign[l+pre_sign]
        # # get rid of remaining zeros in the array edges
        # for l in sign_array[::-1]:
        #     if sign[l] == 0.:
        #         if run_direc == 'left' and l == 0:
        #             sign[l] = sign[l+1]
        #         elif run_direc == 'right' and l == len(sign)-1:
        #             sign[l] = sign[l-1]
        #         else:
        #             sign[l] = sign[l-pre_sign]
        #
        # # find derivative zero crossings____________________________________________________________
        # deri1_zero = numpy.where(numpy.diff(sign))[0]+1
        #
        # if len(deri1_zero) == 3:  # with 3 zero crossings m-value can be calculated____________
        #     between_peak_min_index = deri1_zero[1]
        #
        #     between_peak_min = (gauss1+gauss2)[between_peak_min_index]
        #     index_delta = abs(between_peak_min_index-small_max_index)
        #
        #     delta_F = small_max-between_peak_min
        #
        #     # special cases______________________
        #     if small_max_index-index_delta < 0:
        #         s_index = 0
        #     else:
        #         s_index = small_max_index-index_delta
        #
        #     if small_max_index+index_delta+1 > len(x)-1:
        #         l_index = len(x_doublePointNum)-1
        #     else:
        #         l_index = small_max_index+index_delta+1
        #     # __________________________________
        #
        #     small_peak_mean = numpy.mean((gauss1+gauss2)[s_index: l_index])
        #
        #     # calculate m-value_______________________________________________________________________
        #     m = delta_F/small_peak_mean
        #
        #     if numpy.isnan(m):
        #         print 'delta_F = ', delta_F
        #         print 'small_peak_mean = ', small_peak_mean
        #         print 'mean for index1 to index2 : ', small_max_index-index_delta, small_max_index+index_delta+1
        #         print (gauss1+gauss2)[small_max_index-index_delta: small_max_index+index_delta+1]
        #         sys.exit()
        #
        #     if su != 0:
        #         M.append(m)
        #     else:
        #         M_data.append(m)
        #         good = 1
        #         extra_path = 'Deriv_good/'
        #
        # else:  # not 3 zero crossings -> m-value cannot be calculated
        #     if su == 0:
        #         M_data.append(numpy.nan)
        #         good = 0
        #         extra_path = 'Deriv_bad/'

        if su == 0:
            # plot data and gaussians from mixture model

            fig22, ax22 = pl.subplots(1, 1, figsize=(18, 12))

            ax22.axhline(mean, linestyle='-', color=custom_plot.pretty_colors_set2[0], alpha=0.8, zorder=0)
            ax22.axhspan(mean-std, mean+std, facecolor=custom_plot.pretty_colors_set2[0], alpha=0.2, linewidth=False, zorder=0)
            ax22.plot(x, y, 'b')
            ax22.plot(x_doublePointNum, gauss1, linewidth=2, color=colour[0])  # gauss1 = small gauss
            ax22.plot(x_doublePointNum, gauss2, linewidth=2, color=colour[1])
            ax22.plot(x_doublePointNum, gauss1+gauss2, linewidth=2, color='g')
            ax22.set_xlabel('Position from start point (m)', fontsize=fz)
            ax22.set_ylabel('Firing rate (Hz)', fontsize=fz)

            ax22.set_ylim(0, max(gauss1+gauss2)+0.01)
            ax22.set_xlim(0, max(x))

    return fig22, ax22
Example #11
plt.xticks(())
plt.yticks(())
plt.show()

#Alternative clustering methods

aff = cluster.AffinityPropagation()
aff.fit(X_train)
print(aff.cluster_centers_indices_.shape)

ms = cluster.MeanShift()
ms.fit(X_train)
print(ms.cluster_centers_.shape)

from sklearn import mixture
gm = mixture.GMM(n_components=n_digits,
                 covariance_type='tied',
                 random_state=42)
gm.fit(X_train)

# Print clustering metrics on the test set
y_pred = gm.predict(X_test)
print("Adjusted rand score:{:.2}".format(
    metrics.adjusted_rand_score(y_test, y_pred)))

print("Homogeneity score:{:.2} ".format(
    metrics.homogeneity_score(y_test, y_pred)))

print("Completeness score: {:.2} ".format(
    metrics.completeness_score(y_test, y_pred)))
Example #12
        S2.append(x)
    else:
        S3.append(x)

# print(S1)
# print(S2)
# print(S3)

# 'spherical', 'diag', 'tied', 'full'
covariance = 'tied'
n_gauss = 4

accuracy_results = []
for i in xrange(num_simulation):

    gmix1 = mixture.GMM(n_components=n_gauss, covariance_type=covariance)
    gmix1.fit(S1)
    # print gmix1.means_

    gmix2 = mixture.GMM(n_components=n_gauss, covariance_type=covariance)
    gmix2.fit(S2)
    # print gmix2.means_

    gmix3 = mixture.GMM(n_components=n_gauss, covariance_type=covariance)
    gmix3.fit(S3)
    # print gmix3.means_

    y_train_pred1 = gmix1.score_samples(X_train)
    y_train_pred2 = gmix2.score_samples(X_train)
    y_train_pred3 = gmix3.score_samples(X_train)
Example #13
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data',
    sep=",",
    header=None)
colnames = [
    'preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'
]
data.columns = colnames
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.columns = colnames[:len(colnames) - 1]

rp = SparseRandomProjection(n_components=6)
projected_data = rp.fit_transform(X)

gm = mixture.GMM(n_components=2, covariance_type='diag')
gm.fit(projected_data)
X_expect = y
y_pred = gm.predict(projected_data)

both = pd.concat([pd.DataFrame(y_pred), pd.DataFrame(y)], 1)
both.columns = ['pred', 'class']

from sklearn.metrics import accuracy_score
print "Accuracy"
print accuracy_score(both['class'], both['pred'])

for k in range(1, 8):
    model = mixture.GMM(n_components=k, covariance_type='diag')
    labels = model.fit_predict(projected_data)
    if k == 2:
Example #14
def M_EM(gm, X):
    from sklearn import mixture
    k = gm.k
    sklgmm = mixture.GMM(n_components=k, covariance_type='diag', n_init=5, n_iter = 10, thresh = 1e-2)
    sklgmm.fit(X)
    return sklgmm.means_, sklgmm.covars_
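# Hypothetical usage of M_EM above (the names below are illustrative, not from the
# source): `gm` only needs a `.k` attribute giving the number of mixture components.
import numpy as np

class _FakeGM(object):
    k = 3   # number of components requested from the sklearn GMM

X_demo = np.random.randn(500, 2)
means, covars = M_EM(_FakeGM(), X_demo)
print(means.shape)    # (3, 2)
print(covars.shape)   # (3, 2): one row of diagonal variances per component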
Example #15
def cluster_and_display_waters(site_number, w_positions_np):
    def optimize_n(positions_np, n_data):

        bic = {}
        for n in [x + 1 for x in range(20)]:
            if n < len(positions_np):
                gmm = mixture.GMM(n_components=n,
                                  covariance_type='spherical',
                                  n_iter=20)
                gmm.fit(positions_np)
                score = sum(gmm.score(positions_np))
                lambda_c = 15  # 3 too few
                bic_l = score - lambda_c * 0.5 * math.log(n_data) * n
                bic[n] = bic_l

        for key in bic:
            print("   water bic", key, bic[key])

        key, value = max(iter(bic.items()), key=lambda x: x[1])
        return key

    n_components = optimize_n(w_positions_np, len(w_positions_np))
    print("optimize_n for water:::::::::::::", n_components)
    dpgmm = mixture.GMM(n_components, covariance_type='spherical', n_iter=40)
    dpgmm.fit(w_positions_np)

    cluster_assignments = dpgmm.predict(w_positions_np)

    color_list = [
        'green', 'greentint', "sea", 'yellow', "yellowtint", "aquamarine",
        "forestgreen", "goldenrod", "orangered", "orange", "cyan", 'red',
        "blue"
    ]
    color_list = color_list * 2048  # equivalent to doubling the list 11 times: enough colours for any cluster index

    means = dpgmm.means_
    cvs = dpgmm._get_covars()
    weights = dpgmm.weights_

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) +
                                         " selected waters")
    for i, pos in enumerate(w_positions_np):
        mean = means[cluster_assignments[i]]
        # reject spheres at the origin - (from DPGMM strangeness)
        d = mean[0] * mean[0] + mean[1] * mean[1] + mean[2] * mean[2]
        if d > 1.0:
            col = color_list[cluster_assignments[i]]
            coot.to_generic_object_add_point(obj, col, 10, pos[0], pos[1],
                                             pos[2])
        else:
            print("reject prediction", i, "for cluster",
                  cluster_assignments[i])

    # set_display_generic_object(obj, 1)

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) +
                                         " water cluster means")

    for i, cv in enumerate(cvs):

        mean = means[i]
        d = mean[0] * mean[0] + mean[1] * mean[1] + mean[2] * mean[2]
        v, w = linalg.eigh(cv)

        # print "mean  ", mean
        # print "weight", weights[i], "prec", precs[i]
        # print "weight", weights[i]
        # print "v", v

        if d > 1.0:

            pos = mean
            thick = 2
            cluster_star_obj(obj, pos, thick, v[0])

        else:
            print("reject", mean, v)

    coot.set_display_generic_object(obj, 1)

    cluster_assignments_as_list = [int(x) for x in cluster_assignments]

    return (dpgmm, cluster_assignments_as_list)
Example #16
print("Testing accuracy: ", clf.score(X_test, y_test_bin))

print("Zero: ",
      sum(y_test_bin == np.zeros(y_test_bin.shape[0])) / y_test_bin.shape[0])
print("One: ",
      sum(y_test_bin == np.ones(y_test_bin.shape[0])) / y_test_bin.shape[0])

scores = cross_val_score(clf, X_train, X_test, cv=5)
print("Cross Val: ", scores)

print("#################")

##### GMM #######
print("GMM")

clf = mixture.GMM(n_components=4)
clf.fit(X_train)
print("GMM score")
print(clf.score_samples(X_test))

print("#################")

##### Naive Bayes #######
print("Gaussian Naive Bayes")

clf = GaussianNB()
clf.fit(X_train, y_train)

print("Training accuracy: ", clf.score(X_train, y_train))
print("Testing accuracy: ", clf.score(X_test, y_test))
Example #17
def do_system_training(dataset,
                       model_path,
                       feature_normalizer_path,
                       feature_path,
                       classifier_params,
                       dataset_evaluation_mode='folds',
                       classifier_method='gmm',
                       overwrite=False):
    """System training

    model container format:

    {
        'normalizer': normalizer class
        'models' :
            {
                'office' : mixture.GMM class
                'home' : mixture.GMM class
                ...
            }
    }

    Parameters
    ----------
    dataset : class
        dataset class

    model_path : str
        path where the models are saved.

    feature_normalizer_path : str
        path where the feature normalizers are saved.

    feature_path : str
        path where the features are saved.

    classifier_params : dict
        parameter dict

    dataset_evaluation_mode : str ['folds', 'full']
        evaluation mode; with 'full', all available material is treated as belonging to one fold.
        (Default value='folds')

    classifier_method : str ['gmm', 'dnn']
        classifier method; 'gmm' and 'dnn' are supported
        (Default value='gmm')

    overwrite : bool
        overwrite existing models
        (Default value=False)

    Returns
    -------
    nothing

    Raises
    -------
    ValueError
        classifier_method is unknown.

    IOError
        Feature normalizer not found.
        Feature file not found.

    """

    if classifier_method != 'gmm' and classifier_method != 'dnn':
        raise ValueError("Unknown classifier method [" + classifier_method +
                         "]")

    # Check that target path exists, create if not
    check_path(model_path)

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        current_model_file = get_model_filename(fold=fold, path=model_path)
        if not os.path.isfile(current_model_file) or overwrite:
            # Load normalizer
            feature_normalizer_filename = get_feature_normalizer_filename(
                fold=fold, path=feature_normalizer_path)
            if os.path.isfile(feature_normalizer_filename):
                normalizer = load_data(feature_normalizer_filename)
            else:
                raise IOError("Feature normalizer not found [%s]" %
                              feature_normalizer_filename)

            # Initialize model container
            model_container = {'normalizer': normalizer, 'models': {}}

            # Collect training examples
            file_count = len(dataset.train(fold))
            data = {}
            for item_id, item in enumerate(dataset.train(fold)):
                progress(title_text='Collecting data',
                         fold=fold,
                         percentage=(float(item_id) / file_count),
                         note=os.path.split(item['file'])[1])

                # Load features
                feature_filename = get_feature_filename(
                    audio_file=item['file'], path=feature_path)
                if os.path.isfile(feature_filename):
                    feature_data = load_data(feature_filename)['feat']
                else:
                    raise IOError("Features not found [%s]" % (item['file']))

                # Scale features
                feature_data = model_container['normalizer'].normalize(
                    feature_data)

                # Store features per class label
                if item['scene_label'] not in data:
                    data[item['scene_label']] = feature_data
                else:
                    data[item['scene_label']] = numpy.vstack(
                        (data[item['scene_label']], feature_data))

            le = pp.LabelEncoder()
            tot_data = {}

            # Train models for each class
            for label in data:
                progress(title_text='Train models', fold=fold, note=label)
                if classifier_method == 'gmm':
                    model_container['models'][label] = mixture.GMM(
                        **classifier_params).fit(data[label])
                elif classifier_method == 'dnn':
                    if 'x' not in tot_data:
                        tot_data['x'] = data[label]
                        tot_data['y'] = numpy.repeat(label,
                                                     len(data[label]),
                                                     axis=0)
                    else:
                        tot_data['x'] = numpy.vstack(
                            (tot_data['x'], data[label]))
                        tot_data['y'] = numpy.hstack(
                            (tot_data['y'],
                             numpy.repeat(label, len(data[label]), axis=0)))
                else:
                    raise ValueError("Unknown classifier method [" +
                                     classifier_method + "]")

            if classifier_method == 'dnn':
                clf = skflow.TensorFlowDNNClassifier(**classifier_params)
                tot_data['y'] = le.fit_transform(tot_data['y'])
                clf.fit(tot_data['x'], tot_data['y'])
                clf.save('dnn/dnnmodel1')

            # Save models
            save_data(current_model_file, model_container)
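# Illustrative call of do_system_training (a sketch; the dataset object, paths and
# parameter values are assumptions, not from the source).  classifier_params is
# forwarded directly to mixture.GMM(**classifier_params) above.
do_system_training(dataset=dataset,
                   model_path='models/',
                   feature_normalizer_path='feature_normalizers/',
                   feature_path='features/',
                   classifier_params={'n_components': 16,
                                      'covariance_type': 'diag',
                                      'n_iter': 40},
                   classifier_method='gmm',
                   overwrite=False)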
Example #18
def fit_samples(samples, ncomponents):
    gmix = mixture.GMM(n_components=ncomponents, covariance_type='full')
    gmix.fit(samples)
    return gmix
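# Illustrative usage of fit_samples on synthetic data (not from the original source):
import numpy as np

samples = np.vstack([np.random.randn(200, 2),
                     np.random.randn(200, 2) + [4.0, 4.0]])
gmix = fit_samples(samples, ncomponents=2)
print(gmix.weights_)   # mixing weights, roughly [0.5, 0.5]
print(gmix.means_)     # component means, near [0, 0] and [4, 4]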
Example #19
def latent_cluster_centers(SAMObject,
                           X=None,
                           labels=None,
                           center='gaussian',
                           plot=True,
                           which_indices=(0, 1),
                           randSeed=None,
                           ax=None):
    """
    Find centers for the clusters identified by `labels` in the latent space. A center can be a Gaussian
    density (mean and covariance) or, not yet implemented, a mean or median, as controlled by `center`.
    """
    from sklearn import mixture

    assert (labels is not None)

    if X is None:
        X = SAMObject._get_latent()

    cluster_labels = np.unique(labels)
    K = len(cluster_labels)
    Q = X.shape[1]

    cntr = np.zeros((K, Q)) * np.nan
    if center == 'gaussian':
        covars = np.zeros((K, Q, Q)) * np.nan
    else:
        covars = None

    for i in range(K):
        if center == 'gaussian':
            g = mixture.GMM(covariance_type='full',
                            init_params='wmc',
                            min_covar=0.001,
                            n_components=1,
                            n_init=1,
                            n_iter=300,
                            params='wmc',
                            random_state=randSeed,
                            thresh=None,
                            tol=0.001,
                            verbose=0)
            g.fit(X[labels == cluster_labels[i], :])
            cntr[i, :] = g.means_
            covars[i] = g.covars_
        elif center == 'median':
            raise NotImplementedError("This is not implemented yet")
        elif center == 'mean':
            raise NotImplementedError("This is not implemented yet")
        else:
            raise ValueError('Unknown center type: ' + str(center))

    if plot:
        color_iter = colors = cm.rainbow(np.linspace(0, 1, 20))
        myperm = np.random.permutation(color_iter.shape[0])
        color_iter = color_iter[myperm, :]
        marker_iter = itertools.cycle((',', '+', '.', 'o', '*', 'v', 'x', '>'))
        splot = pb.subplot(1, 1, 1)

        for i, (color, marker) in enumerate(zip(color_iter, marker_iter)):
            pb.scatter(X[labels == cluster_labels[i], which_indices[0]],
                       X[labels == cluster_labels[i], which_indices[1]],
                       s=40,
                       color=color,
                       marker=marker)

            if i == K - 1:
                break
        if ax is None:
            ax = pb.gca()
        for i in range(K):
            util_plot_cov_ellipse(cntr[i, :],
                                  covars[i],
                                  ax=ax,
                                  which_indices=which_indices)
    return cntr, covars
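# Minimal usage sketch for latent_cluster_centers (synthetic X and labels; the
# SAMObject argument is only consulted when X is None, so None is passed here):
import numpy as np

X_demo = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5.0])
labels_demo = np.array([0] * 50 + [1] * 50)
centers, covariances = latent_cluster_centers(None, X=X_demo, labels=labels_demo,
                                              plot=False)
print(centers)   # one mean per cluster label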
Example #20
# <codecell>

# %pdoc sklmix.GMM
# %psource sklmix.GMM

#sklmix.GMM?
#sklmix.GMM??

# <headingcell level=4>

# Pick your favorite clustering algorithm

# <codecell>

gmm_model = sklmix.GMM(n_components=3, covariance_type='full')
gmm_model.fit(iris[['PW', 'PL', 'SW']])
yhat = gmm_model.predict(iris[['PW', 'PL', 'SW']])
crosstab = pd.crosstab(iris['Type'],
                       yhat,
                       rownames=['true'],
                       colnames=['predicted'])
print crosstab

# <headingcell level=4>

# Align the confusion matrix with a non-standard package

# <codecell>

import munkres
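# The snippet is cut off here; one plausible way to finish the alignment with munkres
# (a sketch, not the original code) is to treat the crosstab as a profit matrix and
# solve the assignment problem, pairing each true class with its best predicted cluster.
from munkres import Munkres

cost_matrix = (-crosstab.values).tolist()   # negate counts so maximising agreement = minimising cost
pairs = Munkres().compute(cost_matrix)      # [(true_row, predicted_col), ...]
print pairs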
Example #21
def cluster_weights(links, threshold):
    weights = np.transpose(np.array([links]))

    # Clustering links
    n = len(links)
    MIN_NUM_SAMPLES = 2
    if n > MIN_NUM_SAMPLES:
        # Fit a mixture of Gaussians with EM
        lowest_bic = np.infty
        bic = []
        global best_gmm
        n_components_range = range(2, n)
        cv_types = ['spherical', 'tied', 'diag', 'full']
        for cv_type in cv_types:
            for n_components in n_components_range:
                gmm = mixture.GMM(n_components=n_components,
                                  covariance_type=cv_type)
                gmm.fit(weights)
                # Bayesian information criterion
                bic.append(gmm.bic(weights))
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm

        # Averaging

        ids = best_gmm.predict(weights)
        print(best_gmm.n_components)
        total_weights = []
        unique_ids = list(set(ids))

        for i in unique_ids:
            c_ids = ids == i
            average = np.sum(links[c_ids]) / len(links[c_ids])
            links[c_ids] = average
            total_weights.append(np.sum(links[c_ids]))
        print('averaged : ', links)
        weak_cluster_ids = [
            i for w, i in zip(total_weights, unique_ids) if w < threshold
        ]
        strong_clusters = [
            links[ids == i] for i in unique_ids if i not in weak_cluster_ids
        ]
        """
        combos = []
        for cluster in strong_clusters:
            i = 1
            for e in cluster:
                if e * i < threshold:
                    combos.append(e)
                i += 1
        print('combos', combos)
        possible_combo = 0
        for i in range(0, len(combos) + 1):
            combo = set(combinations(combos, i))
            for c in combo:
                w = np.sum(list(c))
                if possible_combo < w < threshold:
                    possible_combo = w

        weak_clusters = [(total_weights[i], i) for i in weak_cluster_ids]
        for w, j in weak_clusters:
            weighted_combo = w + possible_combo
            if weighted_combo < threshold:
                idx = ids == j
                links[idx] = 0
        """
        print("output", links)
        print("clusters", ids)
        #print("weak", weak_clusters)
        print("strong", strong_clusters)
        return links, ids
    elif n == 2:
        return links, np.array([0, 1])
    else:
        return links, np.zeros(len(links))
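# Illustrative call of cluster_weights (synthetic link weights; values are assumptions):
import numpy as np

links_demo = np.array([0.10, 0.12, 0.11, 0.55, 0.52, 0.95])
averaged, cluster_ids = cluster_weights(links_demo, threshold=0.3)
print(averaged)      # link weights averaged within each GMM cluster
print(cluster_ids)   # cluster id assigned to each link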
Example #22
def process_subfold(sf,fold):
    print("Fold "+str(fold));
    t0 = time.time();

    snoring_dataset = dm.load_ComParE2017(featPath, filetype)  # load dataset
    trainset, develset, testset = dm.split_ComParE2017_simple(snoring_dataset)  # build the training sets used to compute the mean and variance for normalisation
    del snoring_dataset

    # Read dataset size and preallocate
    a=trainset[0][1].shape
    if (filetype == 'npy'):
        nfeat = a[0]
    else:
        nfeat = a[1]

    # Read the features
    trainFeat=np.empty([1,nfeat])
    #for seq in trainset:
    for seq in develset:
        if (filetype == 'npy'):
            feat = seq[1].transpose()
        else:
            feat = seq[1]
        # put all the features into one matrix that will then be passed to gmm.fit to adapt the UBM
        trainFeat = np.vstack((trainFeat, feat))
    trainFeat = np.delete(trainFeat, 0, 0)
    print("DONE!")

    #trainFeat = trainFeat.astype(dtype='float32')


    for m in mixtures:
        # Train the UBM
        print("Fold "+str(fold)+"-->Mixture: "+str(m)+" ");
        sys.stdout.flush();
        gmm = mixture.GMM(n_components=m, n_iter=1000, random_state=1);
        gmm.fit(trainFeat);
        ubmPath = os.path.join(curUbmsPath, str(m));
        if (not os.path.exists(ubmPath)):
            try:#handle the simultaneous creation of folders from multiple processes
                os.makedirs(ubmPath);
            except OSError, e:
                if e.errno != 17:
                    raise   
                else:
                    print "OSError.errno 17 ignored"
                pass
        if (not gmm.converged_):
            print("Fold "+str(fold)+"-->Convergence not reached with " + str(m) +" mixtures");
        joblib.dump(gmm, os.path.join(ubmPath, "ubm_" + str(sf)));         # save the UBM; copies such as ubm_1_02 etc. are kept so they can be reused later for debugging

        # Extract trainset supervectors
        curSupervecSubPath = os.path.join(curSupervecPath, str(m));
        if (not os.path.exists(curSupervecSubPath)):
            try:#handle the simultaneous creation of folders from multiple processes
                os.makedirs(curSupervecSubPath);
            except OSError, e:
                if e.errno != 17:
                    raise   
                else:
                    print "OSError.errno 17 ignored"
                pass
Example #23
# ,[[4,7], [1,2], [3,7], [4,8], [2,7], [3,8], [1,7], [5,6], [2,1], [4,8]]
# ,[[6,8], [3,4], [4,6], [8,6], [4,5], [5,7],[4,1], [4,3], [7,2],[2,1]]];
# unseen = [[[4,1],[1,2],[8,5],[1,2],[1,2], [6,1],[2,8],[3,4]]
# ,[[4,1], [1,2], [4,3], [2,3], [3,1], [1,3], [2,8], [4,7]]
# ,[[6,5], [3,8], [8,5], [4,8], [5,6],[4,6],  [7,6],[2,5]]];

unseen = [[[2, 3], [1, 3], [3, 7], [5, 7], [2, 3], [4, 8], [4, 3], [2, 1],
           [1, 6], [8, 5]],
          [[2, 3], [1, 3], [7, 1], [1, 3], [2, 3], [4, 8], [2, 8], [5, 2],
           [5, 1], [8, 5]],
          [[5, 6], [7, 6], [8, 4], [6, 8], [4, 5], [6, 7], [4, 3], [7, 3],
           [1, 6], [2, 6]]]

# unseen=[]
for i in range(n_class):
    g = mixture.GMM(n_components=1, covariance_type='full')
    g.fit(data[i])
    mean.append(g.means_[0])
    covar.append(g.covars_[0])
    gaudist.append(g)

for i in range(len(unseen)):
    me = []
    s = [0 for _ in range(n_class)]  # use '_' so the outer loop index i is not overwritten (Python 2 list comprehensions leak their variable)
    support = 0
    for j in range(len(unseen[i])):
        me.append(
            np.array(
                (mean[unseen[i][j][0] - 1][j] + mean[unseen[i][j][1] - 1][j]) /
                2))
        s[unseen[i][j][0] - 1] = 1
Example #24
dataset = dataframe.values

"""Split dataset into input(X) and output(Y) variables,
where the first 9 columns are removed since they are identifiers and
population_count0, and the Y (hotel existance) variable is classified
by 1 if it exists in a particular geohash otherwise it is classified by 0
"""

X = dataset[:,10:-3].astype(float)
#X = StandardScaler().fit_transform(X)
Y = [0]*len(X)
for sample in range(len(X)):
	if dataset[sample,-1] > 0:
		Y[sample] += 1

clf = mixture.GMM(n_components=6, covariance_type='full', random_state=7)
clusters = clf.fit(X)
cluster_means = clusters.means_
print(cluster_means)
cluster_predict = clusters.predict(X)

#lons = dataset[:,3]
#lats = dataset[:,2]
#pts = zip(lons,lats)
lons = []
lats = []
for i in range(len(cluster_predict)):
	if cluster_predict[i] == 5:
		lons.append(dataset[i,3])
		lats.append(dataset[i,2])
Example #25
def do_system_training(dataset,
                       model_path,
                       feature_normalizer_path,
                       feature_path,
                       feature_params,
                       classifier_params,
                       dataset_evaluation_mode='folds',
                       classifier_method='gmm',
                       clean_audio_errors=False,
                       overwrite=False):
    """System training

    model container format:

    {
        'normalizer': normalizer class
        'models' :
            {
                'office' : mixture.GMM class
                'home' : mixture.GMM class
                ...
            }
    }

    Parameters
    ----------
    dataset : class
        dataset class

    model_path : str
        path where the models are saved.

    feature_normalizer_path : str
        path where the feature normalizers are saved.

    feature_path : str
        path where the features are saved.

    feature_params : dict
        parameter dict

    classifier_params : dict
        parameter dict

    dataset_evaluation_mode : str ['folds', 'full']
        evaluation mode; with 'full', all available material is treated as belonging to one fold.
        (Default value='folds')

    classifier_method : str ['gmm']
        classifier method, currently only GMM supported
        (Default value='gmm')

    clean_audio_errors : bool
        Remove audio errors from the training data
        (Default value=False)

    overwrite : bool
        overwrite existing models
        (Default value=False)

    Returns
    -------
    nothing

    Raises
    -------
    ValueError
        classifier_method is unknown.

    IOError
        Feature normalizer not found.
        Feature file not found.

    """

    if classifier_method != 'gmm':
        raise ValueError("Unknown classifier method [" + classifier_method +
                         "]")

    # Check that target path exists, create if not
    check_path(model_path)

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        current_model_file = get_model_filename(fold=fold, path=model_path)
        if not os.path.isfile(current_model_file) or overwrite:
            # Load normalizer
            feature_normalizer_filename = get_feature_normalizer_filename(
                fold=fold, path=feature_normalizer_path)
            if os.path.isfile(feature_normalizer_filename):
                normalizer = load_data(feature_normalizer_filename)
            else:
                raise IOError("Feature normalizer not found [%s]" %
                              feature_normalizer_filename)

            # Initialize model container
            model_container = {'normalizer': normalizer, 'models': {}}

            # Collect training examples
            file_count = len(dataset.train(fold))
            data = {}
            for item_id, item in enumerate(dataset.train(fold)):
                progress(title_text='Collecting data',
                         fold=fold,
                         percentage=(float(item_id) / file_count),
                         note=os.path.split(item['file'])[1])

                # Load features
                feature_filename = get_feature_filename(
                    audio_file=item['file'], path=feature_path)
                if os.path.isfile(feature_filename):
                    feature_data = load_data(feature_filename)['feat']
                else:
                    raise IOError("Features not found [%s]" % (item['file']))

                # Scale features
                feature_data = model_container['normalizer'].normalize(
                    feature_data)

                # Audio error removal
                if clean_audio_errors:
                    current_errors = dataset.file_error_meta(item['file'])
                    if current_errors:
                        removal_mask = numpy.ones((feature_data.shape[0]),
                                                  dtype=bool)
                        for error_event in current_errors:
                            onset_frame = int(
                                numpy.floor(
                                    error_event['event_onset'] /
                                    feature_params['hop_length_seconds']))
                            offset_frame = int(
                                numpy.ceil(
                                    error_event['event_offset'] /
                                    feature_params['hop_length_seconds']))
                            if offset_frame > feature_data.shape[0]:
                                offset_frame = feature_data.shape[0]
                            removal_mask[onset_frame:offset_frame] = False
                        feature_data = feature_data[removal_mask, :]

                # Store features per class label
                if item['scene_label'] not in data:
                    data[item['scene_label']] = feature_data
                else:
                    data[item['scene_label']] = numpy.vstack(
                        (data[item['scene_label']], feature_data))

            # Train models for each class
            for label in data:
                progress(title_text='Train models', fold=fold, note=label)
                if classifier_method == 'gmm':
                    model_container['models'][label] = mixture.GMM(
                        **classifier_params).fit(data[label])
                else:
                    raise ValueError("Unknown classifier method [" +
                                     classifier_method + "]")

            # Save models
            save_data(current_model_file, model_container)
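# Illustrative parameter dicts for calling do_system_training above (values are
# assumptions, not from the source):
feature_params = {'hop_length_seconds': 0.02}           # only hop_length_seconds is used here,
                                                        # to map audio-error times onto frames
classifier_params = {'n_components': 16,
                     'covariance_type': 'diag'}         # forwarded to mixture.GMM(**classifier_params)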
Example #26
def cluster_and_display_chemical_features(site_number, type,
                                          chemical_features_list):
    def optimize_n(type, positions_np, n_data):

        print("cluster_and_display_chemical_features.optimize_n called " \
               "with n_data = ", n_data)

        bic = {}
        for n in [x + 1 for x in range(10)]:
            if n < n_data:
                gmm = mixture.GMM(n_components=n,
                                  covariance_type='spherical',
                                  n_iter=20)
                gmm.fit(positions_np)
                score = sum(gmm.score(positions_np))
                lambda_c = 15
                if type == 'Aromatic':
                    lambda_c = 20
                bic_l = score - lambda_c * 0.5 * math.log(n_data) * n
                bic[n] = bic_l

        if len(bic) > 1:
            key, value = max(iter(bic.items()), key=lambda x: x[1])
            return key
        else:
            return 1

    def analyse_bic(type, positions_np, n_data):

        for n in [x + 1 for x in range(14)]:
            gmm = mixture.GMM(n_components=n,
                              covariance_type='spherical',
                              n_iter=20)
            gmm.fit(positions_np)
            score = sum(gmm.score(positions_np))
            lambda_c = 3
            if type == 'Aromatic':
                lambda_c = 3000
            bic = score - lambda_c * 0.5 * n_data * n
            print(type, len(positions_np), n, "converged?", gmm.converged_,
                  "score:", score, "bic", bic)

    def get_cfc_col(type):
        if type == "Donor":
            return "blue"
        if type == "Acceptor":
            return "red"
        if type == "Hydrophobe":
            return "yellow"
        if type == "Aromatic":
            return "orange"
        return "grey"

    # --- main line ----

    # no fake points
    # positions_np = np.array([item[0] for item in chemical_features_list])

    ext_chemical_features_list = [item[0] for item in chemical_features_list]

    for item_b in chemical_features_list:
        delta = 0.25
        item = item_b[0]
        p1 = [item[0], item[1], item[2] + delta]
        p2 = [item[0], item[1], item[2] - delta]
        p3 = [item[0], item[1] + delta, item[2]]
        p4 = [item[0], item[1] - delta, item[2]]
        p5 = [item[0] + delta, item[1], item[2]]
        p6 = [item[0] - delta, item[1], item[2]]
        ext_chemical_features_list.append(p1)
        ext_chemical_features_list.append(p2)
        ext_chemical_features_list.append(p3)
        ext_chemical_features_list.append(p4)
        ext_chemical_features_list.append(p5)
        ext_chemical_features_list.append(p6)

    positions_np = np.array(ext_chemical_features_list)

    # analyse_bic(type, positions_np, len(chemical_features_list))

    n_data = len(chemical_features_list)
    n = 1
    if n_data > 1:
        n = optimize_n(type, positions_np, n_data)

    if n <= len(chemical_features_list):
        gmm = mixture.GMM(n_components=n,
                          covariance_type='spherical',
                          n_iter=20)
        gmm.fit(positions_np)
        print(type, len(positions_np), n, "converged? ", gmm.converged_,
              "score:", sum(gmm.score(positions_np)))

        cluster_assignments = gmm.predict(positions_np)

        features = []
        for i, cf in enumerate(chemical_features_list):
            # print "     ", cf, cluster_assignments[i]
            features.append([cf, int(cluster_assignments[i])])

        means = gmm.means_
        means_as_list = [[x[0], x[1], x[2]] for x in means]

        obj_name = "CFC Site " + str(
            site_number) + " " + type + " pharmacophore-clusters"
        cfc_obj = coot.new_generic_object_number(obj_name)
        cfc_col = get_cfc_col(type)
        for mean in means_as_list:
            # coot.to_generic_object_add_dodecahedron(cfc_obj, cfc_col, 0.2, mean[0], mean[1], mean[2])
            coot.to_generic_object_add_pentakis_dodecahedron(
                cfc_obj, cfc_col, 2.3, 0.1, mean[0], mean[1], mean[2])
        coot.set_display_generic_object(cfc_obj, 1)

        return [type, features, means_as_list]

    # oops too many parameters for the model
    return False
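# Hedged aside: the loop above pads each feature position with six copies
# offset by +/-delta along the axes, so that a spherical GMM can still be
# fitted when a cluster holds only one or two real features. The same
# expansion with numpy broadcasting (names here are illustrative, not part of
# coot):
def jitter_positions(positions, delta=0.25):
    import numpy as np
    positions = np.asarray(positions, dtype=float)                 # shape (n, 3)
    offsets = np.vstack([np.eye(3) * delta, -np.eye(3) * delta])   # 6 axis offsets
    fakes = (positions[:, None, :] + offsets[None, :, :]).reshape(-1, 3)
    return np.vstack([positions, fakes])                           # n + 6n points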
Example #27
n_samples = 500

# Generate random sample, two components
np.random.seed(0)
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
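# Hedged aside: the BIC penalty compared above grows with the number of free
# parameters, which differs between covariance types. For k components in d
# dimensions the counts are (illustrative helper only, not part of the
# original example):
def gmm_n_parameters(k, d, cv_type):
    cov_params = {'spherical': k,
                  'diag': k * d,
                  'tied': d * (d + 1) // 2,
                  'full': k * d * (d + 1) // 2}[cv_type]
    return (k - 1) + k * d + cov_params  # weights + means + covariance terms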

bic = np.array(bic)
color_iter = itertools.cycle(
    ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
Example #28
    def find_the_sites(self, file_name_comp_id_list):

        # main line
        #
        coords_with_spec = []

        for fn_comp_id in file_name_comp_id_list:
            fn = fn_comp_id[0]
            comp_id = fn_comp_id[1]
            imol = coot.handle_read_draw_molecule_with_recentre(fn, 0)
            # what are the residue specs for the given comp_ids?
            residue_specs = coot.get_residue_specs_in_mol_py(imol, comp_id)
            print(fn, residue_specs)

            for spec in residue_specs:
                # centre = residue_centre_from_spec_py(imol, spec)
                chain_id = rsu.residue_spec_to_chain_id(spec)
                res_no = rsu.residue_spec_to_res_no(spec)
                ins_code = ''

                res_info = coot.residue_info_py(imol, chain_id, res_no,
                                                ins_code)

                for atom in res_info:
                    coords_with_spec.append(
                        [rsu.residue_atom_to_position(atom), imol, spec])

        # print coords_with_spec

        # now cluster the coords; there will usually be 1 site, maybe 2, possibly 3

        if len(coords_with_spec) < 3:

            return False

        else:

            coords = [x[0] for x in coords_with_spec]
            positions_np = np.array(coords)
            n_components = self.optimize_n(positions_np, len(positions_np))
            print("optimize_n for sites::::::::::::", n_components)
            dpgmm = mixture.GMM(n_components,
                                covariance_type='full',
                                n_iter=40)
            dpgmm.fit(positions_np)
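            # Hedged aside: the variable is named dpgmm but a plain mixture.GMM
            # is fitted above. In current scikit-learn a Dirichlet-process-like
            # mixture would be fitted with BayesianGaussianMixture; sketch only,
            # not executed here:
            #
            #   from sklearn.mixture import BayesianGaussianMixture
            #   dpgmm = BayesianGaussianMixture(
            #       n_components=n_components, covariance_type='full',
            #       max_iter=40,
            #       weight_concentration_prior_type='dirichlet_process'
            #   ).fit(positions_np)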

            cluster_assignments = dpgmm.predict(positions_np)
            means = dpgmm.means_
            weights = dpgmm.weights_

            print(cluster_assignments)
            print(means)
            print(weights)

            print("cluster_assignments", cluster_assignments)

            merge_map = self.find_mergeable_clusters(means, weights)
            # which key (i.e. cluster index) has the largest number of other
            # clusters that can be merged into it?
            #
            # convert to a list of plain ints (not numpy.int64): when the object
            # is decoded on the C++ side a PyInt_Check is done on site_idx, and
            # a numpy.int64 fails that test
            #
            new_cluster_assignments = [
                int(x)
                for x in self.merge_clusters(cluster_assignments, merge_map)
            ]
            print("new cluster_assignments", new_cluster_assignments)

            specs = [x[1:] for x in coords_with_spec]
            cluster_assignments_with_specs = zip(new_cluster_assignments,
                                                 specs)

            sites = coot.chemical_feature_clusters_accept_site_clusters_info_py(
                cluster_assignments_with_specs)

            # show me them
            if True:  # debug
                o = coot.new_generic_object_number("site clusters")
                for mean in means:
                    cluster_star_obj(o, mean, 2, 2)
                # coot.set_display_generic_object(o, 1) this is for debugging

            self.sites = sites
Example #29
def DPF_distrib_histogram(ax, x):
    """
    Parameters
    ax: contains the reference to the plot
    x: array containing all the pits DPF values in the considered areal
    """
    
    X  = x.reshape(-1, 1)
    lowest_bic = np.inf
    bic = []
    # Find the mixture with the lowest BIC (Bayesian information criterion)
    for n_components in range(1,3):
        # Fit a mixture of Gaussians with EM
        gmm = mixture.GMM(n_components=n_components)
        gmm.fit(X) # train it!
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
            print ("Best gmm with components "+str(n_components)+" and BIC "+
                   str(bic[-1]))
    gmm = best_gmm

    COLOR, label, alpha, NORMED = 'blue', 'pits', 1, False
    number_bins = int(x.shape[0]/16)
    if number_bins <= 0:
        number_bins = 1
    print(number_bins)
    a,b = min(x), max(x)
    ax2 = ax.twinx()
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.xaxis.set_ticks_position('none')
    ax2.yaxis.set_ticks_position('none')
    ax2.spines['left'].set_visible(False)
    for ylabel_i in ax2.get_yticklabels():
        ylabel_i.set_visible(False)
        ylabel_i.set_fontsize(0.0)

    n, bins, patches = ax.hist(X, number_bins, facecolor=COLOR, alpha=alpha,
                               range=(a,b), label=label+": "+str(x.shape[0]),
                               normed=NORMED)
    x_bins=(bins[1:]+bins[:-1])/2
    
    linspace = np.linspace(-2, 2, 1000).reshape(-1, 1)  
    ax2.plot(linspace, np.exp(gmm.score_samples(linspace)[0]), 'r')
    mu1 = gmm.means_[0]
    std1 = np.sqrt(gmm.covars_[0])

    if gmm.n_components == 1:
        threshold = mu1-2*std1
    elif gmm.n_components == 2:
        mu2 = gmm.means_[1]
        std2 = np.sqrt(gmm.covars_[1])
        A2 = gmm.weights_[1]
        A1 = gmm.weights_[0]
        x_samples = [mu2+(mu1-mu2)/250.0*p for p in range(250)]
        gauss1 = gauss(x_samples, mu1, std1, 1)
        gauss2 = gauss(x_samples, mu2, std2, 1)
        threshold = x_samples[np.argmin(np.abs(gauss1-gauss2))]

    if gmm.n_components == 1:
        ax.plot(np.repeat(threshold,200), np.linspace(min(n), max(n), num=200),
                color='magenta', lw=3, label = 'threshold')
    elif gmm.n_components == 2:
        ax.plot(np.repeat(threshold,200), np.linspace(min(n), max(n), num=200),
                color='green', lw=3, label = 'threshold')
    ax.set_xlim([-2,2])
    return threshold
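# DPF_distrib_histogram calls a gauss() helper that is not shown in this
# snippet. A hypothetical stand-in consistent with how it is used above
# (an amplitude-A Gaussian evaluated over a list of x values; assumes numpy is
# imported as np, as elsewhere in these examples):
def gauss(x_samples, mu, sigma, A):
    # hypothetical implementation, not the original helper
    x_samples = np.asarray(x_samples, dtype=float)
    return A * np.exp(-(x_samples - mu) ** 2 / (2.0 * sigma ** 2))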
Example #30
max_i = 20.5

len_test = len(test)
test_below = len(test[test[:, 0] < min_i])
test_above = len(test[test[:, 0] > max_i])

if not CACHED:
    for n in range(1, 60):  # a GMM needs at least one component
        print("testing Gaussian with components:", n)
        score = 0.
        for bin in np.arange(min_i, max_i, diff):
            print "processing bin", bin
            train_bin = train[train[:, 0] > bin]
            train_bin = train_bin[train_bin[:, 0] < (bin + diff)]

            g = mixture.GMM(n_components=n, covariance_type='full')
            g.fit(train_bin[:, 1:])

            val_bin = val[val[:, 0] > bin]
            val_bin = val_bin[val_bin[:, 0] < (bin + diff)]
            score += np.sum(g.score(val_bin[:, 1:]))

        print "score", score
        if score > max_score:
            max_score = score
            max_n = n

    print "best n is:", max_n

max_n = 25
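# A hedged, self-contained sketch of the selection idea in this example: pick
# the component count that maximises the held-out log-likelihood summed over
# intensity bins, using the current GaussianMixture API. The data names are
# assumptions, not the original arrays (rows are [intensity, features...]):
import numpy as np
from sklearn import mixture

def select_n_components(train, val, min_i, max_i, diff, max_components=10):
    best_n, best_score = 1, -np.inf
    for n in range(1, max_components + 1):
        score = 0.0
        for lo in np.arange(min_i, max_i, diff):
            t = (train[:, 0] > lo) & (train[:, 0] < lo + diff)
            v = (val[:, 0] > lo) & (val[:, 0] < lo + diff)
            g = mixture.GaussianMixture(n_components=n, covariance_type='full')
            g.fit(train[t, 1:])
            # score() returns the mean log-likelihood; multiply by the bin size
            # to get the total held-out log-likelihood for this bin
            score += g.score(val[v, 1:]) * v.sum()
        if score > best_score:
            best_score, best_n = score, n
    return best_n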