Example No. 1
def Get_row_similarity(idx):
    row = mat_contents['Euclid_matrix'][:, idx]
    #row = np.square(row)

    min_covariance = np.min(np.abs(np.diff(row))) / 100

    bic_vals = np.array([])
    prediction_table = {}
    bic_table = {}
    for m in range(2, 11):
        gmm = GMM(n_components=m,
                  covariance_type='diag',
                  init_params='wc',
                  min_covar=min_covariance)
        gmm.fit(row)
        cluster_mean = gmm.means_

        sorted_indices = np.argsort(
            cluster_mean, axis=None)  # get index of 2 smallest mean value
        bic_rating = round(gmm.bic(row), 2)
        prediction_results = gmm.predict(row)

        if cluster_mean.shape[0] != np.unique(prediction_results).shape[0]:
            continue

            #print 'shape : ' , np.unique(prediction_results).shape[0]
            #if np.unique(prediction_results).shape[0] == 1: continue
            #if np.unique(prediction_results).shape[0] > 1:
            #	print 'Mis-match break : ' , cluster_mean.shape[0], np.unique(prediction_results).shape[0]
            #	break

        prediction_table[m] = [
            prediction_results, sorted_indices, cluster_mean
        ]
        bic_vals = np.append(bic_vals, bic_rating)
        bic_table[bic_rating] = m

    min_bic_vals = np.min(bic_vals)
    assignment = prediction_table[bic_table[min_bic_vals]][0]
    two_smallest_indices = prediction_table[bic_table[min_bic_vals]][1]
    all_means = prediction_table[bic_table[min_bic_vals]][2]
    unique_assignments = np.unique(assignment)

    cluster_1 = row[assignment == two_smallest_indices[0]]
    cluster_2 = row[assignment == two_smallest_indices[1]]
    if len(two_smallest_indices) > 2:
        cluster_3 = row[assignment == two_smallest_indices[2]]

    #	Make sure cluster_1 is not too small
    if len(two_smallest_indices) > 2:
        print str(idx) + ' cluster merged due small cluster 1'
        if len(cluster_1) < 10:
            cluster_1 = np.append(cluster_1, cluster_2)
            cluster_2 = cluster_3

    #	Make sure cluster_1 and 2 are not too close
    mean_1 = np.mean(cluster_1)
    mean_2 = np.mean(cluster_2)
    std_1 = np.std(cluster_1)

    #print 'mean 1 : ' , mean_1
    #print 'mean 2 : ' , mean_2
    #print 'top : ' , mean_1 + std_1/100.0
    #print 'bottom : ' , mean_1 - std_1/100.0
    if ((mean_2 < mean_1 + std_1) and (mean_2 > mean_1 - std_1)):
        print str(idx) + ' cluster merged due to mean proximity'
        total_cluster = np.sort(np.append(cluster_1, cluster_2))
        cluster_1 = total_cluster[0:len(total_cluster) - 4]
        cluster_2 = total_cluster[-int(np.floor(len(total_cluster) /
                                                3.0)):len(total_cluster)]

    cluster_1 = np.sort(cluster_1)
    cluster_2 = np.sort(cluster_2)

    cluster_len = len(cluster_1)
    first_half_len = int(np.ceil(cluster_len / 2.0))
    cluster_2_half_len = int(np.floor(len(cluster_2) / 2))
    second_half_len = cluster_len - first_half_len + len(cluster_2)
    first_half = cluster_1[0:first_half_len]
    two_half = cluster_2[0:cluster_2_half_len]
    second_rest = cluster_1[first_half_len:cluster_len]

    X = np.expand_dims(
        np.append(np.append(np.append(cluster_1, two_half), second_rest),
                  cluster_2), 0)
    Y = np.transpose(
        np.append(np.ones(cluster_len + cluster_2_half_len),
                  np.zeros(second_half_len)))

    #	print 'Cluster 1 mean : ', np.mean(cluster_1)
    #	print 'Cluster 2 mean : ', np.mean(cluster_2)
    #	print 'Cluster 1 :', cluster_1
    #	print '\n'
    #	print 'Cluster 2 :', cluster_2
    #	print int(cluster_len - 10)
    #	print 'cluster len : ' , cluster_1.shape
    #	print '\n'
    #	print 'first_half_len : ' , first_half_len
    #	print '\n'
    #	print second_rest
    #	print '2nd rest : ' , second_rest.shape
    #	print '\n'
    #	print 'Cluster 2: ', cluster_2
    #	print '\n'
    #	print 'Cluster 2 half: ', two_half
    #	print X.shape
    #	print X
    #	print Y.shape
    #	print Y

    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(np.transpose(X), Y)
    prob_matrix = logreg.predict_proba(np.expand_dims(row, 1))

    row_similarity = np.expand_dims(prob_matrix[:, 1], 0)
    #print prob_matrix

    lower_range = np.min(row)
    upper_range = np.max(row)
    increment = (upper_range - lower_range) / 20.0

    logistic_range = np.arange(lower_range, upper_range, increment)
    logistic_range = np.expand_dims(logistic_range, 1)
    sigmoid = logreg.predict_proba(logistic_range)
    sigmoid = sigmoid[:, 1]

    all_means = np.transpose(np.sort(all_means, 0))
    print str(idx) + ' all means : ', all_means
    f, (ax1, ax2) = plt.subplots(2, 1, sharey=False)
    ax1.set_title('Histogram and the likelihood drop')
    histVals = ax1.hist(row, 80)
    max_hist = np.max(histVals[0])
    ax1.plot(logistic_range, max_hist * sigmoid)
    ax1.plot([np.median(row), np.median(row)], [0, max_hist])
    ax1.text(increment, max_hist - 2, all_means)
    #ax2.plot(row_similarity)
    ax2.set_title('BIC model selection')
    ax2.plot(range(2, bic_vals.shape[0] + 2), bic_vals)
    #plt.show()
    plt.savefig('histogram_graphs/output_' + str(idx) + '.png')
    plt.close(f)

    return row_similarity
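A minimal sketch of how this helper might be driven, assuming the distance matrix was saved from MATLAB under the key 'Euclid_matrix' (the file name below is hypothetical) and that the snippet's other imports (GMM, linear_model, matplotlib) are already in scope:

import numpy as np
import scipy.io

# Hypothetical driver: load the .mat file the function reads from and score every column.
mat_contents = scipy.io.loadmat('euclid_matrix.mat')  # must expose the key 'Euclid_matrix'
n_cols = mat_contents['Euclid_matrix'].shape[1]
similarity_matrix = np.vstack([Get_row_similarity(i) for i in range(n_cols)])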
Example No. 2
    def fit_new(self, x, label):
        self.y.append(label)
        gmm = GMM(self.gmm_order)
        gmm.fit(x)
        self.gmms.append(gmm)
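fit_new stores one GMM per enrolled label; the matching prediction step is not shown here, so the companion method below is only a sketch (it assumes numpy is available as np, as elsewhere in these examples):

    def predict_label(self, x):
        # Sketch only: score x under every stored per-label GMM and return the
        # label whose model gives the highest (mean) log-likelihood.
        scores = [np.mean(gmm.score(x)) for gmm in self.gmms]
        return self.y[int(np.argmax(scores))]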
Example No. 3
    def trainModelFV_LOOCV_Classifiers(self, extension='*.txt'):
        """
        This method contains the entire module
        required for training the bag of visual words model.
        Use of helper functions will be extensive.
        """

        print('trainModelFV_LOOCV_Classifiers')
        names = ["Linear SVM"]
        classifiers = [SVC(kernel='linear')]

        self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile(
            self.label_path)

        # read file. prepare file lists.
        self.images, self.trainImageCount = self.file_helper.getFilesFromDirectory(
            self.base_path, self.datasets, extension)

        self.parameters += 'Classifier Parameters\n'
        self.parameters += '%s' % self.classifier_helper.clf

        features_nd = np.asarray(self.images)
        #features_nd.sort(axis=0)
        loo = LeaveOneOut()
        predictions = {}
        p = {}
        l = []
        hits = {}
        for name in names:
            predictions[name] = []
            p[name] = []
            hits[name] = 0

        c = 0
        for train, test in loo.split(features_nd):
            feature_test_file = str(features_nd[test][0][0])
            class_name_test = feature_test_file.split(os.sep)[-2]
            c += 1
            currenInvDate = datetime.datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S")
            print('Step: %i/%i - %s - %s' %
                  (c, features_nd.shape[0], currenInvDate, feature_test_file))
            # if c == 1 or c % 25 == 0:
            #    self.mail_helper.sendMail("Progress: %s - %s" % (self.test_name, self.OsName), "Samples processed: %i" % c)

            self.descriptor_list = []
            self.train_labels = []
            for feature in features_nd[train]:
                feature = feature[0]
                label_number = self.number_dict[feature.split(os.sep)[-2]]
                self.train_labels = np.append(self.train_labels, label_number)
                des = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list.append(des)

            # format data as nd array
            self.classifier_helper.formatND(self.descriptor_list)

            gmm = GMM(n_components=self.no_clusters, covariance_type='diag')
            gmm.fit(self.classifier_helper.descriptor_vstack)

            fv_dim = self.no_clusters + 2 * self.no_clusters * self.classifier_helper.descriptor_vstack.shape[
                1]
            print(fv_dim)
            n_videos = train.shape[0]
            features = np.array([np.zeros(fv_dim) for i in range(n_videos)])
            count = 0
            for i in range(n_videos):
                len_video = len(self.descriptor_list[i])
                fv = fisher_vector(
                    self.classifier_helper.descriptor_vstack[count:count +
                                                             len_video], gmm)
                features[i] = fv
                count += len_video

            print(features.shape)
            print('Data normalization.')
            scaler = StandardScaler()
            # train normalization
            features = scaler.fit_transform(features)
            features = power_normalize(features, 0.5)
            features = L2_normalize(features)

            # real label
            l.extend([self.number_dict[feature_test_file.split(os.sep)[-2]]])

            # test features
            feature_test = self.file_helper.loadFeaturesFromFile(
                feature_test_file)
            test_fv = fisher_vector(feature_test, gmm)
            # train normalization
            test_fv = test_fv.reshape(1, -1)
            test_fv = scaler.transform(test_fv)
            test_fv = power_normalize(test_fv, 0.5)
            test_fv = L2_normalize(test_fv)

            # train classifiers
            for name, clf in zip(names, classifiers):
                print(name)
                clf.fit(features, self.train_labels)
                cl = int(clf.predict(test_fv)[0])
                class_name_predict = self.name_dict[str(cl)]
                if class_name_test == class_name_predict:
                    hits[name] += 1

                # predicted label
                p[name].extend([cl])
                predictions[name].append({
                    'image': feature_test_file,
                    'class': cl,
                    'object_name': self.name_dict[str(cl)]
                })
            msg_progress = ''
            for name in names:
                msg_progress += 'Classifier: %s - Hits:%i/%i - Accuracy: %.4f\n' % (
                    name.ljust(20), hits[name], c, hits[name] / c)

            print(msg_progress)
            print('\n\n')
            if c == 1 or c % 25 == 0:
                self.mail_helper.sendMail(
                    "Progress: %s - %s" % (self.test_name, self.OsName),
                    msg_progress)

        for name in names:
            print(name)
            self.saveResults(predictions[name],
                             p[name],
                             l,
                             features_nd.shape[0],
                             classifier_name=name)
    def __do_perform(self, custom_out=None, main_experiment=None):
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out
            self._out = custom_out
        elif self._old_out is not None:
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(
                self.experiment_name(), main_experiment.experiment_name()))
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        sil_s = np.empty(shape=(2 * len(self._clusters) *
                                self._details.ds.training_x.shape[0], 4),
                         dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(self._details.ds.training_x)
            gmm.fit(self._details.ds.training_x)

            km_labels = km.predict(self._details.ds.training_x)
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(self._details.ds.training_x,
                                         km_labels)
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(self._details.ds.training_x,
                                         km_labels)
            gmm_sil_samples = sil_samples(self._details.ds.training_x,
                                          gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            sse[k] = [km.score(self._details.ds.training_x)]
            ll[k] = [gmm.score(self._details.ds.training_x)]
            bic[k] = [gmm.bic(self._details.ds.training_x)]

            acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y,
                                           km_labels)
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y,
                                        gmm_labels)

            adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = [
            '{} log-likelihood'.format(self._details.ds_readable_name)
        ]

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score',
                                             'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        sse.to_csv(self._out.format('{}_sse.csv'.format(
            self._details.ds_name)))
        ll.to_csv(
            self._out.format('{}_logliklihood.csv'.format(
                self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(
            self._details.ds_name)))
        sil.to_csv(
            self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(
            self._out.format('{}_sil_samples.csv'.format(
                self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(
            self._details.ds_name)))
        adj_mi.to_csv(
            self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        grid = {
            'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline([('km', km), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_kmeans.csv'.format(
                self._details.ds_name)))

        grid = {
            'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_GMM.csv'.format(
                self._details.ds_name)))

        # %% For chart 4/5
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(np.hstack(
            (self._details.ds.training_x2D,
             np.atleast_2d(self._details.ds.training_y).T)),
                             columns=['x', 'y', 'target'])
        ds_2d.to_csv(
            self._out.format('{}_2D.csv'.format(self._details.ds_name)))
        self.log("Done")
Example No. 5
def gmm(nclusters, coords, n_init=50, n_iter=500):
    est = GMM(n_components=nclusters, n_init=n_init, n_iter=n_iter)
    est.fit(coords)
    return Partition(est.predict(coords))
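sklearn.mixture.GMM was deprecated in scikit-learn 0.18 and removed in 0.20; a rough equivalent of the wrapper above for newer releases (keeping the author's Partition wrapper, whose import is assumed) could look like this:

def gmm_new_api(nclusters, coords, n_init=50, max_iter=500):
    # Same idea with the replacement estimator; note that n_iter became max_iter.
    from sklearn.mixture import GaussianMixture
    est = GaussianMixture(n_components=nclusters, n_init=n_init, max_iter=max_iter)
    est.fit(coords)
    return Partition(est.predict(coords))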
Example No. 6

# CWLM MODEL TRAINING
K = 3
cwlm = CWLM(n_components=K, eta=7, plot=True, n_init=10, tol=1e-10, 
            init_params='gmm', smoothing=False)
cwlm.fit(X, y)
mu_est = cwlm.means_
mu_ext = np.concatenate((np.ones((K, 1)), mu_est), axis=1)
W_clust = cwlm.reg_weights_
y_, score = cwlm.predict_score(X_tst, y_tst)

print('\nR2 score = {}'.format(score))

# GMM TRAINING
gmm = GMM(n_components=K)
gmm.fit(X)

# Depict the likelihoods on Y for the clusterwise linear model
rx = np.ones((100, 2))
r = np.linspace(X.min(), X.max(), 100)
rx[:, 1] = r
c = ['r', 'g', 'b']

fig = plt.figure(1)
y_est = []


# Depict all clustering approaches for clusterwise linear model

fig = plt.figure(2)
Example No. 7
print "Raw ", rf.feature_importances_
print "Normed ", rf_norm.feature_importances_
print "Extra ", rf_extra.feature_importances_
"""

#-----------------------------------------------------------------------

###############################
### Gaussian Mixture Models ###
###############################

#Initialize index at which we want to split training and cross-validation data sets
i_divide = 10000

#Initialize GMM object
gmm_2 = GMM(n_components=2, covariance_type='full')
gmm_3 = GMM(n_components=4, covariance_type='full')
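#NOTE: this fits 4 components even though the variable name and the "3 models" labels below say 3.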

#Fit the data
gmm_2.fit(X_norm[:i_divide])
gmm_3.fit(X_norm[:i_divide])

#Print means of the fit
print headers[headers != 'BSN']
print "2 models ", gmm_2.means_
print "3 models ", gmm_3.means_

#Print BIC and score
print "2 models ", gmm_2.bic(X_norm[i_divide:])
print "3 models ", gmm_3.bic(X_norm[i_divide:])
Example No. 8
#patch: <class 'numpy.ndarray'>; (16020,); need to reshape it to (16020,1)
#Create ndarray of the data from above lists
patch = np.stack(patch, axis=0).reshape(-1, 1)
x = np.stack(x, axis=0).reshape(-1, 1)
y = np.stack(y, axis=0).reshape(-1, 1)
d = np.stack(d, axis=0).reshape(-1, 1)
sad = np.stack(sad, axis=0).reshape(-1, 1)
blm = np.stack(blm, axis=0).reshape(-1, 1)
sbm = np.stack(sbm, axis=0).reshape(-1, 1)
gcs = np.stack(gcs, axis=0).reshape(-1, 1)

gmm = GMM(
    n_components=3,
    max_iter=100000,
    tol=1e-10,
    covariance_type='full',
    random_state=50,
).fit(patch)

#label: <class 'numpy.ndarray'>; (16020,); need to reshape it to (16020,1)
label = gmm.predict(patch.reshape(-1, 1))
label = label.reshape(-1, 1)

pat_lab = np.column_stack((x, y, d, patch, label, sad, blm, sbm, gcs))

print(type(pat_lab))
print(pat_lab.shape)

np.save('Patch9_Lab', pat_lab)
np.savetxt('Patch_Labs.csv', pat_lab, delimiter=',')
Example No. 9
    def Fit(self, Mask):
        optionDict = self.opts.__dict__
        self.toFit = GMM(**optionDict)
        return self.toFit.fit(self.MaskToMatrix(Mask))
Example No. 10
    res = np.array(res)
    res = res[idxs]
    images, residuals = get_images(res)
    values = latents
elif clusteron == "embed":
    values = np.array([embed[0], embed[1]]).T
    #idxs = embed[3]
    #idxs = [int(idx) for idx in idxs]
    #reals = reals[idxs]
    #residuals = residuals[idx]
    #images, residuals = get_images(res)
    print(values.shape)

#n_components = 2
print("Making GMM")
gmm = GMM(n_components=n_components).fit(values)
print("Predicting")
labels = gmm.predict(values)
#plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')
plt.scatter(embed[0], embed[1], c=labels, s=10, cmap='viridis')
plt.savefig(plot_fn)


def make_batch(arr, save_fn, n=128):
    arr = np.array(arr)
    ims = arr[:n]
    ims.reshape((-1, 96, 96, NBANDS))
    saver.save_images(ims, save_fn)


print("writing clusters")
Example No. 11
covar_type = 'full'  # you can try out 'diag' as well
reps = 15  # number of fits with different initializations, best result will be kept

# Allocate variables
BIC = np.zeros((T, 1))
AIC = np.zeros((T, 1))
CVE = np.zeros((T, 1))

# K-fold crossvalidation
CV = cross_validation.KFold(N, 10, shuffle=True)

for t, K in enumerate(KRange):
    print('Fitting model for K={0}\n'.format(K))
    # Fit Gaussian mixture model
    gmm = GMM(n_components=K,
              covariance_type=covar_type,
              n_init=reps,
              params='wmc').fit(X)
    # Get BIC and AIC
    BIC[t, 0] = gmm.bic(X)
    AIC[t, 0] = gmm.aic(X)
    cds = gmm.means_  # extract cluster centroids (means of gaussians)
    print(cds)
    cls = gmm.predict(X)  # extract cluster labels
    print(cls)
    # For each crossvalidation fold
    for train_index, test_index in CV:
        # extract training and test set for current CV fold
        X_train = X[train_index]
        X_test = X[test_index]
        # Fit Gaussian mixture model to X_train
        gmm = GMM(n_components=K,
Example No. 12
    # plt.ylabel('WCSS')
    # plt.rc('xtick', labelsize=10)
    # plt.rc('ytick', labelsize=10)
    # plt.savefig(os.getcwd() + '/plots/' + str(n_latent)+"d_"+ 'k-means_WCSS_clusters' +'.png')
    # plt.close()

    # Running and plotting k-means clustering with different cluster numbers
    kmeans_plot(par[:, 1:], X, 2)
    kmeans_plot(par[:, 1:], X, 3)
    kmeans_plot(par[:, 1:], X, 4)

    #GMM cluster - similar method as k-means for clustering, assigning labels, ordering labels and saving to lists.
    n_components = np.arange(1, 11, 1)
    cov_type = 'diag'
    models = [
        GMM(n, covariance_type=cov_type, random_state=0).fit(X)
        for n in n_components
    ]

    aic_list.append([m.aic(X) for m in models])
    bic_list.append([m.bic(X) for m in models])
    # fig = plt.figure(161, figsize=(12,10))
    # plt.plot(n_components, [m.bic(X) for m in models], label='BIC')
    # plt.plot(n_components, [m.aic(X) for m in models], label='AIC')
    # plt.legend(loc='best')
    # plt.xlabel('Number of Components');
    # plt.rc('xtick', labelsize=10)
    # plt.rc('ytick', labelsize=10)
    # fig.suptitle('GMM BIC/AIC values', fontsize=16)
    # plt.savefig(os.getcwd() + '/plots/' + str(n_latent)+"d_"+'gmm_bic-aic_'+cov_type+'.png')
    # plt.close()
Example No. 13
# Break up the dataset into non-overlapping training (75%) and testing (25%) sets.
skf = StratifiedKFold(iris.target, n_folds=4)
# Only take the first fold.
train_index, test_index = next(iter(skf))

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((covar_type,
                    GMM(n_components=n_classes,
                        covariance_type=covar_type,
                        init_params='wc',
                        n_iter=20))
                   for covar_type in ['spherical', 'diag', 'tied', 'full'])

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * n_classifiers / 2, 6))
plt.subplots_adjust(bottom=.01,
                    top=0.95,
                    hspace=.15,
                    wspace=.05,
                    left=.01,
                    right=.99)

for index, (name, classifier) in enumerate(classifiers.items()):
    # Since we have class labels for the training data, we can
rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))

kmeans = KMeans(n_clusters=4, random_state=0)
plot_kmeans(kmeans, X_stretched)
plt.show()

print("#---------------------------------------#")
print(" Generalizing E-M: Gaussian mixture models")
print("    expectation-maximization approach    ")
print("#---------------------------------------#")
print("\n")

from sklearn.mixture import GMM
gmm = GMM(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap="viridis")
plt.show()

probs = gmm.predict_proba(X)
print(probs[:5].round(3))

from matplotlib.patches import Ellipse


def draw_ellipse(position, covariance, ax=None, **kwargs):
    '''
    Draw an ellipse with a given position and covariance.
    '''
    ax = ax or plt.gca()
Example No. 15
def new_component_weighted_EM(comp_list, X, W, num_iter=5,
                              debug_ax=None,
                              clamp_existing_params=True):
    """
    Given a comp_list, add a new component, fit with weighted em
    """
    if True:
        # first component LLs go unchanged
        lnpdf_C, sample_C, lnpdf_Cplus, init_params = \
               components.make_new_component_mixture_lnpdf(comp_list)
        ll_q0 = lnpdf_C(X)
        D = X.shape[1]

        # resample X's
        zs = np.random.choice(len(W), size=len(W), p=W/len(W))
        X  = X[zs,:]

        # initialize the new component --- this mean may be too conservative
        init_mu    = np.mean(X, 0) #np.sum(X*W[:,None], 0) / np.sum(W)
        init_var   = np.var(X, 0)  #
        #init_var   = np.sum((X-init_mu)**2 * W[:,None], 0) / np.sum(W)
        ##init_mu = sample_C(1000).mean(0)
        #init_var = np.var(sample_C(1000), 0)

        init_lnstd = .5*np.log(init_var)
        lam = np.concatenate([init_mu, init_lnstd])
        #lam = comp_list[-1][1]
        #lam[:D] = X[np.argmax(W)]
        rho = .1

        # em iterations
        for i in xrange(num_iter):

            if debug_ax is not None:
                ii, jj = DEBUG_I, DEBUG_J
                debug_ax.scatter(lam[:D][ii], lam[:D][jj], s=100, c='red')
                debug_ax.text(lam[:D][ii], lam[:D][jj], "iter %d"%i, fontsize=14)
                print "iter %d rho = %2.5f"%(i, rho)

            # compute lngamma_old = ln P(Z=0 | X) and lngamma_new
            ll_new   = misc.mvn_diag_logpdf(X, lam[:D], lam[D:])
            lnjoint  = np.column_stack([ ll_q0 + np.log(1.-rho),
                                         ll_new + np.log(rho) ])
            lngammas = lnjoint - scpm.logsumexp(lnjoint, 1, keepdims=True)

            ll_marg = scpm.logsumexp(lnjoint, 1).mean()
            print "iter %d ll = %2.5f"%(i, ll_marg)
            #print "   rho = ", rho, lngammas

            # weighted M step
            ws = np.exp(lngammas) #* W[:,None]
            rho_new = np.sum(ws, 0) + 1.
            rho = (rho_new / np.sum(rho_new))[1]
            #print ws

            new_mu  = np.sum(X * ws[:,1,None], 0) / np.sum(ws[:,1,None])
            new_var = np.sum((X-new_mu)**2 * ws[:,1,None], 0) / np.sum(ws[:,1,None])
            lam = np.concatenate([new_mu, np.log(np.sqrt(new_var))])

        return rho, lam

    # initialize model object
    num_comp = len(comp_list) + 1
    mod      = GMM(n_components=num_comp, covariance_type='diag', n_iter=1)
    mod.fit(X)

    # initialize all of the means, covs, pis
    existing_means, covars, _, _, _, pis = \
        components.comp_list_to_matrices(comp_list)
    existing_covars = np.array([np.diag(c) for c in covars])

    # initialize model
    mod.weights_[:-1] = pis * .5
    mod.weights_[-1]  = .5
    # set the first mean to be the location of the highest weighted w...
    # set covar to be marginal covariance for the whole thing
    mod.means_[-1,:] = X[W.argmax(), :]
    init_c = mog.mog_covariance(existing_means, covars, pis)
    mod.covars_[-1,:] = np.diag(init_c)

    def clamp_params(mod):
        mod.means_[:-1,:]  = existing_means
        mod.covars_[:-1,:] = existing_covars
        #mod.covars_[-1,:]  = np.diag(init_c)   #todo experiment with clamping covariance!

    clamp_params(mod)

    # run importance weighted EM
    prev_ll = -np.inf
    for i in xrange(num_iter):

        if debug_ax is not None:
            ii, jj = DEBUG_I, DEBUG_J

            for mi in xrange(mod.means_.shape[0]):
                debug_ax.scatter(mod.means_[mi,ii], mod.means_[mi,jj], s=100, c='red')
                debug_ax.text(mod.means_[mi,ii], mod.means_[mi,jj], "iter %d"%i, fontsize=14)

        log_likelihoods, responsibilities = mod.score_samples(X)
        current_log_likelihood = log_likelihoods.mean()
        weighted_responsibilities = responsibilities * W[:,None]
        mod._do_mstep(X, weighted_responsibilities, mod.params,
                            mod.min_covar)
        if clamp_existing_params:
            clamp_params(mod)

        if (current_log_likelihood - prev_ll) < (1e-10*np.abs(prev_ll)):
            print "  current ll increase too small (after %d iters) "%i
            break

        print "ll = %2.4f"%current_log_likelihood
        prev_ll = current_log_likelihood

    return mod
Example No. 16
    def init_gmm(self, nb_gaussians):
        return GMM(n_components=nb_gaussians, covariance_type='full', random_state=self.seed,
                   warm_start=self.warm_start, n_init=self.nb_em_init)
Example No. 17
print(train.shape)

print('Fitting PCA')
pca = PCA(n_components=6, random_state=0).fit(train)
train_pca = pca.transform(train)
print('Fitted with explained variance of ', pca.explained_variance_ratio_.sum())

pickle.dump(pca, open(pcasave, 'wb'))

print(train_pca.shape)

train_time_fft = np.concatenate((train_pca, time_features ), axis = 1)
print(train_time_fft.shape)

print('Fitting GMM')
gmm = GMM(n_components=int(args.n_clusters), random_state=0).fit(train_time_fft)
# kmeans = KMeans(n_clusters=int(args.n_clusters), random_state=0).fit(train_time_fft)
labels = gmm.predict(train_time_fft)
db_score = (sklearn.metrics.davies_bouldin_score(train_time_fft, labels))
print("Converged: ", str(gmm.converged_) )
print('Fitted with DB score: ', db_score)

pickle.dump(gmm, open(gmmsave, 'wb'))

labels = np.reshape(labels, (fft_features.shape[0], fft_features.shape[1]))
unique_elements, counts_elements = np.unique(labels, return_counts=True)
print(labels.shape)
print("Unique elements: ", unique_elements)
print("Frequency: ", counts_elements)

def list2string(list):
Example No. 18
def main(dataset_names=None,
         estimator_type="gmm",
         mc_iterations=20,
         n_folds=5,
         n_ensemble=100,
         seed_num=42):
    if dataset_names is None:
        # All the datasets used in Li2014
        datasets_li2014 = [
            'abalone', 'balance-scale', 'credit-approval', 'dermatology',
            'ecoli', 'german', 'heart-statlog', 'hepatitis', 'horse',
            'ionosphere', 'lung-cancer', 'libras-movement', 'mushroom',
            'diabetes', 'landsat-satellite', 'segment', 'spambase', 'wdbc',
            'wpbc', 'yeast'
        ]

        datasets_hempstalk2008 = [
            'diabetes', 'ecoli', 'glass', 'heart-statlog', 'ionosphere',
            'iris', 'letter', 'mfeat-karhunen', 'mfeat-morphological',
            'mfeat-zernike', 'optdigits', 'pendigits', 'sonar', 'vehicle',
            'waveform-5000'
        ]

        datasets_others = [
            'diabetes', 'ecoli', 'glass', 'heart-statlog', 'ionosphere',
            'iris', 'letter', 'mfeat-karhunen', 'mfeat-morphological',
            'mfeat-zernike', 'optdigits', 'pendigits', 'sonar', 'vehicle',
            'waveform-5000', 'scene-classification', 'tic-tac', 'autos', 'car',
            'cleveland', 'dermatology', 'flare', 'page-blocks', 'segment',
            'shuttle', 'vowel', 'zoo', 'abalone', 'balance-scale',
            'credit-approval', 'german', 'hepatitis', 'lung-cancer'
        ]

        # Datasets that we can add but need to be reduced
        datasets_to_add = ['MNIST']

        dataset_names = list(
            set(datasets_li2014 + datasets_hempstalk2008 + datasets_others))

    # Diary to save the partial and final results
    diary = Diary(name='results_Li2014',
                  path='results',
                  overwrite=False,
                  fig_format='svg')
    # Hyperparameters for this experiment (folds, iterations, seed)
    diary.add_notebook('parameters', verbose=True)
    # Summary for each dataset
    diary.add_notebook('datasets', verbose=False)
    # Partial results for validation
    diary.add_notebook('validation', verbose=True)
    # Final results
    diary.add_notebook('summary', verbose=True)

    columns = ['dataset', 'method', 'mc', 'test_fold', 'acc', 'logloss']
    df = MyDataFrame(columns=columns)

    diary.add_entry('parameters', [
        'seed', seed_num, 'mc_it', mc_iterations, 'n_folds', n_folds,
        'n_ensemble', n_ensemble, 'estimator_type', estimator_type
    ])
    data = Data(dataset_names=dataset_names)
    for name, dataset in data.datasets.iteritems():
        if name in ['letter', 'shuttle']:
            dataset.reduce_number_instances(0.1)
    export_datasets_description_to_latex(data, path=diary.path)

    for i, (name, dataset) in enumerate(data.datasets.iteritems()):
        np.random.seed(seed_num)
        dataset.print_summary()
        diary.add_entry('datasets', [dataset.__str__()])
        for mc in np.arange(mc_iterations):
            skf = StratifiedKFold(dataset.target,
                                  n_folds=n_folds,
                                  shuffle=True)
            test_folds = skf.test_folds
            for test_fold in np.arange(n_folds):
                x_train, y_train, x_test, y_test = separate_sets(
                    dataset.data, dataset.target, test_fold, test_folds)

                # Binary discriminative classifier
                sv = SVC(kernel='linear', probability=True)
                # Density estimator for the background check
                if estimator_type == "svm":
                    gamma = 1.0 / x_train.shape[1]
                    est = OneClassSVM(nu=0.1, gamma=gamma)
                elif estimator_type == "gmm":
                    est = GMM(n_components=1)
                elif estimator_type == "gmm3":
                    est = GMM(n_components=3)
                elif estimator_type == "mymvn":
                    est = MyMultivariateNormal()
                # Multiclass discriminative model with one-vs-one binary class.
                ovo = OvoClassifier(base_classifier=sv)
                classifier = ConfidentClassifier(classifier=ovo,
                                                 estimator=est,
                                                 mu=0.5,
                                                 m=0.5)
                ensemble = Ensemble(base_classifier=classifier,
                                    n_ensemble=n_ensemble)
                # classifier = ConfidentClassifier(classifier=sv,
                #                                  estimator=est, mu=0.5,
                #                                  m=0.5)
                # ovo = OvoClassifier(base_classifier=classifier)
                # ensemble = Ensemble(base_classifier=ovo,
                #                     n_ensemble=n_ensemble)
                xs_bootstrap, ys_bootstrap = ensemble.fit(x_train, y_train)
                accuracy = ensemble.accuracy(x_test, y_test)

                log_loss = ensemble.log_loss(x_test, y_test)
                diary.add_entry('validation', [
                    'dataset', name, 'method', 'our', 'mc', mc, 'test_fold',
                    test_fold, 'acc', accuracy, 'logloss', log_loss
                ])
                df = df.append_rows(
                    [[name, 'our', mc, test_fold, accuracy, log_loss]])

                # Li2014: EP-CC model
                # The classification confidence is used in learning the weights
                # of the base classifier as well as in weighted voting.
                ensemble_li = Ensemble(n_ensemble=n_ensemble, lambd=1e-8)
                ensemble_li.fit(x_train,
                                y_train,
                                xs=xs_bootstrap,
                                ys=ys_bootstrap)

                accuracy_li = ensemble_li.accuracy(x_test, y_test)
                log_loss_li = ensemble_li.log_loss(x_test, y_test)
                diary.add_entry('validation', [
                    'dataset', name, 'method', 'Li2014', 'mc', mc, 'test_fold',
                    test_fold, 'acc', accuracy_li, 'logloss', log_loss_li
                ])
                df = df.append_rows(
                    [[name, 'Li2014', mc, test_fold, accuracy_li,
                      log_loss_li]])

    export_summary(df, diary)



pop_all = pd.read_csv("/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/\
VBM/population.csv")

k_range = [1,2,3,5,8]

#mod = KMeans(n_clusters=3)
#mod = AgglomerativeClustering(n_clusters=4)
#mod.fit(U_all_scz[:,k_range])
#labels_all_scz = mod.labels_


mod = GMM(n_components=3)
labels_all_scz = mod.fit_predict(U_all_scz[:,k_range])


df = pd.DataFrame()
df["labels"] = labels_all_scz
df["age"] = pop_all["age"].values[y_all==1]
df["sex"] = pop_all["sex_num"].values[y_all==1]

for i in (k_range):
    df["U%s"%i] = U_all_scz[:,i]

LABELS_DICT = {0: "cluster 1", 1: "cluster 2", 2: "cluster 3"}
df["labels_name"]  = df["labels"].map(LABELS_DICT)

Example No. 20
    features = mfcc.mfcc(audio, sr, 0.025, 0.02, 13, appendEnergy=False)
    # python_speech_features.base.mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=<function <lambda>>)
    print(features.shape)
    features = preprocessing.scale(features)
    print(features)
    return features


source = "Dataset/Sad"
dest = "Model/"
files = [
    os.path.join(source, f) for f in os.listdir(source) if f.endswith('.wav')
]
features = np.asarray(())

for f in files:
    sr, audio = read(f)
    vector = get_MFCC(sr, audio)
    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

gmm = GMM(n_components=8, max_iter=200, covariance_type='diag', n_init=3)
gmm.fit(features)
picklefile = f.split("/")[-2].split(".wav")[0] + ".gmm"

# model saved as male.gmm
cPickle.dump(gmm, open(dest + picklefile, 'w'))
print 'modeling completed for emotion:', picklefile
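The loop above trains one GMM per emotion folder and pickles it; recognition would then compare a test utterance against every stored model. That step is not part of this snippet, so the function below is only a sketch reusing read, get_MFCC, np, os and cPickle from above (the model-file layout is an assumption):

def recognize_emotion(test_wav, model_dir="Model/"):
    # Score the utterance under every stored emotion GMM and return the
    # file name of the best-scoring model.
    sr, audio = read(test_wav)
    vector = get_MFCC(sr, audio)
    scores = {}
    for mf in os.listdir(model_dir):
        if mf.endswith(".gmm"):
            gmm_model = cPickle.load(open(os.path.join(model_dir, mf), 'rb'))
            scores[mf] = float(np.sum(gmm_model.score(vector)))
    return max(scores, key=scores.get)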
Example No. 21
#print(videoPath)
#fourcc = cv2.VideoWriter_fourcc('X','V','I','D')
#video = cv2.VideoWriter(videoPath, fourcc,20,(width,height))
#
#for i,t in enumerate(movieTimes):
#    imgPath = os.path.join("./img/" + str(i) + ".png")
#    img = cv2.imread(imgPath)
#    img = cv2.cvtColor(img,cv2.COLOR_RGB2BGR)
#    video.write(img)
#print('done')
#cv2.destroyAllWindows()
#video.release()

#Gaussian Mixture Model
#fit GMM
gmm = GMM(n_components=2).fit(CC_scaled)
cov = gmm.covariances_
prob_distr = gmm.predict_proba(CC_scaled)

# determine to which of the two gaussians each data point belongs by looking at probability distribution
if gmm.weights_[0] < gmm.weights_[1]:
    gauss1_idx = [
        i for i in range(len(prob_distr))
        if prob_distr[i][0] >= prob_distr[i][1]
    ]
    gauss2_idx = [
        j for j in range(len(prob_distr))
        if prob_distr[j][1] >= prob_distr[j][0]
    ]
else:
    gauss1_idx = [
Example No. 22
from sklearn.mixture import GMM
import numpy as np
import pandas as pd
from sklearn import metrics
balance_data = pd.read_csv('seeds.csv',sep= ',', header= None)
X = balance_data.values[:, 0:6]
Y = balance_data.values[:,7]
estimator = GMM(n_components=3)
estimator.fit(X)
Y_pred=estimator.predict(X)
print("Accuracy:",metrics.adjusted_rand_score(Y, Y_pred))
np.savetxt('em.csv',Y_pred)
Example No. 23
import codecs
import csv
from sklearn.mixture import GMM
# This file is to draw distribution curve

y_true = list()
y_predicted = list()
with codecs.open("./codelabel_result/bi2vec_spring_8.csv", "r") as f_csv:
    reader = csv.reader(f_csv)

    for i, row in enumerate(reader):
        if i == 0:
            continue
        if int(row[2]) == 1 or int(row[2]) == -1:
            y_true.append(int(row[2]))

            predicted = -1
            print row[5]
            if float(row[5]) > 0.3:
                predicted = 1
            y_predicted.append(predicted)

# print len(y_true)
# np_y_true = np.array(y_true)
# print np_y_true
# np_y_predicted = np.array(y_predicted)
# print np_y_predicted
# print precision_score(np_y_true,np_y_predicted,average="weighted")
# print recall_score(np_y_true,np_y_predicted,average="weighted")

GMM = GMM(n_components=2, init_params="wc", n_iter=20)

# computes accuracy given the predictions and real labels
def accuracy(predictions, labels):
    batch_size = predictions.shape[0]
    sum = np.sum(predictions == labels)
    acc = (100.0 * sum) / batch_size
    return acc


n_classes = 10  # 10 genre classes

# Try GMMs using different types of covariances. Only 'full' is used here as it performs better, but other types can be added to try
classifiers = dict((covar_type,
                    GMM(n_components=n_classes,
                        covariance_type=covar_type,
                        init_params='wc',
                        n_iter=5)) for covar_type in ['full'])

print("Training GMM")

for index, (name, classifier) in enumerate(classifiers.items()):
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    classifier.means_ = np.array(
        [train_data[train_labels == i].mean(axis=0) for i in range(n_classes)])

    # Train the other parameters using the EM algorithm.
    classifier.fit(train_data)

    # getting predictions of training set
    train_predictions = classifier.predict(train_data)
Example No. 25
def wemd_from_pred_samples(y_pred):
    gmm = GMM(covariance_type="diag")
    gmm = gmm.fit(y_pred)
    y_s, _ = gmm.sample(len(y_pred))
    return wemd_from_samples(y_s, y_pred)
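A usage sketch for the helper above; the array below is synthetic and only illustrates the expected 2-D shape, and wemd_from_samples is assumed to be importable from the same module as in the original code:

import numpy as np
y_pred = np.random.randn(200, 2)  # stand-in predictions, shape (n_samples, n_dims)
print(wemd_from_pred_samples(y_pred))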
Example No. 26
def GMM_func(X_train,
             Y_train,
             X_test,
             Y_test,
             n_classes,
             show_results=False,
             fplt=False,
             colors='rgbym',
             select_classifier=2):
    temp1 = X_train[:, 0]
    temp2 = X_train[:, 1]
    temp1 = np.reshape(temp1, (X_train.shape[0], 1))
    temp2 = np.reshape(temp2, (X_train.shape[0], 1))
    mean1 = np.array([temp1[Y_train == i].mean() for i in range(n_classes)])
    mean2 = np.array([temp2[Y_train == i].mean() for i in range(n_classes)])
    #print mean1
    #print mean2

    mean_vector = np.zeros((n_classes, 2))
    mean_vector[:, 0] = mean1
    mean_vector[:, 1] = mean2

    #print mean_vector

    # Try GMMs using different types of classifiers.

    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.

    if select_classifier == 1:
        classifier1 = GMM(n_components=n_classes,
                          covariance_type='full',
                          init_params='wc',
                          n_iter=200)
        classifier1.means_ = mean_vector
        classifier1.fit(X_train)
        if fplt:
            w_factor = 0.5 / classifier1.weights_.max()
            for pos, covar, w, color in zip(classifier1.means_,
                                            classifier1.covars_,
                                            classifier1.weights_, colors):
                draw_ellipse(pos, covar, alpha=w * w_factor, clr=color)
        Y_pred = classifier1.predict(X_test)

    if select_classifier == 2:
        classifier2 = GaussianMixture(n_components=n_classes,
                                      means_init=mean_vector,
                                      covariance_type='full',
                                      max_iter=5000)
        classifier2.fit(X_train)
        if fplt:
            w_factor = 0.8 / classifier2.weights_.max()
            for pos, covar, w, color in zip(classifier2.means_,
                                            classifier2.covariances_,
                                            classifier2.weights_, colors):
                draw_ellipse(pos, covar, alpha=w * w_factor, clr=color)
        Y_pred = classifier2.predict(X_test)

    Y_pred = np.reshape(Y_pred, (Y_test.shape[0], 1))
    Y_pred[Y_pred != 0] = 1
    Y_test[Y_test != 0] = 1
    #print Y_pred
    #print Y_test
    confusion = confusion_matrix(Y_test, Y_pred)

    eps = 1e-9
    # print confusion
    accuracy = 0
    if float(np.sum(confusion)) != 0:
        accuracy = float(confusion[0, 0] + confusion[1, 1]) / float(
            np.sum(confusion))
    specificity = float(
        confusion[0, 0]) / float(confusion[0, 0] + confusion[0, 1] + eps)
    sensitivity = float(
        confusion[1, 1]) / float(confusion[1, 1] + confusion[1, 0] + eps)
    precision = float(
        confusion[1, 1]) / float(confusion[1, 1] + confusion[0, 1] + eps)
    if show_results:
        print("\nGlobal Accuracy: " + str(accuracy))
        print("Specificity: " + str(specificity))
        print("Sensitivity: " + str(sensitivity))
        print("Precision: " + str(precision))
    return accuracy
Example No. 27
    def trainModelFV_LOOCV_Fusion(self, extension='*.*'):
        """
        This method contains the entire module
        required for training the Bag of Poses model.
        Use of helper functions will be extensive.
        """
        self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile(
            self.label_path)

        # read file. prepare file lists.
        self.files1, self.trainFilesCount1 = self.file_helper.getFilesFromDirectory(
            self.base_path, self.datasets, extension)

        self.files2, self.trainFilesCount2 = self.file_helper.getFilesFromDirectory(
            self.base_path2, self.datasets, extension)

        save = True
        self.parameters += 'Classifier Parameters\n'
        self.parameters += '%s' % self.classifier_helper.clf

        features_nd1 = np.asarray(self.files1)
        features_nd2 = np.asarray(self.files2)

        features_nd1.sort(axis=0)
        features_nd2.sort(axis=0)
        # build GMMs
        self.descriptor_list1 = []
        self.descriptor_list2 = []
        for f in features_nd1:
            feature = f[0]
            des1 = self.file_helper.loadFeaturesFromFile(feature)
            self.descriptor_list1.append(des1)

        for f in features_nd2:
            feature = f[0]
            des2 = self.file_helper.loadFeaturesFromFile(feature)
            self.descriptor_list2.append(des2)

        ft1 = self.classifier_helper.formatND(self.descriptor_list1)
        ft2 = self.classifier_helper.formatND(self.descriptor_list2)

        gmm1 = GMM(n_components=self.no_clusters,
                   covariance_type='diag',
                   verbose=0)
        gmm1.fit(ft1)

        gmm2 = GMM(n_components=self.no_clusters,
                   covariance_type='diag',
                   verbose=0)
        gmm2.fit(ft2)

        # Train Classifier
        loo = LeaveOneOut()
        predictions = []
        pre = []
        lab = []
        hits = 0
        c = 0
        for train, test in loo.split(features_nd1):
            feature_test_file1 = str(features_nd1[test][0][0])
            feature_test_file2 = str(features_nd2[test][0][0])

            class_name_test = feature_test_file1.split(os.sep)[-2]
            c += 1

            currenInvDate = datetime.datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S")
            print('Step: %i/%i - %s\n%s\n%s' %
                  (c, features_nd1.shape[0], currenInvDate, feature_test_file1,
                   feature_test_file2))
            if c == 1 or c % 25 == 0:
                self.mail_helper.sendMail(
                    "Progress: %s - %s" % (self.test_name, self.OsName),
                    "Samples processed: %i" % c)

            self.descriptor_list1 = []
            self.descriptor_list2 = []
            self.train_labels = []
            for feature in features_nd1[train]:
                feature = feature[0]
                label_number = self.number_dict[feature.split(os.sep)[-2]]
                self.train_labels = np.append(self.train_labels, label_number)
                des1 = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list1.append(des1)

            for feature in features_nd2[train]:
                feature = feature[0]
                des2 = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list2.append(des2)

            # format data as nd array
            ft1 = self.classifier_helper.formatND(self.descriptor_list1)
            ft2 = self.classifier_helper.formatND(self.descriptor_list2)

            fv_dim1 = self.no_clusters + 2 * self.no_clusters * ft1.shape[1]
            fv_dim2 = self.no_clusters + 2 * self.no_clusters * ft2.shape[1]
            print(fv_dim1, fv_dim2)
            n_videos = train.shape[0]
            features1 = np.array([np.zeros(fv_dim1) for i in range(n_videos)])
            features2 = np.array([np.zeros(fv_dim2) for i in range(n_videos)])
            count1 = 0
            count2 = 0
            for i in range(n_videos):
                len_video1 = len(self.descriptor_list1[i])
                fv1 = fisher_vector(ft1[count1:count1 + len_video1], gmm1)
                features1[i] = fv1
                count1 += len_video1

                len_video2 = len(self.descriptor_list2[i])
                fv2 = fisher_vector(ft2[count2:count2 + len_video2], gmm2)
                features2[i] = fv2
                count2 += len_video2

            print(features1.shape)
            print('Data normalization. 1')
            scaler1 = StandardScaler()
            # train normalization
            features1 = scaler1.fit_transform(features1)
            features1 = power_normalize(features1, 0.5)
            features1 = L2_normalize(features1)

            print(features2.shape)
            print('Data normalization. 2')
            scaler2 = StandardScaler()
            # train normalization
            features2 = scaler2.fit_transform(features2)
            features2 = power_normalize(features2, 0.5)
            features2 = L2_normalize(features2)

            # real label
            lab.extend(
                [self.number_dict[feature_test_file1.split(os.sep)[-2]]])

            # test features 1
            feature_test1 = self.file_helper.loadFeaturesFromFile(
                feature_test_file1)
            test_fv1 = fisher_vector(feature_test1, gmm1)
            # train normalization
            test_fv1 = test_fv1.reshape(1, -1)
            test_fv1 = scaler1.transform(test_fv1)
            test_fv1 = power_normalize(test_fv1, 0.5)
            test_fv1 = L2_normalize(test_fv1)

            # test features 2
            feature_test2 = self.file_helper.loadFeaturesFromFile(
                feature_test_file2)
            test_fv2 = fisher_vector(feature_test2, gmm2)
            # train normalization
            test_fv2 = test_fv2.reshape(1, -1)
            test_fv2 = scaler2.transform(test_fv2)
            test_fv2 = power_normalize(test_fv2, 0.5)
            test_fv2 = L2_normalize(test_fv2)

            ## concatenate two fv test
            feature_test = np.concatenate((test_fv1, test_fv2),
                                          axis=1).reshape(1, -1)

            ## concatenate two fv train
            feature_train = np.concatenate((features1, features2), axis=1)

            # train classifiers
            self.classifier_helper.clf.fit(feature_train, self.train_labels)
            cl = int(self.classifier_helper.clf.predict(feature_test)[0])
            class_name_predict = self.name_dict[str(cl)]
            if class_name_test == class_name_predict:
                hits += 1

            error = c - hits
            msg_progress = 'Hits: %i/%i  -  Accuracy: %.4f  -   Error: %i\n\n' % (
                hits, c, hits / c, error)

            print(msg_progress)
            if c % 25 == 0:
                self.mail_helper.sendMail(
                    "Progress: %s - %s" % (self.test_name, self.OsName),
                    msg_progress)

            if error > 40:
                save = False
                print('Error exceeded')
                break

            # predicted label
            pre.extend([cl])
            predictions.append({
                'image1': feature_test_file1,
                'image2': feature_test_file2,
                'class': cl,
                'object_name': self.name_dict[str(cl)]
            })

        if save:
            self.saveResults(predictions, pre, lab, features_nd1.shape[0])
Example No. 28
    X = mog.mog_samples(1000, means,
                        np.array([np.linalg.cholesky(c) for c in covars]),
                        pis)
    W = np.ones(X.shape[0])

    lam0 = np.concatenate([ means[0], .5*np.log(np.diag(covars[0])) ])
    comp_list = [(1., lam0)]

    mod = new_component_weighted_EM(comp_list, X, W, num_iter=50)
    print "GT Means        : ", means
    print "Inferred Means  : ", mod.means_
    print "Inferred Covars : ", mod.covars_
    print "Inferred Weights: ", mod.weights_

    print "\n"
    gmod = GMM(n_components=2)
    gmod.fit(X)
    print "full em means   : ", gmod.means_
    print "full em weights : ", gmod.weights_
    print "full em covars  : ", gmod.covars_


    # test higher level method
    import matplotlib.pyplot as plt; plt.ion()
    import seaborn as sns
    import autil.util.plots as pu
    fig, ax = plt.figure(figsize=(8,8)), plt.gca()
    pu.plot_isocontours(ax, lambda x: np.exp(lnpdf(x,0)), fill=True)
    new_comp_list = fit_new_component(comp_list, lnpdf, df=10000,
                                      num_samples=1000,
                                      importance_dist='tmixture',
Example No. 29
    def FV_LOOCV_Features(self):
        """
        This method contains the entire module
        required for training the bag of visual words model.
        Use of helper functions will be extensive.
        """
        self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile(
            self.label_path)
        for count, base_path in enumerate(self.base_path):
            # read file. prepare file lists.
            self.images[count], self.trainImageCount[count] = \
                self.file_helper.getFilesFromDirectory(base_path,
                                                       self.datasets,
                                                       self.features_file_filter[count])
        features_nd = {}
        # Initialize features nd array
        for count, _ in enumerate(self.base_path):
            features_nd[count] = np.asarray(self.images[count])
            features_nd[count].sort(axis=0)

        self.descriptor_list = {}
        labels_train = []
        ft = {}
        gmm = {}
        fv_dim = {}
        features = {}
        n_videos = features_nd[0].shape[0]
        scaler_des = {}

        # Read features from files and compute Gaussian Mixture Models (GMM)
        for count, _ in enumerate(self.base_path):
            self.descriptor_list[count] = []
            for feature in features_nd[count]:
                feature = feature[0]
                if count == 0:
                    label_number = self.number_dict[feature.split(os.sep)[-2]]
                    label_name = self.name_dict[str(label_number)]
                    labels_train = np.append(labels_train, label_name)
                des = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list[count].append(des)
            ft[count] = self.classifier_helper.formatND(
                self.descriptor_list[count])
            # train normalization
            scaler_des[count] = StandardScaler()
            ft[count] = scaler_des[count].fit_transform(ft[count])

            gmm[count] = GMM(n_components=self.no_clusters,
                             covariance_type='diag',
                             verbose=0)
            gmm[count].fit(ft[count])
            fv_dim[count] = self.no_clusters + 2 * self.no_clusters * ft[
                count].shape[1]
            print(fv_dim[count])
            features[count] = np.array(
                [np.zeros(fv_dim[count]) for i in range(n_videos)])

        len_video = {}
        fv = {}
        scaler = {}

        # Compute Fisher Vector from Descriptors using GMM
        for count, _ in enumerate(self.base_path):
            count_videos = 0
            for i in range(n_videos):
                len_video[count] = len(self.descriptor_list[count][i])
                fv[count] = fisher_vector(
                    ft[count][count_videos:count_videos + len_video[count]],
                    gmm[count])
                features[count][i] = fv[count]
                count_videos += len_video[count]

        # Perform FV Normalization
        for count, _ in enumerate(self.base_path):
            print(features[count].shape)
            print('Data normalization. %i' % count)
            scaler[count] = StandardScaler()
            # train normalization
            features[count] = scaler[count].fit_transform(features[count])
            features[count] = power_normalize(features[count], 0.5)
            features[count] = L2_normalize(features[count])

        # Concatenate FV for each feature type
        feature_train = features[0]
        for count in range(1, len(self.base_path)):
            feature_train = np.concatenate((feature_train, features[count]),
                                           axis=1)

        return feature_train, labels_train
Example No. 30
def trainingGMMHMM(
        dataset,  # training dataset.
        n_c,  # number of hmm's components (ie. hidden states)
        n_m,  # number of gmm's mixtures (ie. Gaussian model)
        start_prob_prior=None,  # prior of start hidden states probabilities.
        trans_mat_prior=None,  # prior of transition matrix.
        start_prob=None,  # the start hidden states probabilities.
        trans_mat=None,  # the transition matrix.
        gmms=None,  # models' params of gmm
        covar_type='full',
        n_i=50):
    # Initiation of dataset.
    # d = Dataset(dataset)
    X = dataset.getDataset()
    # Initiation of GMM.
    _GMMs = []
    if gmms is None:
        _GMMs = None
    else:
        for gmm in gmms:
            _GMM = GMM(n_components=n_m, covariance_type=covar_type)
            _GMM.covars_ = np.array(gmm["covars"])
            _GMM.means_ = np.array(gmm["means"])
            _GMM.weights_ = np.array(gmm["weights"])
            _GMMs.append(_GMM)
    # Initiation of GMMHMM.
    model = GMMHMM(startprob_prior=np.array(start_prob_prior),
                   transmat_prior=np.array(trans_mat_prior),
                   startprob=np.array(start_prob),
                   transmat=np.array(trans_mat),
                   gmms=_GMMs,
                   n_components=n_c,
                   n_mix=n_m,
                   covariance_type=covar_type,
                   n_iter=n_i)
    # Training.
    model.fit(X)
    # The result.
    new_gmmhmm = {
        "nComponent": n_c,
        "nMix": n_m,
        "covarianceType": covar_type,
        "hmmParams": {
            "startProb": model.startprob_.tolist(),
            "transMat": model.transmat_.tolist()
        },
        "gmmParams": {
            "nMix": n_m,
            "covarianceType": covar_type,
            "params": []
        }
    }

    for i in range(0, n_c):  # one GMM per hidden state, so iterate over n_c (gmms_ has n_c entries, not n_m)
        gaussian_model = {
            "covars": model.gmms_[i].covars_.tolist(),
            "means": model.gmms_[i].means_.tolist(),
            "weights": model.gmms_[i].weights_.tolist()
        }
        new_gmmhmm["gmmParams"]["params"].append(gaussian_model)

    return new_gmmhmm
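Everything in the returned dictionary is plain Python data (lists, strings, ints), so it can be persisted directly; a small sketch, with the file name being an assumption:

import json

def save_gmmhmm(new_gmmhmm, path='gmmhmm_params.json'):
    # Serialize the trained GMMHMM parameter dictionary to JSON.
    with open(path, 'w') as fh:
        json.dump(new_gmmhmm, fh, indent=2)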