Example #1
import numpy as np
import pytest

from sklearn.mixture import BayesianGaussianMixture


def test_bayesian_mixture_check_is_fitted():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2

    # Check raise message
    bgmm = BayesianGaussianMixture(random_state=rng)
    X = rng.rand(n_samples, n_features)

    msg = "This BayesianGaussianMixture instance is not fitted yet."
    with pytest.raises(ValueError, match=msg):
        bgmm.score(X)
Example #2
def bayesian_gaussian_mixture(vector: np.ndarray, n: int, BIC_calculate=False):
    np.random.seed(140597)
    # Hold out roughly 30% of the rows for validation.
    mask = np.random.choice([False, True], len(vector), p=[0.70, 0.30])
    if BIC_calculate:
        # Note: score() returns the mean per-sample log-likelihood, not the BIC.
        model_train = BayesianGaussianMixture(n_components=n, covariance_type='full').fit(vector[~mask])
        validation_score = model_train.score(vector[mask])
        train_score = model_train.score(vector[~mask])
        return validation_score, train_score
    else:
        dpgmm = BayesianGaussianMixture(n_components=n, covariance_type='full', max_iter=900, tol=1e-4).fit(vector[~mask])
        cluster_label = dpgmm.predict(vector)
        return cluster_label
Example #3
def genotype(cnvays):
    result = []
    n_com = 10 if cnvays.shape[1] >= 10 else cnvays.shape[1]
    n_init = 3
    for cnvay in cnvays:
        cnv = [[x] for x in cnvay]
        dpgmm = BayesianGaussianMixture(
            n_components=n_com,
            n_init=n_init,
            max_iter=10000,
            weight_concentration_prior_type='dirichlet_process').fit(cnv)
        labels = dpgmm.predict(cnv)
        # Snap each cluster's median copy number to the nearest 0.5 step.
        normed_ay = np.arange(0, np.max(cnvay) + 0.5, 0.5)
        swlabels = {}
        for rawlabel in np.unique(labels):
            swlabels[rawlabel] = normed_ay[np.argmin(
                np.abs(normed_ay - np.median(cnvay[labels == rawlabel])))]
        newlabels = [swlabels[x] for x in labels]
        gt_labels = {0: 'dd', 0.5: 'Ad', 1: 'AA', 1.5: 'AB', 2: 'BB', 2.5: 'BC'}
        finalline = [gt_labels.get(x, 'M') for x in newlabels]
        if len(np.unique(finalline)) > 1:
            sc = silhouette_score(cnv, finalline,
                                  metric='euclidean')  # silhouette_score
            # renamed from calinski_harabaz_score in newer scikit-learn
            chs = calinski_harabasz_score(cnv, labels)
        else:
            sc = np.nan
            chs = np.nan
        llh = dpgmm.score(
            cnv)  # Log likelihood of the Gaussian mixture given X
        finalline += [sc, chs, llh]
        result.append(finalline)
    return result
Example #4
def bayes_gauss_classifier(dir_models, ticket, x, x_test, y, y_test):
    # BayesianGaussianMixture (unsupervised; labels are ignored by fit)
    print('getting model...BayesianGaussianMixture')
    clf = BayesianGaussianMixture(n_components=3)

    print('training...')
    clf.fit(x)

    print('predicting...')
    # Note: predict() returns cluster indices, which only match the class
    # labels in y_test if the components happen to align with the classes.
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))

    model_id = len(os.listdir(dir_models))
    joblib.dump(
        clf,
        dir_models + ticket + '_bayesian_gaussian_mixture_' + str(model_id) + '.pkl')

    # score() returns the mean log-likelihood of x_test; labels are ignored.
    return clf.score(x_test)
Example #5
def search_optimal_cluster_size(data_set_name: str,
                                data_points: np.ndarray,
                                start: int,
                                stop: int,
                                max_data_points=2000) -> int:
    """Determine the optimal number of clusters in the given data_points based
    on the maximum GMM log likelihood. It assumes the component in the data
    points are linearly independent. Sampling is performed to improve efficiency.

    Arguments:
        data_set_name {str} -- The name of the data set (for display purpose).
        data_points {np.ndarray} -- Of shape [num_samples, num_components].
        start {int} -- Starting number of clusters to search from.
        stop {int} -- The largest number of clusters to search on.

    Keyword Arguments:
        max_data_points {int} -- The largest number of data points to take on.
            Sampling is performed after this number of samples (default: {2000})

    Returns:
        int -- The optimal number of clusters.
    """
    if data_points.shape[0] > max_data_points:
        inds = np.arange(start=0, stop=data_points.shape[0])
        np.random.shuffle(inds)
        data_points = data_points[inds[:max_data_points], :]

    best_likelihood = -float("inf")
    best_cluster_size = start
    for i in range(start, stop + 1):
        gmm = BayesianGaussianMixture(n_components=i,
                                      covariance_type="diag",
                                      tol=1e-2,
                                      n_init=5)
        gmm.fit(X=data_points)
        if not gmm.converged_:
            continue
        likelihood = gmm.score(X=data_points)
        if likelihood > best_likelihood:
            best_likelihood = likelihood
            best_cluster_size = i
        print(data_set_name, "|cluster_size=", i, "|current_best=",
              best_cluster_size, "|current_best_score=", best_likelihood)
    return best_cluster_size
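A minimal invocation sketch for the search above; the random data and search bounds are purely illustrative:

import numpy as np

rng = np.random.RandomState(0)
points = rng.randn(500, 4)  # [num_samples, num_components]
best_k = search_optimal_cluster_size("demo", points, start=1, stop=6)
print("optimal number of clusters:", best_k)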
Example #6
def evaluate_DPMM(y, y_test):
    from sklearn.mixture import BayesianGaussianMixture

    np.random.seed(300)
    d = np.shape(y)[1]
    start = time.time()
    # Fit a Dirichlet-process Gaussian mixture with explicit priors
    DPMM = BayesianGaussianMixture(n_components=30,
                                   covariance_type='diag',
                                   n_init=100,
                                   weight_concentration_prior_type='dirichlet_process',
                                   covariance_prior=np.ones(d),
                                   degrees_of_freedom_prior=d,
                                   mean_precision_prior=1,
                                   mean_prior=np.zeros(d))
    DPMM.fit(y)
    end = time.time()
    print('DPMM fitting time is {}'.format(end - start))

    # Mean per-sample log-likelihood of the held-out data
    test_loglik = DPMM.score(y_test)

    return test_loglik
Example #7
def km_em(x_train_scaled, dataset_name="", true_vals=y_train, reg_covar=1e-01):
    distortions = []
    sil = []
    n = 22
    # v_measure = []
    homogeneity = []
    completeness = []
    mutual_info = []
    adj_rand_score = []
    kmeans_times = []
    homogeneity_em = []
    completeness_em = []
    mutual_info_em = []
    adj_rand_score_em = []
    sil_em = []
    em_times = []
    em_likelihood = []
    for i in range(2,n+1):
#         print(i)
        start_time = time.time()
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(x_train_scaled)
        distortions.append(kmeans.inertia_)
        y_pred = kmeans.predict(x_train_scaled)
        kmeans_times.append(time.time()-start_time)
        homogeneity.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        sil.append(silhouette_score(x_train_scaled, kmeans.labels_, metric='euclidean'))
        start_time = time.time()
        gm = BayesianGaussianMixture(n_components = i, random_state=random_state, reg_covar=reg_covar)
        y_pred = gm.fit_predict(x_train_scaled)
        em_times.append(time.time()-start_time)
        homogeneity_em.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness_em.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info_em.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score_em.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        if len(set(y_pred))>1:
            sil_em.append(silhouette_score(x_train_scaled, y_pred, metric='euclidean'))
        else:
            sil_em.append(1)
        em_likelihood.append(gm.score(x_train_scaled))
    # plot
    plt.plot(range(2, n+1), distortions, marker='o')
    plt.title("K-means Elbow ("+(str(dataset_name))+")")
    plt.xlabel('Number of clusters')
    plt.ylabel('Sum of Squared Distances')
    plt.savefig((str(dataset_name))+' km elbow.png')
    plt.show()

    plt.plot(range(2, n+1), sil, marker='o')
    plt.title('K-means Silhouette Scores ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name))+' km silho.png')
    plt.show()

    plt.plot(range(2, n+1), em_likelihood, marker='o')
    plt.title('EM likelihood ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Likelihood')
    plt.savefig((str(dataset_name))+' em likelihood.png')
    plt.show()
    
    plt.plot(range(2, n+1), sil_em, marker='o')
    plt.title('EM Silhouette Scores ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name))+' em silho.png')
    plt.show()
    
    plt.close()
    plot_data(list(range(2, n+1)), homogeneity, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(2, n+1)), completeness, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(2, n+1)), mutual_info, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="red", label='Adjusted Mutual Info')
    plot_data(list(range(2, n+1)), adj_rand_score, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted Rand Index')
    # plot_data(list(range(2, n+1)), v_measure, title="Performance Evaluation k-means", x_label="Number of Clusters", y_label="Score", color="brown", label='V-measure')
    plt.savefig((str(dataset_name))+' km perfo.png')
    plt.show()

    plt.close()
    plot_data(list(range(2, n+1)), homogeneity_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(2, n+1)), completeness_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(2, n+1)), mutual_info_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="red", label='Adjusted Mutual Info')
    plot_data(list(range(2, n+1)), adj_rand_score_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted Rand Index')
    # plot_data(list(range(2, n+1)), v_measure, title="Performance Evaluation EM", x_label="Number of Clusters", y_label="Score", color="brown", label='V-measure')
    plt.savefig((str(dataset_name))+' em perfo.png')
    plt.show()

    plt.close()
    plot_data(list(range(2, n+1)), kmeans_times, title="k-means/EM Running Time ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Time", color="red", label='k-means')
    plot_data(list(range(2, n+1)), em_times, title="k-means/EM Running Time ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Time", color="blue", label='EM')
    plt.savefig((str(dataset_name))+' km-em time.png')
    plt.show()
    print('kmeans_times')
    print(kmeans_times)
    print('em_times')
    print(em_times)
    
    return {'sil': sil, 'kmeans_times':kmeans_times, 'em_times':em_times, 'homogeneity':homogeneity, 'completeness':completeness, 'mutual_info':mutual_info, 'adj_rand_score':adj_rand_score, 'homogeneity_em':homogeneity_em, 'completeness_em':completeness_em, 'mutual_info_em':mutual_info_em, 'adj_rand_score_em':adj_rand_score_em}
Example #8
embeddings = []
for index in indices:
    embeddings.append(np.load(files[index]))

print("Loaded npys")

start_time = time.time()
cluster_data = np.array(embeddings)
print(cluster_data.shape)

gmm = BayesianGaussianMixture(
    n_components=10,
    covariance_type="full",
    tol=1e-4,
    max_iter=1000,
    init_params="random",
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1.0 / 10,
    warm_start=False)

gmm.fit(cluster_data)
print(gmm.means_.shape)
print(gmm.covariances_.shape)
print(gmm.weight_concentration_)

print(gmm.lower_bound_)
print(gmm.score(cluster_data))

end_time = time.time()

print("Time taken: ", end_time - start_time)
Example #9
def extract_nodules_best_gmix(segmentation_volume,
                              max_n_components=40,
                              plot=False):
    "This function finds the best gaussian mix for a volume of probabilities"
    no_samples = 10000
    occurences = np.round(no_samples * segmentation_volume /
                          np.sum(segmentation_volume)).astype(int)
    total_occ = np.sum(occurences)
    samples = np.zeros((total_occ, 3))
    counter = 0

    for x in range(segmentation_volume.shape[0]):
        for y in range(segmentation_volume.shape[1]):
            for z in range(segmentation_volume.shape[2]):
                for occ in range(occurences[x, y, z]):
                    samples[counter] = [x, y, z]
                    counter += 1

    best_score = -1
    best_gmix = None
    best_no_c = -1
    for no_c in range(1, max_n_components):
        gmix = BayesianGaussianMixture(n_components=no_c,
                                       covariance_type='full')
        gmix.fit(samples)
        score = gmix.score(samples)
        print('score', score)
        print('means')
        print(gmix.means_)
        print('weights')
        print(gmix.weights_)
        if plot:
            for idx, mean in enumerate(gmix.means_):
                center = np.round(mean).astype(int)
                fig = plt.figure()
                fig.suptitle('weight=' + str(gmix.weights_[idx]))
                ax1 = fig.add_subplot(3, 1, 1)
                ax1.imshow(segmentation_volume[center[0], :, :].transpose())
                circ1 = plt.Circle((center[1], center[2]),
                                   10,
                                   color='g',
                                   fill=False)
                ax1.add_patch(circ1)

                ax2 = fig.add_subplot(3, 1, 2)
                ax2.imshow(segmentation_volume[:, center[1], :])
                circ2 = plt.Circle((center[0], center[2]),
                                   10,
                                   color='g',
                                   fill=False)
                ax2.add_patch(circ2)

                ax3 = fig.add_subplot(3, 1, 3)
                ax3.imshow(segmentation_volume[:, :, center[2]].transpose())
                circ3 = plt.Circle((center[0], center[1]),
                                   10,
                                   color='g',
                                   fill=False)
                ax3.add_patch(circ3)
                fig.savefig('no_c_' + str(no_c) + '_' + str(idx) + '.pdf')

        if score > best_score:
            best_score = score
            best_gmix = gmix
            best_no_c = no_c

    print("Best gaussian mix when using", best_no_c, 'gaussians')

    return best_gmix
Example #10
# warm_start=True lets each fit() call below resume from the previous
# solution, so the mixture is refined incrementally over mini-batches.
gmm = BayesianGaussianMixture(
    n_components=n_comps,
    covariance_type="full",
    tol=1e-4,
    max_iter=2500,
    init_params="random",
    weight_concentration_prior_type="dirichlet_distribution",
    weight_concentration_prior=1e+4,
    warm_start=True)

for epoch in range(epochs):
    random.shuffle(all_indices)
    iters = math.floor(len(files) / number_of_data_points_per_iter) - 1
    print("Epoch: " + str(epoch + 1))
    for it in range(iters):
        start_time = time.time()
        start_index = it * number_of_data_points_per_iter
        end_index = start_index + number_of_data_points_per_iter
        inds_for_this_iter = all_indices[start_index:end_index]
        cluster_data = embeddings[inds_for_this_iter]
        gmm.fit(cluster_data)

        end_time = time.time()
        print("Likelihood: " + str(gmm.score(val_data)) + ", Time: " +
              str(end_time - start_time))

print("Weight Concentration : ")
print(gmm.weight_concentration_)
np.save(open(os.path.join(output_dir, "gmm_means.npy"), "wb"), gmm.means_)
np.save(open(os.path.join(output_dir, "gmm_covs.npy"), "wb"), gmm.covariances_)
np.save(open(os.path.join(output_dir, "gmm_weights.npy"), "wb"), gmm.weights_)
Example #11
class FisherVectorGMM:
    """
    Fisher Vector derived from GMM
    ---
    Attributes
    -----------
    n_kernels: int
        number of kernels in GMM
    convars_type: str
        convariance type for GMM
    use_bayesian: bool
        whether or not to use Baysian GMM
    gmm: GaussianMixture() or BayesianGaussianMixture()
        GMM instance in sklearn
    means: np.array()
        means learned in GMM
    covars: np.array()
        covariance learned in GMM
    weights: np.array()
        weights learned in GMM
    ---------------------------------------
    Functions
    -----------
    fit(): public
        fit raw data into GMM
    predict(): public
        predict FV for one video (variable frames)
    predict_alternative(): public
        predict FV for one video (variable frames) alternative
        not validated
    save(): public
        save GMM model into external file
    load(): public
        load GMM model from external file
    """
    def __init__(self, n_kernels=1, convars_type='diag', use_bayesian=False):
        # para n_kernels: number of mixture components
        # para convars_type: covariance type, 'diag' or 'full'
        # para use_bayesian: use BayesianGaussianMixture instead of GaussianMixture
        assert convars_type in ['diag', 'full']
        assert n_kernels >= 0  # n_kernels == 0 creates a dummy instance

        self.name = 'kernels%d_convars%s_bayes%d' % (n_kernels, convars_type,
                                                     use_bayesian)
        self.n_kernels = n_kernels
        self.convars_type = convars_type
        self.use_bayesian = use_bayesian
        self.fitted = False
        self.config = json.load(open('./config/model.json',
                                     'r'))['fisher_vector']
        self.save_dir = self.config['save_dir']
        self.data_dir = self.config['data_dir']
        self.means = None
        self.covars = None
        self.weights = None

        if not self.use_bayesian:
            self.gmm = GaussianMixture(n_components=self.n_kernels,
                                       covariance_type=self.convars_type,
                                       max_iter=1000,
                                       verbose=2)
        else:
            self.gmm = BayesianGaussianMixture(
                n_components=self.n_kernels,
                covariance_type=self.convars_type,
                max_iter=1000,
                verbose=2)

    def fit(self, X):
        # para X: shape [n_frames, n_features, n_feature_dim]
        # if os.path.isfile(os.path.join(self.save_dir, self.name, 'gmm.model')):
        #     print("\nmodel already trained ---", self.name)
        #     self.load()
        #     return
        # elif not os.path.isdir(os.path.join(self.save_dir, self.name)):
        #     os.mkdir(os.path.join(self.save_dir, self.name))

        self.feature_dim = X.shape[-1]
        # X = X.reshape(-1, X.shape[-1])
        print("\nfitting data into GMM with %d kernels" % self.n_kernels)

        self.gmm.fit(X)
        self.means = self.gmm.means_
        self.covars = self.gmm.covariances_
        self.weights = self.gmm.weights_
        print("\nfitting completed")

        # if cov_type is diagonal - make sure that covars holds a diagonal matrix
        if self.convars_type == 'diag':
            cov_matrices = np.empty(shape=(self.n_kernels,
                                           self.covars.shape[1],
                                           self.covars.shape[1]))
            for i in range(self.n_kernels):
                cov_matrices[i, :, :] = np.diag(self.covars[i, :])
            self.covars = cov_matrices

        assert self.covars.ndim == 3
        print("\nmodel trained ---", self.name)
        # self.save()

    def score(self, X):
        return self.gmm.score(X.reshape(-1, X.shape[-1]))

    def predict(self, X, normalized=True):
        # para X: shape [n_frames, n_feature_dim]
        assert X.ndim == 2
        assert X.shape[0] >= self.n_kernels, \
            'n_frames should be at least n_kernels'

        print("\ninferring fisher vectors with given GMM ...")

        X_matrix = X.reshape(-1, X.shape[-1])  # [n_frames, n_feature_dim]

        # set equal weights to predict likelihood ratio
        self.gmm.weights_ = np.ones(self.n_kernels) / self.n_kernels
        likelihood_ratio = self.gmm.predict_proba(X_matrix).reshape(
            X.shape[0], self.n_kernels)  # [n_frames, n_kernels]

        var = np.diagonal(self.covars, axis1=1,
                          axis2=2)  # [n_kernels, n_feature_dim]

        # decrease the memory use
        norm_dev_from_modes = np.tile(X[:, None, :], (1, self.n_kernels, 1))
        np.subtract(norm_dev_from_modes,
                    self.means[None, :],
                    out=norm_dev_from_modes)
        np.divide(norm_dev_from_modes, var[None, :], out=norm_dev_from_modes)
        """
        norm_dev_from_modes:
            (X - mean) / var
            [n_frames, n_kernels, n_feature_dim]
        """

        # mean deviation
        mean_dev = np.multiply(likelihood_ratio[:, :, None],
                               norm_dev_from_modes).mean(
                                   axis=0)  # [n_kernels, n_feature_dim]
        mean_dev = np.multiply(1 / np.sqrt(self.weights[:, None]),
                               mean_dev)  # [n_kernels, n_feature_dim]

        # covariance deviation
        cov_dev = np.multiply(likelihood_ratio[:, :, None],
                              norm_dev_from_modes**2 - 1).mean(
                                  axis=0)  # [n_kernels, n_feature_dim]
        cov_dev = np.multiply(1 / np.sqrt(2 * self.weights[:, None]),
                              cov_dev)  # [n_kernels, n_feature_dim]

        # stack vectors of mean and covariance
        fisher_vector = np.concatenate([mean_dev, cov_dev], axis=1)

        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(
                fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(
                fisher_vector, axis=0)  # L2 normalization

        # fisher_vector[fisher_vector < 10**-4] = 0 # threshold
        print("\ninferring completed.")

        assert fisher_vector.ndim == 2
        return fisher_vector

    def predict_alternative(self, X, normalized=True):
        X = np.atleast_2d(X)
        N = X.shape[0]

        # Compute posterior probabilities.
        Q = self.gmm.predict_proba(X)  # NxK

        # Compute the sufficient statistics of descriptors.
        Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
        Q_X = np.dot(Q.T, X) / N
        Q_XX_2 = np.dot(Q.T, X**2) / N

        # compute derivatives with respect to mixing weights, means and variances.
        d_pi = Q_sum.squeeze() - self.gmm.weights_
        d_mu = Q_X - Q_sum * self.gmm.means_
        d_sigma = (-Q_XX_2 - Q_sum * self.gmm.means_**2 +
                   Q_sum * self.gmm.covariances_ + 2 * Q_X * self.gmm.means_)

        # merge derivatives into a vector.
        fisher_vector = np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))

        if normalized:
            fisher_vector = np.sqrt(np.abs(fisher_vector)) * np.sign(
                fisher_vector)  # power normalization
            fisher_vector = fisher_vector / np.linalg.norm(fisher_vector,
                                                           axis=0)  # L2 norm

        return fisher_vector

    def save(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'),
                  'wb') as out_gmm:
            pickle.dump(self.gmm, out_gmm, protocol=3)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'),
                  'wb') as out_covars:
            pickle.dump(self.covars, out_covars, protocol=3)
        print("\nmodel saved. --- ", self.name)

    def load(self):
        with open(os.path.join(self.save_dir, self.name, 'gmm.model'),
                  'rb') as in_gmm:
            self.gmm = pickle.load(in_gmm)
        with open(os.path.join(self.save_dir, self.name, 'covars.data'),
                  'rb') as in_covars:
            self.covars = pickle.load(in_covars)
        if not self.use_bayesian:
            assert isinstance(self.gmm, GaussianMixture)
        else:
            assert isinstance(self.gmm, BayesianGaussianMixture)
        self.means = self.gmm.means_
        self.weights = self.gmm.weights_
        print("\nmodel loaded. --- ", self.name)

    def save_vector(self,
                    fisher_vector,
                    partition,
                    dynamics=False,
                    label=False):
        if not label:
            filename = 'vector_%s_%d' % (
                partition,
                self.n_kernels) if dynamics else 'fisher_vector_%s_%d' % (
                    partition, self.n_kernels)
            np.save(os.path.join(self.data_dir, filename), fisher_vector)
        else:
            filename = 'label_%s' % partition
            np.save(os.path.join(self.data_dir, filename), fisher_vector)

    def load_vector(self, partition, dynamics=False, label=False, bic=False):
        if not label:
            if not bic:
                filename = 'vector_%s_%d.npy' % (
                    partition, self.n_kernels
                ) if dynamics else 'fisher_vector_%s_%d.npy' % (partition,
                                                                self.n_kernels)
            else:
                filename = 'vector_%s_0.npy' % partition if dynamics else 'fisher_vector_%s_0.npy' % partition
            fisher_vector = np.load(os.path.join(self.data_dir, filename),
                                    allow_pickle=True)
            return fisher_vector
        else:
            filename = 'label_%s.npy' % partition
            label = np.load(os.path.join(self.data_dir, filename))
            return label
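
A minimal usage sketch for FisherVectorGMM, assuming a ./config/model.json file with a 'fisher_vector' section (the constructor reads save_dir and data_dir from it); the feature array below is illustrative:

import numpy as np

features = np.random.randn(100, 64)  # [n_frames, n_feature_dim]

fv_gmm = FisherVectorGMM(n_kernels=4, convars_type='diag', use_bayesian=True)
fv_gmm.fit(features)           # fits the underlying BayesianGaussianMixture
fv = fv_gmm.predict(features)  # Fisher vector, shape [n_kernels, 2 * n_feature_dim]
print(fv.shape)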
Example #12
    else:
        anomaly_tracker.append(0)
        test_cuboids.append(test_images_PED1[i])
test_cuboids_np = np.array(test_cuboids)
test_cuboids_np = test_cuboids_np.astype('float64')
test_cuboids_np *= 255.0 / test_cuboids_np.max()
test_cuboids_t = torch.from_numpy(test_cuboids_np)
test_cuboids_t = test_cuboids_t.permute(0, 4, 1, 2, 3)

batch_size = 1
test_dataloader = torch.utils.data.DataLoader(test_cuboids_t,
                                              shuffle=False,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              drop_last=True)

print('Test dataloader loaded')
model.eval()
test_inputs = []
for y in test_dataloader:
    clust_pred, d_ignore = model(y)
    for i in range(len(clust_pred)):
        test_inputs.append(clust_pred[i].detach().cpu().numpy())

for sample in test_inputs:
    sb_test_scores.append(sb.score(sample.reshape(1, -1)))
print('Done with testing PED1')

np.save('SB_twoStage_PED1_Unsupervised' + str(num_epochs) + '.npy',
        np.array(sb_test_scores))
print('SB results saved')
Example #13
    print('naive GMM with fixed k.')
    start_time = time.time()
    ngmm = GaussianMixture(trav.n_components, covariance_type='diag').fit(data)
    record_ngmm_fix_k.iloc[t,0] = time.time() - start_time
    #print("--- %s seconds ---" % (t))
    #record_ngmm.iloc[t,2] = ngmm.bic(data)
    record_ngmm_fix_k.iloc[t,1] = ngmm.score(data)
    record_ngmm_fix_k.iloc[t,2] = trav.n_components
    #print(ngmm_ll)
    
    
    print('dpgmm.')
    start_time = time.time()
    dpgmm = BayesianGaussianMixture(n_components=max_cluster_num, max_iter=500).fit(data)
    record_dpgmm.iloc[t,0] = time.time() - start_time
    record_dpgmm.iloc[t,1] = dpgmm.score(data)
    # weights_ always has length n_components, so this records the upper
    # bound, not the number of effectively active components.
    record_dpgmm.iloc[t,2] = len(dpgmm.weights_)
    
    
db_summary = pd.concat([record_sort,record_gmm,record_gmm_fix_k,record_ngmm,record_ngmm_fix_k,record_dpgmm])
db_summary['DB'] = name
db_summary['method'] = ['CITE-sort']*record_sort.shape[0] + ['GMM']*record_gmm.shape[0] + ['GMM_fixk']*record_gmm_fix_k.shape[0] + \
['nGMM']*record_ngmm.shape[0] + ['nGMM_fixk']*record_ngmm_fix_k.shape[0] + ['dpgmm']*record_dpgmm.shape[0]

db_summary.to_csv(savepath+'/record_'+name+'.csv')

record_full[name] = db_summary
Example #14
centers = [[1, 1], [-1, -1], [1, -1]]
X,Y = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,random_state=0)
X = StandardScaler().fit_transform(X)

'''
    Bayesian Gaussian mixture
        A variant of plain expectation-maximization: it adds a number of
        priors, so it requires a better understanding of the data set.
        Pros:
            selects some hyperparameters automatically, is less sensitive to
            the number of components, and adds regularization via the priors
        Cons:
            somewhat slower; tuning hyperparameters by cross-validation adds
            computation, and the model carries many implicit biases
'''

cluster = BayesianGaussianMixture(n_components=3, covariance_type='full', tol=0.001,
                                  reg_covar=1e-06, max_iter=100, n_init=1,
                                  init_params='kmeans',
                                  weight_concentration_prior_type='dirichlet_process',
                                  weight_concentration_prior=None,
                                  mean_precision_prior=None, mean_prior=None,
                                  degrees_of_freedom_prior=None, covariance_prior=None,
                                  random_state=None, warm_start=False,
                                  verbose=0, verbose_interval=10)
cluster.fit(X)
cluster.score(X)  # score() ignores labels and returns the mean log-likelihood

'''
    n_components                            number of mixture components
    covariance_type                         covariance type
        spherical                               one variance per component
        diag                                    diagonal covariance
        tied                                    all components share one general covariance matrix
        full                                    a full covariance matrix per component
    tol                                     convergence threshold
    reg_covar                               regularization added to the covariance diagonal
    max_iter                                maximum number of iterations
    n_init                                  number of initializations to run
    init_params                             method used to initialize the weights
    weight_concentration_prior_type         prior type for the weight concentration
    weight_concentration_prior              concentration prior on the weight distribution;
                                            lower values push the model to put its mass
                                            on fewer active components
'''
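Since weight_concentration_prior is the least intuitive parameter, here is a minimal sketch of its effect; the blob data, the two prior values, and the 0.01 weight threshold are illustrative. With a dirichlet_process prior, a small concentration drives the weights of unneeded components toward zero, so the effective number of clusters can be read off the fitted weights:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture

X_demo, _ = make_blobs(n_samples=500, centers=3, cluster_std=0.5, random_state=0)

for prior in (0.001, 1000.0):
    bgm = BayesianGaussianMixture(n_components=10,
                                  weight_concentration_prior_type='dirichlet_process',
                                  weight_concentration_prior=prior,
                                  random_state=0).fit(X_demo)
    active = np.sum(bgm.weights_ > 0.01)  # components with non-negligible weight
    print('prior=%g -> %d active components' % (prior, active))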
Example #15
    x = X[:, j].reshape(-1, 1)
    model = BayesianGaussianMixture(n_components=comps[j],
                                    covariance_type='full')
    model.fit(x)
    both = np.column_stack([x, model.predict(x)])

# attempts to cluster the whole feature space...may or may not be useful
#X_filt = medfilt(X)
num_clusters = np.arange(1, 20 + 1)
scores = []
for num in num_clusters:
    #model = KMeans(n_clusters=num,max_iter=500,n_init=20)
    #model = GaussianMixture(n_components=num, covariance_type='full')
    model = BayesianGaussianMixture(n_components=num, covariance_type='full')
    model.fit(X)
    scores += [-model.score(X)]  # negative mean log-likelihood, used as the loss

plt.plot(num_clusters, scores)
plt.xlim(0, 20)
plt.xticks(num_clusters)
plt.xlabel('number of clusters')
plt.ylabel('loss')
plt.show()

# 4,6,10 clusters?
#model = KMeans(n_clusters=4,max_iter=500,n_init=20)
model = BayesianGaussianMixture(n_components=10, covariance_type='full')
model.fit(X)
plt.hist(model.predict(X))
plt.show()
#print(model.cluster_centers_)  # only available on the KMeans model above