Example #1
def _make_mixture_for_column(colname, col, verbose, Ngauss_init,
                             nmin_multigauss, VB_iter):
    col = col[numpy.isfinite(col)]
    std = col.std()
    if len(col) == 0 or not (std > 0):  # no useful samples
        if verbose > 0: print('    column %s: no data' % colname)
        return None
    elif len(col) < nmin_multigauss or VB_iter == 0:
        if verbose > 0:
            print('    column %s: %s +- %s' % (colname, col.mean(), std))
        return create_gaussian_mixture(
            numpy.array([[col.mean()]], dtype=float),
            [numpy.array([[std]], dtype=float)])
    else:
        means, covs = _make_single_mixture(col, Ngauss_init)
        mix = create_gaussian_mixture(means, covs)

        if verbose > 0: print('    column %s: running VB...' % colname)
        vb = GaussianInference(col.reshape(-1, 1),
                               initial_guess=mix,
                               W0=numpy.eye(1) * 1e10)
        vb_prune = 0.5 * len(vb.data) / vb.K
        vb.run(VB_iter,
               rel_tol=1e-8,
               abs_tol=1e-5,
               prune=vb_prune,
               verbose=verbose > 1)
        mix = vb.make_mixture()
        return mix
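The excerpt above calls a `_make_single_mixture(col, Ngauss_init)` helper that is not shown. A minimal sketch of what such a helper might look like, offered only as an assumption about its contract (it must return per-component means and covariances that `create_gaussian_mixture` accepts):

import numpy

def _make_single_mixture(col, Ngauss_init):
    # hypothetical helper (not from the original project): spread Ngauss_init
    # one-dimensional components evenly over the observed sample range
    grid = numpy.linspace(col.min(), col.max(), Ngauss_init)
    means = grid.reshape(-1, 1)
    # give every component the same width, based on the overall sample spread
    covs = [numpy.array([[col.var()]], dtype=float) for _ in grid]
    return means, covs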
Example #2
def make_long_patch_gaussian_mixture(data, K_g=15, critical_r=2.):
    '''Use samples from Markov chains to form a Gaussian mixture (to be
    used as an initial guess for VB). This is done using "long patches" as
    in [Allen Fred].

    :param data:

        Iterable of matrix-like arrays; the individual items are interpreted
        as points from an individual chain.

    :param K_g:

        Integer; the number of components per chain group.

    :param critical_r:

        Float; the maximum R value a chain group may have.

    '''
    def append_components(means, covs, data, partition):
        # walk through the chain patch by patch; each patch contributes one component
        subdata_start = 0
        for len_subdata in partition:
            subdata = data[subdata_start:subdata_start + len_subdata]
            means.append( np.mean(subdata, axis=0) )
            covs.append ( np.cov (subdata, rowvar=0) )
            subdata_start += len_subdata


    chain_groups = pypmc.mix_adapt.r_value.r_group([np.mean(chain_values[:], axis=0) for chain_values in data],
                                                   [np.cov(chain_values[:], rowvar=0) for chain_values in data],
                                                   len(data[0]), critical_r)

    print('found %i chain groups\n' % len(chain_groups))

    long_patches_means = []
    long_patches_covs = []
    for group in chain_groups:
        # we want K_g components from k_g = len(group) chains
        k_g = len(group)
        if K_g >= k_g:
            # find minimal lexicographic integer partition
            n = partition(K_g, k_g)
            for i, chain_index in enumerate(group):
                # need to partition in n[i] parts
                data_full_chain = data[chain_index]
                # find minimal lexicographic integer partition of chain_length into n[i]
                this_patch_lengths = partition(len(data_full_chain), n[i])
                append_components(long_patches_means, long_patches_covs, data_full_chain, this_patch_lengths)
        else:
            # form one long chain and set k_g = 1
            k_g = 1
            # make one large chain
            data_full_chain = np.vstack([data[i] for i in group])
            # need to partition into K_g parts -- > minimal lexicographic integer partition
            this_patch_lengths = partition(len(data_full_chain), K_g)
            append_components(long_patches_means, long_patches_covs, data_full_chain, this_patch_lengths)

    return create_gaussian_mixture(long_patches_means, long_patches_covs)
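The function above relies on a `partition(N, k)` helper that is not part of the excerpt. A minimal sketch of the "minimal lexicographic integer partition" the comments refer to, plus a toy invocation with made-up chain data:

import numpy as np

def partition(N, k):
    # hypothetical helper (not shown in the excerpt): split N into k integer
    # parts that differ by at most one, larger parts first
    lengths = [N // k] * k
    for j in range(N % k):
        lengths[j] += 1
    return lengths

# toy invocation: three fake 2-d chains of 100 points each
chains = [np.random.randn(100, 2) + shift for shift in (0.0, 0.5, 5.0)]
mix = make_long_patch_gaussian_mixture(chains, K_g=5, critical_r=2.)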
Example #3
def simu(N, K, dim, gof_test=True, pc_select=True, write_results=True, verbose=False):
    # reset the random generator, useful when running several processes in parallel
    seed()
    try:
        if verbose:
            print "init N=",N," dim=",dim
        #Some initialization
        #sc = StandardScaler()
        max_pca_comp = dim/2+1
        # We generate the Gaussian mixture
        #gg = GaussianMixtureGen(dim, weights)
        #centers_star, cov_star = gg.get_params()
        gg = BasicGen(dim)
        centers_star, cov_star = gg.get_params()
        #X_ = gg.sample(N)
        X_ids = gg.sample(N, with_ids=True)
        ids = X_ids[:,-1]
        X_ = X_ids[:,:-1]
        K = len(set(ids))
        weights = 1./K*np.ones(K)
        #We normalize the data for the PCA in the KL aggreg
        #X = sc.fit_transform(X_)
        X = X_
        #We generate the target density f_star from the components
        f_star = GaussMixtureDensity(weights, centers_star, cov_star)
        f_star_sampling = create_gaussian_mixture(centers_star, cov_star, weights)
        ######################
        # KL-AGGREG. ALGORITHM
        ######################
        if verbose:
            print "starting KL-aggreg"
        time_kl_aggreg_start = time()
        dg = DictionaryGenerator(kmeans_k=KMEANS_K, max_pca_comp=max_pca_comp, subspace_cluster_dim=2, pc_select=pc_select)
        X_train_dict_gen = X[:N/2]
        X_train_kl_aggreg = X[N/2:] 
        if gof_test:
            dg.fit(X_train_dict_gen)
            densities_dict = dg.simplify_gof()
        else:
            densities_dict = dg.fit_transform(X_train_dict_gen)
        cl = WeightEstimator(densities_dict=densities_dict)
        cl.fit(X_train_kl_aggreg)
        time_kl_aggreg_stop = time()
        kl_aggreg_weights = cl.pi_final
        kl_aggreg_density = KLaggDensity(kl_aggreg_weights, densities_dict)
        #Compute L2 loss
        kl_aggreg_integrand_L2_loss = IntegrandL2Density(f_star.pdf, kl_aggreg_density.pdf)
        kl_aggreg_l2 = l2_norm(kl_aggreg_integrand_L2_loss.pdf, f_star_sampling, sample_size=SAMPLE_SIZE_NORM_MC,  hypercube_size=HYPERCUBE_SIZE)
        #Compute KL loss
        kl_aggreg_integrand_KL_loss = IntegrandKLDensity(f_star.pdf, kl_aggreg_density.pdf)
        kl_aggreg_kl = kl_norm(kl_aggreg_integrand_KL_loss.pdf, f_star_sampling, sample_size=SAMPLE_SIZE_NORM_MC,  hypercube_size=HYPERCUBE_SIZE)
        if verbose:
            print "KL-aggreg done"
            print "KL-loss", kl_aggreg_kl
            print "L2-loss", kl_aggreg_l2
        #################
        #EM-BIC ALGORITHM
        #################
        if verbose:
            print "starting EM-BIC"
        time_em_start = time()
        _, em_model = mle_bic(X, MAX_EM_BIC_K)
        time_em_stop = time()
        em_density = GaussMixtureDensity(em_model.weights_, em_model.means_, em_model.covariances_)
        #Compute L2 loss
        em_integrand_L2_loss = IntegrandL2Density(f_star.pdf, em_density.pdf)
        em_l2 = l2_norm(em_integrand_L2_loss.pdf, f_star_sampling, sample_size=SAMPLE_SIZE_NORM_MC,  hypercube_size=HYPERCUBE_SIZE)
        #Compute KL loss
        em_integrand_KL_loss = IntegrandKLDensity(f_star.pdf, em_density.pdf)
        em_kl = kl_norm(em_integrand_KL_loss.pdf, f_star_sampling, sample_size = SAMPLE_SIZE_NORM_MC,  hypercube_size=HYPERCUBE_SIZE)
        if verbose:
            print "EM-BIC done"
            print "KL-loss", em_kl
            print "L2-loss", em_l2
        #################
        #KDE-CV ALGORITHM
        #################
        if verbose:
            print "starting KDE-CV"
        kde = KdeCV(n_jobs = 1, cv=10, bw = np.linspace(0.01, 1.0, 20))
        time_kde_start = time()
        kde.fit(X)
        time_kde_stop = time()
        kde_integrand_KL_loss = IntegrandKLDensity(f_star.pdf, kde.pdf)
        kde_kl = kl_norm(kde_integrand_KL_loss.pdf, f_star_sampling, sample_size = SAMPLE_SIZE_NORM_MC,  hypercube_size=HYPERCUBE_SIZE)   
        kde_integrand_L2_loss = IntegrandL2Density(f_star.pdf, kde.pdf)
        kde_l2 = l2_norm(kde_integrand_L2_loss.pdf, f_star_sampling, sample_size=SAMPLE_SIZE_NORM_MC,  hypercube_size=HYPERCUBE_SIZE)  
        #Compute times
        kl_aggreg_time = time_kl_aggreg_stop-time_kl_aggreg_start
        em_bic_time = time_em_stop-time_em_start
        kde_cv_time = time_kde_stop-time_kde_start
        if verbose:
            print "KDE-CV done"
            print "KL-loss", kde_kl
            print "L2-loss", kde_l2
        #Writing results
        if write_results:
            print "OK, writing results"
            pickle.dump({"K" : K,
                         "p" : dim,
                         "N" : N,
                         "MLE_l2" : kl_aggreg_l2,
                         "MLE_KL" : kl_aggreg_kl,
                         "MLE_time" : kl_aggreg_time,
                         "EM_l2" : em_l2,
                         "EM_KL" : em_kl,
                         "EM_time" : em_bic_time,
                         "KdeCV_l2" : kde_l2,
                         "KdeCV_KL" : kde_kl,
                         "KdeCV_time" : kde_cv_time
                     }, open(FOLDER +
                             "res_" + "K" + str(K) + "p" + str(dim) + "N" + str(N) +"_"+ type_simu_to_str(gof_test, pc_select)+"_"+str(uuid.uuid4()), "wb"))
        else:
            # we print the results, for testing.
            print {"K" : K,
                         "p" : dim,
                         "N" : N,
                         "MLE_l2" : kl_aggreg_l2,
                         "MLE_KL" : kl_aggreg_kl,
                         "MLE_time" : kl_aggreg_time,
                         "EM_l2" : em_l2,
                         "EM_KL" : em_kl,
                         "EM_time" : em_bic_time,
                         "KdeCV_l2" : kde_l2,
                         "KdeCV_KL" : kde_kl,
                         "KdeCV_time" : kde_cv_time
                     }
        return 1
    except Exception as e:
        print e
        return 0
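`mle_bic` is not defined in this excerpt; the attributes read from `em_model` afterwards (`weights_`, `means_`, `covariances_`) match scikit-learn's `GaussianMixture`, so a plausible stand-in, offered only as an assumption rather than the original implementation, is:

from sklearn.mixture import GaussianMixture

def mle_bic(X, max_k):
    # hypothetical stand-in: fit mixtures with 1..max_k components and keep
    # the model with the lowest BIC
    best_bic, best_model = None, None
    for k in range(1, max_k + 1):
        model = GaussianMixture(n_components=k).fit(X)
        bic = model.bic(X)
        if best_bic is None or bic < best_bic:
            best_bic, best_model = bic, model
    return best_bic, best_model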
Example #4
def sample(self, N):
    mixture = create_gaussian_mixture(self.centers, self.cov, self.weights)
    return mixture.propose(N)
Example #5
            for j in range(remainder):
                this_patch_lengths[j] += 1
            start = 0
            stop  = this_patch_lengths[0]
            for next_len in this_patch_lengths:
                    this_data = data_full_chain[start:stop]
                    long_patches_means.append( np.mean(this_data, axis=0) )
                    if vb_initialization == 'long_patches_rescaled_cov':
                        long_patches_covs.append ( np.cov(this_data, rowvar=0) * cov_rescale_factor)
                    elif vb_initialization == 'long_patches':
                        long_patches_covs.append ( np.cov(this_data, rowvar=0) )
                    else:
                        raise RuntimeError('Unknown Error')
                    start += next_len
                    stop  += next_len
    mcmcmix = create_gaussian_mixture(long_patches_means, long_patches_covs)
    try:
        vb = pypmc.mix_adapt.variational.GaussianInference(data, initial_guess=mcmcmix, nu=(np.zeros(len(mcmcmix))+100.) )
    except ValueError:
        vb = pypmc.mix_adapt.variational.GaussianInference(data, initial_guess=mcmcmix)
    # set vb_prune
    vb_prune = .5 * len(data)/len(mcmcmix)
    params.update((('vb_prune', vb_prune),))
# else --> unrecognized initialization scheme
else:
    raise ValueError("I don't know what you mean by `vb_initialization` = \"%s\"" %vb_initialization)

# run the variational bayes
try:
    vb_converged = vb.run(N_max_vb, verbose=True, rel_tol=vb_rel_tol, abs_tol=vb_abs_tol, prune=vb_prune)
except NameError:
Example #6
    def run_iter(
        self,
        num_gauss_samples=400,
        max_ncalls=100000,
        min_ess=400,
        max_improvement_loops=4,
        heavytail_laplaceapprox=True,
        verbose=True,
    ):
        """
        Iterative version of run(). See documentation there.
        Returns current samples on each iteration.
        """
        paramnames = self.paramnames
        loglike = self.loglike
        transform = self.transform

        ndim = len(paramnames)
        optu, cov, invcov = self.optu, self.cov, self.invcov
        # for numerical stability, use 1e260, so that we can go down by 1e-100,
        # but up by 1e600
        self.Loffset = self.optL  #+ 600

        # first iteration: create a single gaussian and importance-sample
        if self.log:
            self.logger.info("Initiating gaussian importance sampler")

        def log_target(u):
            """ log-posterior to sample from """
            if (u > 1).any() or (u < 0).any():
                return -np.inf
            p = transform(u)
            L = loglike(p)
            return L - self.Loffset

        if not heavytail_laplaceapprox:
            initial_proposal = Gauss(optu, cov)
        else:
            # make a few gaussians, in case the fit errors were too narrow
            means, covs, weights = _make_initial_proposal(optu, cov)
            initial_proposal = create_gaussian_mixture(means, covs, weights)

        mixes = [initial_proposal]

        N = num_gauss_samples
        Nhere = N // self.mpi_size
        if self.mpi_size > 1:
            SequentialIS = ImportanceSampler
            from pypmc.tools.parallel_sampler import MPISampler
            sampler = MPISampler(SequentialIS,
                                 target=log_target,
                                 proposal=initial_proposal,
                                 prealloc=Nhere)
        else:
            sampler = ImportanceSampler(target=log_target,
                                        proposal=initial_proposal,
                                        prealloc=Nhere)

        if self.log:
            self.logger.info("    sampling %d ..." % N)
        np.seterr(over="warn")
        sampler.run(Nhere)
        self.ncall += Nhere * self.mpi_size

        samples, weights = self._collect_samples(sampler)
        assert weights.sum() > 0, 'All samples have weight zero.'

        vbmix = None
        for it in range(max_improvement_loops):
            ess_fraction = ess(weights)
            if self.log:
                self.logger.info("    sampling efficiency: %.3f%%" %
                                 (ess_fraction * 100))

            if it % 3 == 0:
                if self.log:
                    self.logger.info("Optimizing proposal (from scratch) ...")
                mix = _make_proposal(samples, weights, optu, cov, invcov)
                vb = GaussianInference(samples,
                                       weights=weights,
                                       initial_guess=mix,
                                       W0=np.eye(ndim) * 1e10)
                vb_prune = 0.5 * len(vb.data) / vb.K
            else:
                if self.log:
                    self.logger.info("Optimizing proposal (from previous) ...")
                prior_for_proposal_update = vb.posterior2prior()
                prior_for_proposal_update.pop('alpha0')
                vb = GaussianInference(samples,
                                       initial_guess=vbmix,
                                       weights=weights,
                                       **prior_for_proposal_update)

            if self.log:
                self.logger.info('    running variational Bayes ...')
            vb.run(1000,
                   rel_tol=1e-8,
                   abs_tol=1e-5,
                   prune=vb_prune,
                   verbose=False)
            vbmix = vb.make_mixture()
            if self.log:
                self.logger.info('    reduced from %d to %d components' %
                                 (len(mix.components), len(vbmix.components)))

            sampler.proposal = vbmix

            if self.log:
                self.logger.info("Importance sampling %d ..." % N)
            sampler.run(N // self.mpi_size)
            self.ncall += (N // self.mpi_size) * self.mpi_size
            mixes.append(vbmix)

            samples, weights = self._collect_samples(sampler)
            ess_fraction = ess(weights)
            if self.log:
                self.logger.debug("    sampling efficiency: %.3f%%" %
                                  (ess_fraction * 100))
                self.logger.debug("    obtained %.0f new effective samples" %
                                  (ess_fraction * len(weights)))

            samples, weights = self._collect_samples(sampler,
                                                     all=True,
                                                     mixes=mixes)
            ess_fraction = ess(weights)
            Ndone = ess_fraction * len(weights)

            result = self._update_results(samples, weights)
            if Ndone >= min_ess:
                if self.log:
                    self.logger.info(
                        "Status: Have %d total effective samples, done." %
                        Ndone)
                yield result
                break
            elif self.ncall > max_ncalls:
                if self.log:
                    self.logger.info(
                        "Status: Have %d total effective samples, reached max number of calls."
                        % Ndone)
                yield result
                break
            else:
                N = int(1.4 * min(max_ncalls - self.ncall, N))
                if self.log:
                    self.logger.info(
                        "Status: Have %d total effective samples, sampling %d next."
                        % (Ndone, N))
                yield result
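Because `run_iter()` is a generator that yields the current result after each improvement loop, a caller drives it with a for-loop. A minimal consumption sketch; here `sampler` stands for an instance of the class that defines `run_iter`, and the printed handling is purely illustrative:

# hypothetical driver: iterate and receive the current result each loop
for result in sampler.run_iter(num_gauss_samples=400,
                               min_ess=400,
                               max_improvement_loops=4):
    print('intermediate result:', result)  # inspect or checkpoint here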
Example #7
def _make_proposal(samples, weights, optu, cov, invcov):
    # split samples into 3 equally large groups, by L
    w1, w2 = np.percentile(weights[weights > 0], [33, 66])

    means = [optu]
    covs = [cov]
    chunk_weights = [1]
    # for each group (top: L1 < L, mid: L1 > L > L2, bottom: L < L2)

    cov_guess = cov
    for mask in weights >= w1, ~np.logical_or(weights >= w2,
                                              weights <= w1), weights <= w2:
        mask = np.logical_and(mask, weights > 0)
        if not mask.any():
            continue
        # assume H as distance metric
        # find most distant point from ML (u)
        dists = scipy.spatial.distance.cdist(samples[mask, :], [optu],
                                             'mahalanobis',
                                             VI=invcov).flatten()
        # maximum size of clusters:

        handled = np.zeros(len(dists), dtype=bool)

        # repeat recursively until no points left
        while not handled.all():
            samples_todo = samples[mask, :][~handled, :]

            # find most distant point, which is used as the center
            i = dists[~handled].argmax()
            # add all points within distance until peak is included
            d = dists[~handled][i]
            #    but include at most a distance of maxdistance

            dists_todo = scipy.spatial.distance.cdist(samples_todo,
                                                      [samples_todo[i, :]],
                                                      'mahalanobis',
                                                      VI=invcov).flatten()
            selected = dists_todo <= d
            cluster = samples_todo[selected]
            #print("  accreted %d (of %d to do)" % (len(cluster), (~handled).sum()), 'from', samples_todo[i, :])
            handled[~handled] = selected

            if len(cluster) < cluster.shape[1]:
                continue

            # print(np.diag(np.var(cluster, axis=0)))
            # cov_guess = np.diag(np.var(cluster, axis=0))
            try:
                cov_local = np.cov(cluster, rowvar=0)
                # check that it is positive-definite
                np.linalg.cholesky(cov_local)
                if not np.all(np.linalg.eigvals(cov_local) > 0):
                    continue
            except np.linalg.LinAlgError:
                cov_local = cov_guess
                # reject, too few points in cluster
                continue

            assert np.isfinite(cluster).all(), cluster[~np.isfinite(cluster)]
            assert np.isfinite(cov_local).all(), (
                cov_local, cov_local[np.isfinite(cov_local)])
            means.append(np.mean(cluster, axis=0))
            covs.append(cov_local)
            chunk_weights.append(1)

    chunk_weights = np.asarray(chunk_weights) / np.sum(chunk_weights)

    mix = create_gaussian_mixture(means, covs, weights=chunk_weights)
    return mix
Example #8
weights = np.ones(K) / K
means = []
covariances = []
for j in range(K):
    mean = np.zeros(ndim) + 0.5
    mean[0] = j * 1. / K
    means.append(mean)
    sigma = 10**(mean[0] * 20 - 10)
    sigma = max(sigma, 10**-difficulty)
    sigma = min(sigma, 3)
    cov = np.eye(ndim) * sigma
    cov[0, 0] = (1. / K)
    #print mean, cov
    covariances.append(cov)

mix = create_gaussian_mixture(means, covariances, weights)

N = 40000
sampler = pypmc.sampler.importance_sampling.ImportanceSampler(log_target,
                                                              mix,
                                                              prealloc=N)
print('importance sampling ...')
#print('    drawing samples...')
#samples = mix.propose(N, numpy.random)
#print('    computing likelihood ...')
#weights_target = loglikelihood(samples)
#print('    computing weights...')
#weights_proposal = numpy.array([mix.evaluate(sample) for sample in samples])
#weights = exp(weights_target - weights_proposal)
sampler.run(N)
print('importance sampling done')
        # need to partition into K_g parts -- > minimal lexicographic integer partition
        this_patch_lengths = [chain_length // K_g for j in range(K_g)]
        remainder = chain_length % K_g
        for j in range(remainder):
            this_patch_lengths[j] += 1
        start = 0
        stop  = this_patch_lengths[0]
        for next_len in this_patch_lengths:
                this_data = data_full_chain[start:stop]
                long_patches_means.append( np.mean(this_data, axis=0) )
                long_patches_covs.append ( np.cov (this_data, rowvar=0) )
                start += next_len
                stop  += next_len


hierarchical_init = create_gaussian_mixture(long_patches_means, long_patches_covs)

plt.figure()
plt.title('hierarchical init')
plot_mixture(hierarchical_init, 0,1)
plotfile.savefig()

# ----------------------- hierarchical clustering --------------------------------------

hc = pypmc.mix_adapt.hierarchical.Hierarchical(mcmcmix, hierarchical_init, verbose=True)
hc_converged = hc.run(kill=kill_in_hc)
if hc_converged:
    statusfile.write('hierarchical clustering converged in step %i\n' %(hc_converged) )
else:
    statusfile.write('hierarchical clustering did not converge\n')
reduced_proposal = hc.g
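
The reduced mixture can then serve as the proposal for a fresh importance-sampling run, mirroring the `ImportanceSampler` call earlier in this example. An illustrative follow-up, not part of the excerpt:

# reuse the reduced mixture as the proposal of a new importance sampler
sampler = pypmc.sampler.importance_sampling.ImportanceSampler(log_target,
                                                              reduced_proposal,
                                                              prealloc=N)
sampler.run(N)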