示例#1
0
def bgplvm_simulation_missing_data(
        optimize=True, verbose=1, plot=True, plot_sim=False,
        max_iters=2e4, percent_missing=.1):
    """Fit a missing-data Bayesian GPLVM to simulated observations.

    The first dataset produced by ``_simulate_matern`` is used. Every entry
    of that dataset is replaced by NaN independently with probability
    ``percent_missing`` (the value is passed straight to
    ``numpy.random.binomial`` as the success probability, so it is a
    fraction in [0, 1], not a percentage).

    :param optimize: run BFGS optimisation on the model before returning.
    :param verbose: forwarded as ``messages`` to ``optimize``.
    :param plot: plot the latent space and the kernel's ARD parameters.
    :param plot_sim: forwarded to ``_simulate_matern``.
    :param max_iters: iteration cap forwarded to ``optimize``.
    :param percent_missing: probability of masking each individual entry.
    :returns: the model; the unmasked data is attached as ``Yreal``.
    """
    from GPy import kern
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

    D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
    _, _, datasets = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
    Y_full = datasets[0]
    linear_kernel = kern.Linear(Q, ARD=True)

    # Boolean mask: True marks an entry that will be hidden from the model.
    hidden = _np.random.binomial(
        1, percent_missing, size=Y_full.shape).astype(bool)
    Y_observed = Y_full.copy()
    Y_observed[hidden] = _np.nan

    model = BayesianGPLVMMiniBatch(
        Y_observed,
        Q,
        init="random",
        num_inducing=num_inducing,
        kernel=linear_kernel,
        missing_data=True,
    )

    # Keep the complete data on the model so callers can compare against it.
    model.Yreal = Y_full

    if optimize:
        print("Optimizing model:")
        model.optimize('bfgs', messages=verbose, max_iters=max_iters,
                       gtol=.05)
    if plot:
        model.X.plot("BGPLVM Latent Space 1D")
        model.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
    return model
示例#2
0
    def test_missing_data(self):
        """Gradient-check missing-data Bayesian GPLVMs and check prediction.

        Builds two ``BayesianGPLVMMiniBatch`` models (linear + white and
        RBF + white kernels) on simulated data in which every entry is masked
        with probability 0.9, checks the gradients of both models, and
        verifies that the median returned by ``predict_quantiles`` equals the
        predictive mean of the linear model.
        """
        from GPy import kern
        from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
        from GPy.examples.dimensionality_reduction import _simulate_matern

        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
        _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
        Y = Ylist[0]

        # Each entry is masked independently with probability 0.9.
        inan = np.random.binomial(1, .9, size=Y.shape).astype(bool)
        Ymissing = Y.copy()
        Ymissing[inan] = np.nan

        k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2))
        m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
                          kernel=k, missing_data=True)
        assert m.checkgrad()
        mul, varl = m.predict(m.X)

        k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2))
        m2 = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
                          kernel=k, missing_data=True)
        # Check the RBF model itself; the original re-checked `m` here, so
        # the gradients of `m2` were never exercised.
        assert m2.checkgrad()
        m2.kern.rbf.lengthscale[:] = 1e6
        m2.X[:] = m.X.param_array
        m2.likelihood[:] = m.likelihood[:]
        m2.kern.white[:] = m.kern.white[:]
        # NOTE(review): the prediction below is taken from `m`, not `m2`, so
        # the two assert_allclose calls compare the linear model with itself
        # and the parameter copying into `m2` above has no effect on them.
        # Left unchanged because the intended comparison is unclear - confirm.
        mu, var = m.predict(m.X)
        np.testing.assert_allclose(mul, mu)
        np.testing.assert_allclose(varl, var)

        # The 50% quantile is compared against the predictive mean.
        q50 = m.predict_quantiles(m.X, (50,))
        np.testing.assert_allclose(mul, q50[0])
def bgplvm_simulation_missing_data_stochastics(
        optimize=True, verbose=1, plot=True, plot_sim=False,
        max_iters=2e4, percent_missing=0.1, d=13, batchsize=2):
    """Fit a stochastic missing-data Bayesian GPLVM to simulated data.

    The first dataset returned by ``_simulate_matern`` is masked entry-wise
    with NaN (each entry independently, with probability
    ``percent_missing``) and handed to a ``BayesianGPLVMMiniBatch`` created
    with ``stochastic=True`` and the given ``batchsize``.

    :param optimize: run BFGS optimisation on the model before returning.
    :param verbose: forwarded as ``messages`` to ``optimize``.
    :param plot: plot the latent space and the kernel's ARD parameters.
    :param plot_sim: forwarded to ``_simulate_matern``.
    :param max_iters: iteration cap forwarded to ``optimize``.
    :param percent_missing: probability (fraction in [0, 1]) of masking each
        individual entry.
    :param d: first dimension argument (``D1``) passed to
        ``_simulate_matern``.
    :param batchsize: forwarded to the model constructor.
    :returns: the model; the unmasked data is attached as ``Yreal``.
    """
    from GPy import kern
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

    D1, D2, D3, N, num_inducing, Q = d, 5, 8, 400, 3, 4
    _, _, datasets = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
    Y_full = datasets[0]
    linear_kernel = kern.Linear(Q, ARD=True)

    # Boolean mask: True marks an entry that will be hidden from the model.
    hidden = _np.random.binomial(
        1, percent_missing, size=Y_full.shape).astype(bool)
    Y_observed = Y_full.copy()
    Y_observed[hidden] = _np.nan

    model_options = dict(
        init="random",
        num_inducing=num_inducing,
        kernel=linear_kernel,
        missing_data=True,
        stochastic=True,
        batchsize=batchsize,
    )
    model = BayesianGPLVMMiniBatch(Y_observed, Q, **model_options)

    # Keep the complete data on the model so callers can compare against it.
    model.Yreal = Y_full

    if optimize:
        print("Optimizing model:")
        model.optimize("bfgs", messages=verbose, max_iters=max_iters,
                       gtol=0.05)
    if plot:
        model.X.plot("BGPLVM Latent Space 1D")
        model.kern.plot_ARD()
    return model
示例#4
0
    def test_missing_data(self):
        """Gradient-check missing-data Bayesian GPLVMs for two kernels.

        Simulated data is masked entry-wise with NaN (probability 0.9 per
        entry); a model is then built and gradient-checked first with a
        linear + white kernel and then with an RBF + white kernel.
        """
        from GPy import kern
        from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
        from GPy.examples.dimensionality_reduction import _simulate_matern

        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
        _, _, datasets = _simulate_matern(D1, D2, D3, N, num_inducing, False)
        Y_full = datasets[0]

        # True marks an entry hidden from the model.
        hidden = np.random.binomial(1, .9, size=Y_full.shape).astype(bool)
        Y_observed = Y_full.copy()
        Y_observed[hidden] = np.nan

        # Same procedure for both base kernels, linear first and then RBF.
        for base_kernel in (kern.Linear, kern.RBF):
            kernel = base_kernel(Q, ARD=True) + kern.White(Q, np.exp(-2))
            model = BayesianGPLVMMiniBatch(
                Y_observed,
                Q,
                init="random",
                num_inducing=num_inducing,
                kernel=kernel,
                missing_data=True,
            )
            assert model.checkgrad()
示例#5
0
    def test_missing_data(self):
        """Check gradients of missing-data Bayesian GPLVMs.

        A linear + white and an RBF + white model are built, in that order,
        on simulated data whose entries were each replaced by NaN with
        probability 0.9; ``checkgrad`` must succeed for both.
        """
        from GPy import kern
        from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
        from GPy.examples.dimensionality_reduction import _simulate_matern

        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
        _, _, simulated = _simulate_matern(D1, D2, D3, N, num_inducing, False)
        Y_clean = simulated[0]

        nan_mask = np.random.binomial(1, .9, size=Y_clean.shape).astype(bool)
        Y_masked = Y_clean.copy()
        Y_masked[nan_mask] = np.nan

        def build_model(kernel):
            # Both models share every setting except the kernel.
            return BayesianGPLVMMiniBatch(
                Y_masked, Q, init="random", num_inducing=num_inducing,
                kernel=kernel, missing_data=True)

        linear_model = build_model(
            kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)))
        assert linear_model.checkgrad()

        rbf_model = build_model(
            kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)))
        assert rbf_model.checkgrad()
示例#6
0
    def test_missing_data(self):
        """Gradient-check missing-data Bayesian GPLVMs and check prediction.

        Two ``BayesianGPLVMMiniBatch`` models are built on simulated data
        whose entries were each replaced by NaN with probability 0.9: one
        with a linear + white kernel and one with an RBF + white kernel.
        Both are gradient-checked, and the median from ``predict_quantiles``
        is compared with the predictive mean of the linear model.
        """
        from GPy import kern
        from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
        from GPy.examples.dimensionality_reduction import _simulate_matern

        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
        _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
        Y = Ylist[0]

        # Mask each entry independently with probability 0.9.
        inan = np.random.binomial(1, .9, size=Y.shape).astype(bool)
        Ymissing = Y.copy()
        Ymissing[inan] = np.nan

        k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2))
        m = BayesianGPLVMMiniBatch(Ymissing,
                                   Q,
                                   init="random",
                                   num_inducing=num_inducing,
                                   kernel=k,
                                   missing_data=True)
        assert m.checkgrad()
        mul, varl = m.predict(m.X)

        k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2))
        m2 = BayesianGPLVMMiniBatch(Ymissing,
                                    Q,
                                    init="random",
                                    num_inducing=num_inducing,
                                    kernel=k,
                                    missing_data=True)
        # The RBF model must be the one checked here; the original asserted
        # `m.checkgrad()` a second time and never exercised `m2`.
        assert m2.checkgrad()
        m2.kern.rbf.lengthscale[:] = 1e6
        m2.X[:] = m.X.param_array
        m2.likelihood[:] = m.likelihood[:]
        m2.kern.white[:] = m.kern.white[:]
        # NOTE(review): this predicts with `m` rather than `m2`, so the two
        # comparisons below check the linear model against itself and the
        # values copied into `m2` are never used by an assertion. Kept as is
        # because the intended comparison cannot be determined here - confirm.
        mu, var = m.predict(m.X)
        np.testing.assert_allclose(mul, mu)
        np.testing.assert_allclose(varl, var)

        # The 50% quantile is compared against the predictive mean.
        q50 = m.predict_quantiles(m.X, (50, ))
        np.testing.assert_allclose(mul, q50[0])
示例#7
0
    def __init__(self, Ylist, input_dim, X=None, X_variance=None,
                 initx = 'PCA', initz = 'permute',
                 num_inducing=10, Z=None, kernel=None,
                 inference_method=None, likelihoods=None, name='mrd',
                 Ynames=None, normalizer=False, stochastic=False, batchsize=10):
        """Build one Bayesian GPLVM per dataset, all sharing X and Z.

        :param Ylist: sequence of observed datasets, or a dict mapping a name
            to each dataset (the keys then replace ``Ynames``). All datasets
            must have the same number of rows.
        :param input_dim: dimensionality of the latent space.
        :param X: initial latent positions; when None they are produced by
            ``self._init_X(initx, Ylist)``.
        :param X_variance: forwarded to the parent constructor and to every
            per-dataset model.
        :param initx: initialisation method handed to ``self._init_X``.
        :param initz: initialisation method handed to ``self._init_Z``.
        :param num_inducing: requested number of inducing inputs;
            ``self.num_inducing`` is afterwards set to the number of rows of
            the initialised Z.
        :param Z: NOTE(review): the supplied value is overwritten by
            ``self._init_Z(initz, X)`` before it is ever read - confirm that
            this is intended.
        :param kernel: None (an ARD RBF kernel is created per dataset), a
            single ``Kern`` (copied once per dataset), or a sequence holding
            one ``Kern`` per dataset.
        :param inference_method: None (one ``VarDTC`` per dataset) or an
            ``InferenceMethodList`` with one entry per dataset.
        :param likelihoods: None (one ``Gaussian`` per dataset) or a sequence
            of likelihoods, one per dataset.
        :param name: not used by this constructor; the parent is always
            named 'manifold relevance determination'.
        :param Ynames: names of the per-dataset models; defaults to
            'Y0', 'Y1', ...
        :param normalizer: forwarded to every per-dataset model.
        :param stochastic: forwarded to every per-dataset model.
        :param batchsize: an int used for every dataset, or an iterable
            yielding one batch size per dataset.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.input_dim = input_dim
        self.num_inducing = num_inducing

        if isinstance(Ylist, dict):
            Ynames, Ylist = zip(*Ylist.items())

        self.logger.debug("creating observable arrays")
        self.Ylist = [ObsAr(Y) for Y in Ylist]
        # The parent constructor below receives a single dataset `Y`. On
        # Python 2 that name leaked out of the comprehension above (bound to
        # the last dataset); Python 3 does not leak it, so bind it explicitly
        # to keep the same behaviour and avoid a NameError.
        Y = Ylist[-1]

        if Ynames is None:
            self.logger.debug("creating Ynames")
            Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
        self.names = Ynames
        assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"

        if inference_method is None:
            # `range` instead of the Python-2-only `xrange`.
            self.inference_method = InferenceMethodList([VarDTC() for _ in range(len(self.Ylist))])
        else:
            assert isinstance(inference_method, InferenceMethodList), "please provide one inference method per Y in the list and provide it as InferenceMethodList, inference_method given: {}".format(inference_method)
            self.inference_method = inference_method

        if X is None:
            X, fracs = self._init_X(initx, Ylist)
        else:
            # Same per-dimension variance of the given X for every dataset.
            fracs = [X.var(0)]*len(Ylist)

        Z = self._init_Z(initz, X)
        self.Z = Param('inducing inputs', Z)
        self.num_inducing = self.Z.shape[0] # ensure M==N if M>N

        # sort out the kernels
        self.logger.info("building kernels")
        if kernel is None:
            from ..kern import RBF
            kernels = [RBF(input_dim, ARD=1, lengthscale=1./fracs[i]) for i in range(len(Ylist))]
        elif isinstance(kernel, Kern):
            # Every dataset gets its own independent copy of the kernel.
            kernels = [kernel.copy() for _ in Ylist]
        else:
            assert len(kernel) == len(Ylist), "need one kernel per output"
            assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!"
            kernels = kernel

        self.variational_prior = NormalPrior()
        #self.X = NormalPosterior(X, X_variance)

        if likelihoods is None:
            likelihoods = [Gaussian(name='Gaussian_noise') for _ in Ylist]

        self.logger.info("adding X and Z")
        super(MRD, self).__init__(Y, input_dim, X=X, X_variance=X_variance, num_inducing=num_inducing,
                 Z=self.Z, kernel=None, inference_method=self.inference_method, likelihood=Gaussian(),
                 name='manifold relevance determination', normalizer=None,
                 missing_data=False, stochastic=False, batchsize=1)

        self._log_marginal_likelihood = 0

        # The parent's own kernel and likelihood are discarded; each
        # per-dataset model created below carries its own.
        self.unlink_parameter(self.likelihood)
        self.unlink_parameter(self.kern)
        del self.kern
        del self.likelihood

        self.num_data = Ylist[0].shape[0]
        if isinstance(batchsize, int):
            batchsize = itertools.repeat(batchsize)

        self.bgplvms = []

        # Built-in `zip` instead of the Python-2-only `itertools.izip`;
        # iteration stops with the shortest (finite) argument.
        per_dataset = zip(Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize)
        for i, (n, k, lik, Y, im, bs) in enumerate(per_dataset):
            assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another"
            # Enable missing-data handling only for datasets containing NaN.
            md = np.isnan(Y).any()
            spgp = BayesianGPLVMMiniBatch(Y, input_dim, X, X_variance,
                                          Z=Z, kernel=k, likelihood=lik,
                                          inference_method=im, name=n,
                                          normalizer=normalizer,
                                          missing_data=md,
                                          stochastic=stochastic,
                                          batchsize=bs)
            spgp.kl_factr = 1./len(Ynames)
            # Replace the sub-model's own Z and X with the shared ones.
            spgp.unlink_parameter(spgp.Z)
            spgp.unlink_parameter(spgp.X)
            del spgp.Z
            del spgp.X
            spgp.Z = self.Z
            spgp.X = self.X
            # presumably positions 0 and 1 hold the shared X and Z, hence
            # the offset of 2 - TODO confirm against the parent class.
            self.link_parameter(spgp, i+2)
            self.bgplvms.append(spgp)

        self.posterior = None
        self.logger.info("init done")
示例#8
0
def create_model(Y,
                 X_init=None,
                 num_inducing=10,
                 nonlinear_dims=5,
                 linear_dims=0,
                 white_variance=1):
    """
    Create a BayesianGPLVM model for the expression values in Y.

    Y has the cells on the rows and the genes across the columns:
        Y.shape == (#cells, #genes)
    Y may be any object exposing its data as `Y.values` (the data is copied)
    or anything `np.asarray(Y, float)` accepts.

    X_init is the initial latent space for the model.
    Usually it is initialized using simulation.run_methods:
        X_init, dims = run_methods(Y, methods)

    num_inducing is the number of inducing inputs. It is a number `M`
    between `0` and the number of datapoints you have and controls
    the complexity of your model. We usually use 10 to 20
    inducing inputs, but if you are having trouble with accuracy in
    your found landscape, you can try to increase this number. Note that
    the method gets slower with higher numbers of inducing inputs.
    Also, if you use RNASeq data, it is recommended to use a
    lower number (i.e. 10) of inducing inputs so the BayesianGPLVM is
    forced to generalise over patterns and cannot explain the zeros in the
    data by inducing inputs.

    nonlinear_dims is the number of latent dimensions modelled as a nonlinear
    relationship between latent space and observed gene expression values
    along the samples. This value is ignored if X_init is given; the number
    of nonlinear dimensions is then the number of columns of X_init. If
    X_init is not given, it is created by PCA.

    linear_dims is the number of linear dimensions to add to the latent space.
    Linear dimensions are used for modelling linear relationships in the latent
    space independently from the non-linear ones. That is, the last linear_dims
    dimensions in the latent space will be modelled by a linear kernel. We
    recommend first running without linear dimensions to see what the
    BayesianGPLVM can learn. If there is a considerable amount of confounding
    variation, the linear dimensions can help to find this variation
    and explain it away from the rest. It can also lead to unexpected results...

    white_variance is the variance (float) of a white noise kernel added to
    the kernel. If it is None, no white kernel is added to the analysis.

    Missing Data: If you have missing data, you can assign np.nan to the
    missing values in Y and the BayesianGPLVM will assume the data is
    missing at random. This makes the runtime of the method depend on the
    dimensionality and will slow down progress significantly. Thus,
    only include missing data in the model if you are certain you want to
    use it.

    Usage example:

        from .simulation import run_methods
        Y -= Y.mean(0) # Normalization of data, zero mean is usually what you want.
        Y /= Y.std(0) # Beware of your data and decide whether you want to normalize the variances!
        X_init, dims = run_methods(Y, methods)
        m = create_model(Y, X_init, num_inducing=10)
        optimize_model(m)

    returns a BayesianGPLVM model for the given data matrix Y.
    """
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
    from GPy.kern import Linear, RBF, Add, White
    from GPy.util.linalg import pca

    # Prefer the `.values` array when Y provides one; anything without a
    # usable `.values.copy()` is converted to a float array instead. Only
    # AttributeError is caught so that unrelated failures (and
    # KeyboardInterrupt/SystemExit) are no longer silently swallowed.
    try:
        Y = Y.values.copy()
    except AttributeError:
        Y = np.asarray(Y, float).copy()

    if X_init is None:
        X_init = pca(Y, nonlinear_dims)[0]

    kernels = []

    if linear_dims > 0:
        # NOTE(review): Q counts the additional linear dimensions, but X_init
        # (passed to the model as X below) only has the nonlinear columns -
        # confirm that the model accepts an X narrower than Q.
        Qlin = linear_dims
        Q = X_init.shape[1] + Qlin
        kernels.extend([
            RBF(Q - Qlin, ARD=True, active_dims=np.arange(0, X_init.shape[1])),
            Linear(Qlin, ARD=True, active_dims=np.arange(X_init.shape[1], Q))
        ])
    else:
        Q = X_init.shape[1]
        kernels.append(
            RBF(Q, ARD=True, active_dims=np.arange(0, X_init.shape[1])))

    if white_variance is not None:
        kernels.append(White(Q, variance=white_variance))

    # A single kernel is used directly; several are summed.
    if len(kernels) > 1:
        kernel = Add(kernels)
    else:
        kernel = kernels[0]

    # Missing-data handling is switched on only when Y contains NaN.
    m = BayesianGPLVMMiniBatch(Y,
                               Q,
                               X=X_init,
                               kernel=kernel,
                               num_inducing=num_inducing,
                               missing_data=np.any(np.isnan(Y)))

    return m
示例#9
0
from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

# Replicate driver: for every seed, hide a random 20% of the response
# entries, fit a one-dimensional missing-data Bayesian GPLVM to the rest and
# write the removed indices plus the predictive mean and variance to CSV.
full_responses = np.genfromtxt('data/npi-responses.csv', delimiter=',')

# One fixed RNG seed per replicate, so every replicate is reproducible.
seeds = [
    969167, 188942, 134058, 124022, 685285, 226318, 365209, 648795, 985797,
    193627, 569692, 589449, 832867, 497690, 402858, 583422, 183204, 883281,
    669543, 277324
]

print("Running GPLVM replicates")
for iteration, seed in enumerate(seeds):
    print("\tRunning replicate " + str(iteration + 1) + "\n")
    np.random.seed(seed)

    # All three output files of a replicate share this prefix.
    out_prefix = 'model-output/GPLVM-iter-' + str(iteration)

    altered_responses = np.copy(full_responses)
    n_removed = int(altered_responses.size * 0.2)
    flat_positions = np.arange(altered_responses.size)
    remove_idx = np.random.choice(flat_positions, replace=False,
                                  size=n_removed)
    np.savetxt(out_prefix + '-idx.csv', remove_idx,
               delimiter=',', newline='\n')

    # Translate the flat indices into (row, column) indices before masking.
    masked_cells = np.unravel_index(remove_idx, altered_responses.shape)
    altered_responses[masked_cells] = np.nan

    m = BayesianGPLVMMiniBatch(altered_responses, 1, missing_data=True)
    m.optimize(messages=0, max_iters=5e3)

    # NOTE(review): predict() receives the observed responses here, whereas
    # the model was created with a 1-dimensional latent input - confirm that
    # this is the intended prediction input.
    pred = m.predict(full_responses)
    for suffix, values in (('-mean.csv', pred[0]), ('-var.csv', pred[1])):
        np.savetxt(out_prefix + suffix, values, delimiter=',', newline='\n')
print("")