Пример #1
0
    def __init__(self, X, Y, kern, Z=None, Zy=None, Zvar=None,
                 mean_function=None, minibatch_size=None, var=1.0,
                 shuffle=True, trainable_var=True, **kwargs):
        """
        Wrap the data, build a Gaussian_v2 likelihood and initialise GPModel.

        X is a data matrix, size N x D
        Y is a data matrix, size N x R
        kern, mean_function are appropriate GPflow objects
        Z, Zy, Zvar are optional; they are stored (each wrapped in a
        DataHolder) only when all three are given, otherwise all three
        attributes are set to None.
        minibatch_size, if not None, turns on mini-batching with that size.
        var is wrapped the same way as X and Y and attached to the likelihood
        as ``relative_variance``.
        shuffle is forwarded to every Minibatch.
        trainable_var is forwarded to Gaussian_v2 as ``trainable``.
        kwargs are forwarded to GPModel.__init__.
        """
        # The three extra arrays are all-or-nothing: decide once, then wrap.
        all_given = not (Z is None or Zy is None or Zvar is None)
        if all_given:
            Z, Zy, Zvar = DataHolder(Z), DataHolder(Zy), DataHolder(Zvar)
        else:
            Z = Zy = Zvar = None

        if minibatch_size is not None:
            batch_opts = dict(batch_size=minibatch_size, shuffle=shuffle, seed=0)
            X = Minibatch(X, **batch_opts)
            Y = Minibatch(Y, **batch_opts)
            Y_var = Minibatch(var, **batch_opts)
        else:
            X = DataHolder(X)
            Y = DataHolder(Y)
            Y_var = DataHolder(var)

        # The likelihood itself is always created with var=1.0; the
        # caller-supplied ``var`` only enters through relative_variance.
        likelihood = Gaussian_v2(var=1.0, trainable=trainable_var)
        likelihood.relative_variance = Y_var

        GPModel.__init__(self, X, Y, kern, likelihood, mean_function, **kwargs)
        self.Z = Z
        self.Zy = Zy
        self.Zvar = Zvar
Пример #2
0
    def __init__(self,
                 X,
                 Y,
                 likelihood,
                 layers,
                 minibatch_size=None,
                 num_samples=1,
                 num_data=None,
                 **kwargs):
        """
        Base class for the fully coupled DGP providing all basic functionalities.

        :param X: training inputs; ``X.shape[0]`` is the fallback for num_data
        :param Y: training targets
        :param likelihood: stored unchanged as ``self.likelihood``
        :param layers: stored unchanged as ``self.layers``
        :param minibatch_size: if truthy, X and Y are wrapped in Minibatch
            objects of that size (seed 0); otherwise in DataHolders
        :param num_samples: stored as ``self.num_samples``
        :param num_data: total number of observations; any falsy value falls
            back to ``X.shape[0]``
        :param kwargs: forwarded to Model.__init__
        """
        Model.__init__(self, **kwargs)
        self.num_samples = num_samples

        self.num_data = X.shape[0] if not num_data else num_data

        if not minibatch_size:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)
        else:
            self.X = Minibatch(X, minibatch_size, seed=0)
            self.Y = Minibatch(Y, minibatch_size, seed=0)

        self.likelihood = likelihood
        self.layers = layers
Пример #3
0
    def __init__(self,
                 X,
                 Y,
                 Z,
                 kern,
                 likelihood,
                 mean_function=Zero,
                 minibatch_size=None,
                 num_latent=None,
                 num_samples=1,
                 num_data=None,
                 whiten=True):
        """
        Set up data holders, the likelihood and the ``f_latent`` component.

        :param X: training inputs; ``X.shape[0]`` is the fallback for num_data
        :param Y: training targets; ``Y.shape[1]`` is the fallback for
            ``self.num_latent``
        :param Z: passed to Latent as its first argument
        :param kern: passed to Latent
        :param likelihood: must be a HeteroscedasticLikelihood (asserted)
        :param mean_function: passed to Latent
        :param minibatch_size: if truthy, X and Y become Minibatch objects of
            that size (seed 0); otherwise DataHolders
        :param num_latent: falsy values make ``self.num_latent`` fall back to
            ``Y.shape[1]``; the raw argument is what Latent receives
        :param num_samples: stored as ``self.num_samples``
        :param num_data: falsy values fall back to ``X.shape[0]``
        :param whiten: passed to Latent
        """
        # NOTE(review): the default for mean_function is the name ``Zero``
        # itself, not ``Zero()`` - presumably Latent copes with that; confirm.
        Model.__init__(self)
        self.num_samples = num_samples
        self.num_latent = Y.shape[1] if not num_latent else num_latent
        self.num_data = X.shape[0] if not num_data else num_data

        if not minibatch_size:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)
        else:
            self.X = Minibatch(X, minibatch_size, seed=0)
            self.Y = Minibatch(Y, minibatch_size, seed=0)

        self.likelihood = likelihood
        assert isinstance(likelihood, HeteroscedasticLikelihood)

        # Latent gets the caller's num_latent (possibly None), not the
        # resolved self.num_latent.
        latent_options = dict(num_latent=num_latent,
                              whiten=whiten,
                              name="f_latent")
        self.f_latent = Latent(Z, mean_function, kern, **latent_options)
Пример #4
0
    def __init__(self,
                 X,
                 Y,
                 Z,
                 layers,
                 likelihood,
                 num_latent=None,
                 minibatch_size=None,
                 num_samples=1,
                 mean_function=None,
                 name=None):
        """
        Build a DSDGP from data, inducing inputs and a layer stack.

        - X is a data matrix, size N x D.
        - Y is a data matrix, size N x R.
        - Z is a matrix of inducing inputs, size M x D.
        - layers is an instance of Sequential containing the layer structure of
        the DGP.
        - likelihood is an instance of the gpflow likelihood object.
        - num_latent is the number of latent processes to use; falsy values
        fall back to Y.shape[1].
        - minibatch_size, if not None, turns on minibatching with that size.
        - num_samples is the number of Monte Carlo samples to use.
        - mean_function is an instance of the gpflow mean_function object,
        corresponds to the mean function of the final layer. None (the
        default) creates a fresh Zero() for this model.
        - name is the name of the TensorFlow object.

        Raises ValueError if ``layers`` does not report itself as initialized
        after ``initialize_params(X, Z)`` has been called.
        """

        super(DSDGP, self).__init__(name=name)

        # A default written as ``mean_function=Zero()`` is evaluated once at
        # definition time, so every model would share one Zero instance.
        # Build it per call instead.
        if mean_function is None:
            mean_function = Zero()

        assert X.shape[0] == Y.shape[0], "X and Y must have the same number of rows"
        assert Z.shape[1] == X.shape[1], "Z and X must have the same number of columns"

        # Unpacking into two names keeps the implicit check that X is 2-D.
        self.num_data, _ = X.shape
        self.num_samples = num_samples
        self.D_Y = num_latent or Y.shape[1]

        self.mean_function = mean_function

        layers.initialize_params(X,
                                 Z)  #Maybe add initialization method for model
        if not layers._initialized:
            raise ValueError("Layers were not initialized")
        self.layers = layers

        self.dims = self.layers.get_dims()

        self.likelihood = likelihood

        if minibatch_size is None:
            X = DataHolder(X)
            Y = DataHolder(Y)
        else:
            X = Minibatch(X, batch_size=minibatch_size, seed=0)
            Y = Minibatch(Y, batch_size=minibatch_size, seed=0)

        self.X, self.Y = X, Y
Пример #5
0
    def __init__(self,
                 X,
                 Y,
                 kern,
                 minibatch_size=None,
                 n_filters=256,
                 name: str = None):
        """
        Create a network whose weight/bias parameters mirror ``kern``'s layers.

        :param X: training inputs
        :param Y: labels, stored as int32; ``max(Y) + 1`` becomes ``n_labels``
        :param kern: kernel providing ``input_shape``, ``equivalent_BNN``,
            ``_W``, ``_b``, ``n_layers``, ``var_weight`` and ``var_bias``
        :param minibatch_size: if not None, X and Y are wrapped in Minibatch
            objects and ``scale_factor`` is ``X.shape[0] / minibatch_size``;
            otherwise DataHolders are used and ``scale_factor`` is 1
        :param n_filters: passed to ``kern.equivalent_BNN`` and used as the
            bias size of every layer except the one at index ``kern.n_layers``
        :param name: forwarded to the parent constructor
        """
        super(ConvNet, self).__init__(name=name)
        if not hasattr(kern, 'W_'):
            # Run the kernel's BNN construction once on a dummy input so that
            # it creates its weight and bias attributes.
            # NOTE(review): this checks for ``W_`` but the loop below reads
            # ``kern._W`` / ``kern._b`` - confirm which name the kernel sets.
            dummy_input = tf.constant(np.zeros([1] + kern.input_shape),
                                      dtype=settings.float_type)
            kern.equivalent_BNN(X=dummy_input,
                                n_samples=1,
                                n_filters=n_filters)
        self._kern = kern

        # Make MiniBatches if necessary
        if minibatch_size is not None:
            self.X = Minibatch(X, batch_size=minibatch_size, seed=0)
            self.Y = Minibatch(Y,
                               batch_size=minibatch_size,
                               seed=0,
                               dtype=np.int32)
            self.scale_factor = X.shape[0] / minibatch_size
        else:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y, dtype=tf.int32)
            self.scale_factor = 1.
        self.n_labels = int(np.max(Y) + 1)

        # Create GPFlow parameters with the relevant size of the network.
        # The W draw always precedes the b draw so that the sequence of
        # np.random calls is unchanged.
        weight_params, bias_params = [], []
        for layer_idx, (W, _unused_b) in enumerate(zip(kern._W, kern._b)):
            if layer_idx != kern.n_layers:
                W_shape = [int(dim) for dim in W.shape[1:]]
                b_shape = [n_filters]
            else:
                W_shape = [int(W.shape[1]), self.n_labels]
                b_shape = [self.n_labels]
            W_var = kern.var_weight.read_value() / W_shape[-2]
            b_var = kern.var_bias.read_value()
            W_init = np.sqrt(W_var) * np.random.randn(*W_shape)
            b_init = np.sqrt(b_var) * np.random.randn(*b_shape)
            weight_params.append(
                gpflow.params.Parameter(W_init, dtype=settings.float_type))
            bias_params.append(
                gpflow.params.Parameter(b_init, dtype=settings.float_type))
        self.Ws = gpflow.params.ParamList(weight_params)
        self.bs = gpflow.params.ParamList(bias_params)
Пример #6
0
    def __init__(self, Y_var, freqs, *args, **kwargs):
        """
        Wrap ``Y_var`` and ``freqs`` as data and delegate the rest to the parent.

        ``minibatch_size`` is only peeked at in ``kwargs`` (it is still passed
        on to the parent constructor). When it is present and not None both
        arrays become Minibatch objects of that size (seed 0); otherwise they
        become DataHolders.
        """
        minibatch_size = kwargs.get('minibatch_size', None)

        def _wrap(data):
            if minibatch_size is not None:
                return Minibatch(data, batch_size=minibatch_size, seed=0)
            return DataHolder(data)

        Y_var = _wrap(Y_var)
        freqs = _wrap(freqs)

        super(HeteroscedasticPhaseOnlySVGP, self).__init__(*args, **kwargs)

        # Attached only after the parent constructor has run.
        self.Y_var = Y_var
        self.freqs = freqs
Пример #7
0
    def __init__(self,
                 X,
                 Y,
                 likelihood,
                 layers,
                 minibatch_size=None,
                 num_samples=1,
                 **kwargs):
        """
        Register per-fidelity data holders, the layers and the likelihood.

        :param X: List of training inputs where each element of the list is a numpy array corresponding to the inputs of one fidelity.
        :param Y: List of training targets where each element of the list is a numpy array corresponding to the targets of one fidelity.
        :param likelihood: gpflow likelihood object for use at the final layer
        :param layers: List of doubly_stochastic_dgp.layers.Layer objects
        :param minibatch_size: Minibatch size if using minibatch training
        :param num_samples: Number of samples when propagating predictions through layers
        :param kwargs: kwarg inputs to gpflow.models.Model
        """

        Model.__init__(self, **kwargs)

        self.Y_list = Y
        self.X_list = X
        self.minibatch_size = minibatch_size

        self.num_samples = num_samples

        # This allows a training regime where the first layer is trained first by itself, then the subsequent layer
        # and so on.
        self._train_upto_fidelity = -1

        # For fidelity i this creates the attributes num_data<i>, X<i>, Y<i>.
        for fidelity, (x, y) in enumerate(zip(X, Y)):
            suffix = str(fidelity)
            setattr(self, "num_data" + suffix, x.shape[0])
            if minibatch_size:
                setattr(self, "X" + suffix,
                        Minibatch(x, minibatch_size, seed=0))
                setattr(self, "Y" + suffix,
                        Minibatch(y, minibatch_size, seed=0))
            else:
                setattr(self, "X" + suffix, DataHolder(x))
                setattr(self, "Y" + suffix, DataHolder(y))

        self.num_layers = len(layers)
        self.layers = ParamList(layers)

        self.likelihood = BroadcastingLikelihood(likelihood)
Пример #8
0
    def __init__(self, X, Y, likelihood, layers,
                 minibatch_size=None,
                 num_samples=1):
        """
        Store the data, wrap the likelihood and register the layers.

        :param X: training inputs; ``X.shape[0]`` is stored as ``num_data``
        :param Y: training targets
        :param likelihood: wrapped in a BroadcastingLikelihood
        :param layers: wrapped in a ParamList
        :param minibatch_size: if truthy, X and Y become Minibatch objects of
            that size (seed 0); otherwise DataHolders
        :param num_samples: stored as ``self.num_samples``
        """
        Model.__init__(self)
        self.num_samples = num_samples

        self.num_data = X.shape[0]
        if not minibatch_size:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)
        else:
            self.X = Minibatch(X, minibatch_size, seed=0)
            self.Y = Minibatch(Y, minibatch_size, seed=0)

        self.likelihood = BroadcastingLikelihood(likelihood)

        self.layers = ParamList(layers)
Пример #9
0
    def __init__(self, X, Y, W, kern, idx=None, feat=None, Z=None,
                 mean_function=None, q_diag=False, whiten=False,
                 q_mu=None, q_sqrt=None,
                 minibatch_size=None, num_latent=None, **kwargs):
        """
        X is a data matrix, size N x D
        Y is a data matrix, size N x R
        W is stored as a non-trainable Parameter; W.shape[1] is used as the
        number of latent functions (the num_latent argument is overwritten)
        Z is a matrix of pseudo inputs, size M x D
        kern, mean_function are appropriate GPflow objects
        idx is stored as given unless mini-batching is on, in which case it is
        replaced by a Minibatch over the row indices 0..N-1
        This method only works with a Gaussian likelihood.
        """
        num_data = X.shape[0]
        use_minibatches = minibatch_size is not None

        if use_minibatches:
            X = Minibatch(X, batch_size=minibatch_size, seed=0)
            Y = Minibatch(Y, batch_size=minibatch_size, seed=0)
        else:
            X = DataHolder(X, fix_shape=True)
            Y = DataHolder(Y, fix_shape=True)

        # init the super class
        likelihood = likelihoods.Gaussian()
        num_latent = W.shape[1]
        GPModel.__init__(self, X, Y, kern, likelihood, mean_function,
                         num_latent=num_latent, **kwargs)

        if use_minibatches:
            # Same batch size and seed as X and Y above.
            row_indices = np.arange(num_data)
            idx = Minibatch(row_indices, batch_size=minibatch_size, seed=0,
                            dtype=np.int32)

        self.idx = idx
        self.W = Parameter(W, trainable=False)
        self.K = self.W.shape[1]
        uniform_prior = np.ones(self.K) / self.K
        self.W_prior = Parameter(uniform_prior, trainable=False)
        self.num_data = num_data
        self.feature = features.inducingpoint_wrapper(feat, Z)

        self.minibatch_size = minibatch_size
        self.q_diag = q_diag
        self.whiten = whiten

        # init variational parameters
        num_inducing = len(self.feature)
        self._init_variational_parameters(
            num_inducing, q_mu, q_sqrt, q_diag)
Пример #10
0
    def __init__(self, weights, *args, **kwargs):
        """
        Wrap ``weights`` as data and delegate everything else to the parent.

        ``minibatch_size`` is read from ``kwargs`` without removing it. When
        it is present and not None, ``weights`` becomes a Minibatch of that
        size (seed 0); otherwise it becomes a DataHolder.
        """
        minibatch_size = kwargs.get('minibatch_size', None)
        if minibatch_size is not None:
            wrapped_weights = Minibatch(weights, batch_size=minibatch_size, seed=0)
        else:
            wrapped_weights = DataHolder(weights)

        super(HomoscedasticPhaseOnlySVGP, self).__init__(*args, **kwargs)

        # Attached only after the parent constructor has run.
        self.weights = wrapped_weights
    def __init__(self, Y_var, *args, **kwargs):
        """
        Wrap ``Y_var`` as data and delegate everything else to the parent.

        ``minibatch_size`` is read from ``kwargs`` without removing it. When
        it is present and not None, ``Y_var`` becomes a Minibatch of that size
        (seed 0); otherwise it becomes a DataHolder.
        """
        minibatch_size = kwargs.get('minibatch_size', None)
        if minibatch_size is not None:
            wrapped_var = Minibatch(Y_var, batch_size=minibatch_size, seed=0)
        else:
            wrapped_var = DataHolder(Y_var)

        super(HeteroscedasticTecSVGP, self).__init__(*args, **kwargs)

        # Attached only after the parent constructor has run.
        self.Y_var = wrapped_var
Пример #12
0
    def __init__(self,
                 X,
                 Y,
                 likelihood,
                 layers,
                 minibatch_size=None,
                 num_samples=1,
                 num_data=None,
                 div_weights=None,
                 **kwargs):
        """
        Store the data, wrap likelihood and layers, and give each layer a weight.

        :param X: training inputs; ``X.shape[0]`` is the fallback for num_data
        :param Y: training targets
        :param likelihood: wrapped in a BroadcastingLikelihood
        :param layers: sequence of layers; wrapped in a ParamList, and each
            layer receives its weight through ``layer.set_weight``
        :param minibatch_size: if truthy, X and Y become Minibatch objects of
            that size (seed 0); otherwise DataHolders
        :param num_samples: stored as ``self.num_samples``
        :param num_data: total number of observations; falsy values fall back
            to ``X.shape[0]``
        :param div_weights: one weight per layer for the uncertainty
            quantifiers. None means 1.0 for every layer. Any iterable is
            accepted; if its length differs from ``len(layers)`` a warning is
            printed and all weights are set to 1.0.
        :param kwargs: forwarded to Model.__init__
        """
        Model.__init__(self, **kwargs)
        self.num_samples = num_samples

        self.num_data = num_data or X.shape[0]
        if minibatch_size:
            self.X = Minibatch(X, minibatch_size, seed=0)
            self.Y = Minibatch(Y, minibatch_size, seed=0)
        else:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)

        self.likelihood = BroadcastingLikelihood(likelihood)

        self.layers = ParamList(layers)

        # Weights for the uncertainty quantifiers (per layer).
        num_layers = len(layers)
        if div_weights is None:
            div_weights = [1.0] * num_layers  # multiply by 1, i.e. don't change
        else:
            # Materialise first so the length check also covers tuples, arrays
            # and other iterables. An exact ``list`` type test let those
            # through unchecked, and zip() below would then silently drop
            # layers.
            div_weights = list(div_weights)
            if len(div_weights) != num_layers:
                print(
                    "WARNING! You specified a list of weights for the "
                    "uncertainty quantifiers, but your DGP has more/less layers "
                    "than the number of weights you specified! "
                    "We set all weights to 1.0")
                div_weights = [1.0] * num_layers

        # Distribute the weights into the layers.
        for layer, weight in zip(layers, div_weights):
            layer.set_weight(weight)
Пример #13
0
 def __init__(self, Y_var=None, var=1.0, trainable=True, minibatch_size=None):
     """
     Create the positive ``variance`` parameter and the relative variance.

     Y_var: optional array. When None, ``relative_variance`` is the constant
         1.0. Otherwise a trailing axis is added, 1e-3 is added to every
         entry, and the result is wrapped as data.
     var: initial value of the ``variance`` Parameter.
     trainable: forwarded to the ``variance`` Parameter.
     minibatch_size: when not None (and Y_var is given), the relative
         variance is a shuffled Minibatch of that size (seed 0); otherwise a
         DataHolder.
     """
     super().__init__()
     self.variance = Parameter(
         var,
         transform=transforms.positive,
         dtype=settings.float_type,
         trainable=trainable)

     if Y_var is None:
         self.relative_variance = 1.0
         return

     offset_var = Y_var[:, None] + 1e-3
     if minibatch_size is not None:
         self.relative_variance = Minibatch(
             offset_var, batch_size=minibatch_size, shuffle=True, seed=0)
     else:
         self.relative_variance = DataHolder(offset_var)
Пример #14
0
    def __init__(self, X, Y, kern, likelihood, feat, mean_function=None,
                 num_latent=None, q_diag=False, whiten=True,
                 minibatch_size=None, num_data=None, q_mu=None, q_sqrt=None,
                 shuffle=True, **kwargs):
        """
        Initialise the model with tensor/sequence inducing features.

        :param X: training inputs
        :param Y: training targets
        :param kern: forwarded to GPModel.__init__
        :param likelihood: forwarded to GPModel.__init__
        :param feat: must be an InducingTensors or InducingSequences instance;
            ``len(feat)`` is the number of inducing points
        :param mean_function: forwarded to GPModel.__init__
        :param num_latent: forwarded to GPModel.__init__
        :param q_diag: stored, and passed to _init_variational_parameters
        :param whiten: stored as ``self.whiten``
        :param minibatch_size: if not None, X and Y become Minibatch objects
            of that size (seed 0); otherwise DataHolders
        :param num_data: falsy values fall back to ``X.shape[0]``, read from
            the wrapped X
        :param q_mu: passed to _init_variational_parameters
        :param q_sqrt: passed to _init_variational_parameters
        :param shuffle: forwarded to Minibatch
        :param kwargs: forwarded to GPModel.__init__
        :raises ValueError: if ``feat`` has neither of the accepted types
        """
        if not isinstance(feat, (InducingTensors, InducingSequences)):
            raise ValueError('feat must be of type either InducingTensors or InducingSequences')

        num_inducing = len(feat)

        if minibatch_size is not None:
            batch_opts = dict(batch_size=minibatch_size, shuffle=shuffle, seed=0)
            X = Minibatch(X, **batch_opts)
            Y = Minibatch(Y, **batch_opts)
        else:
            X = DataHolder(X)
            Y = DataHolder(Y)

        models.GPModel.__init__(self, X, Y, kern, likelihood, mean_function, num_latent, **kwargs)
        self.num_data = num_data or X.shape[0]
        self.q_diag = q_diag
        self.whiten = whiten
        self.feature = feat
        self._init_variational_parameters(num_inducing, q_mu, q_sqrt, q_diag)
Пример #15
0
    def __init__(self, X, Y, latent_dim, layers, batch_size=64, name=None):
        """
        Store float32 data, the latent prior and build encoder and decoder.

        :param X: data array; ``X.shape[1]`` is stored as ``X_dim``
        :param Y: the conditions; ``Y.shape[1]`` is stored as ``Y_dim``
        :param latent_dim: second dimension of the standard-normal prior
        :param layers: passed to ``_build_encoder`` and ``_build_decoder``
        :param batch_size: if not None, X and Y become Minibatch objects of
            that size (seed 0) and the prior has ``batch_size`` rows;
            otherwise DataHolders are used and the prior has ``X.shape[0]``
            rows
        :param name: forwarded to the parent constructor
        """
        super().__init__(name=name)
        self.X_dim = X.shape[1]
        self.Y_dim = Y.shape[1]  # the conditions
        X = X.astype(np.float32)
        Y = Y.astype(np.float32)

        full_batch = batch_size is None
        if full_batch:
            self.X = DataHolder(X)
            self.Y = DataHolder(Y)
        else:
            self.X = Minibatch(X, batch_size=batch_size, seed=0)
            self.Y = Minibatch(Y, batch_size=batch_size, seed=0)
        self.latent_dim = latent_dim

        self.variance = Parameter(.05, transform=transforms.positive)

        self.batch_size = batch_size
        # One prior row per element of whatever batch the model sees.
        prior_rows = X.shape[0] if full_batch else batch_size
        shape = (prior_rows, latent_dim)
        prior_loc = tf.zeros(shape, dtype=tf.float32)
        prior_scale = tf.cast(1.0, dtype=tf.float32)
        self.prior_z = tf.distributions.Normal(loc=prior_loc, scale=prior_scale)

        self._build_encoder(layers)
        self._build_decoder(layers)
Пример #16
0
    def __init__(self,
                 X,
                 Y,
                 kern,
                 likelihood,
                 mean_function=None,
                 feat=None,
                 Z=None,
                 q_diag=False,
                 whiten=True,
                 minibatch_size=None,
                 num_data=None,
                 num_latent=None,
                 q_mu=None,
                 q_sqrt=None,
                 alpha=None,
                 alpha_tilde=None,
                 **kwargs):
        """
        Set up data holders, annotation indices and all model parameters.

        - X is a data matrix, size N x D
        - Y contains the annotations. It is a numpy array of matrices with 2 columns, gathering pairs (annotator, annotation).
        - kern, likelihood, mean_function are appropriate GPflow objects
        - feat and Z define the pseudo inputs, usually feat=None and Z size M x D
        - q_diag, boolean indicating whether posterior covariance must be diagonal
        - whiten, boolean indicating whether a whitened representation of the inducing points is used
        - minibatch_size, if not None, turns on mini-batching with that size
        - num_data is the total number of observations, default to X.shape[0] (relevant when feeding in external minibatches)
        - num_latent is the number of latent GP to be used. For multi-class likelihoods, this equals the number of classes. However, for many binary likelihoods, num_latent=1.
          When falsy it defaults to the number of distinct annotation values found in Y.
        - q_mu (M x K), q_sqrt (M x K or K x M x M), alpha (A x K x K), alpha_tilde (A x K x K), initializations for these parameters (all of them but alpha to be estimated).
          alpha defaults to an array of ones and is stored as a non-trainable Parameter.
          NOTE(review): the alpha_tilde argument is never read; it is overwritten by the
          result of self._init_behaviors(q_unn, Y_idxs) - confirm this is intended.
        - kwargs are forwarded to GPModel.__init__.
        """
        if minibatch_size is None:
            X = DataHolder(X)
        else:
            X = Minibatch(X, batch_size=minibatch_size, seed=0)
        # Distinct annotation values (second column of every matrix in Y).
        class_keys = np.unique(np.concatenate([y[:, 1] for y in Y]))
        num_classes = len(class_keys)
        num_latent = num_latent or num_classes
        # GPModel receives None in the Y position; the annotations are kept
        # in Y_idxs_cr below instead.
        GPModel.__init__(self, X, None, kern, likelihood, mean_function,
                         num_latent, **kwargs)
        self.class_keys = class_keys
        self.num_classes = num_classes
        self.num_latent = num_latent
        # Distinct annotator ids (first column of every matrix in Y).
        self.annot_keys = np.unique(np.concatenate([y[:, 0] for y in Y]))
        self.num_annotators = len(self.annot_keys)
        # X has already been wrapped at this point, so the fallback reads the
        # shape of the DataHolder/Minibatch.
        self.num_data = num_data or X.shape[0]
        self.q_diag, self.whiten = q_diag, whiten
        self.feature = features.inducingpoint_wrapper(feat, Z)
        self.num_inducing = len(self.feature)

        ###### Initializing Y_idxs as minibatch or placeholder (and the associated idxs to slice q_unn) ######################
        startTime = time.time()
        # For every sample, map each (annotator, annotation) pair to its
        # position in annot_keys / class_keys.
        Y_idxs = np.array([
            np.stack((np.array(
                [np.flatnonzero(v == self.annot_keys)[0] for v in y[:, 0]]),
                      np.array([
                          np.flatnonzero(v == self.class_keys)[0]
                          for v in y[:, 1]
                      ])),
                     axis=1) for y in Y
        ])  # same as Y but with indexes
        # Largest number of annotations attached to a single sample.
        S = np.max([v.shape[0] for v in Y_idxs])
        ###########################################
        ## pmr modification for CPU
        #Y_idxs_cr = np.array([np.concatenate((y,-1*np.ones((S-y.shape[0],2))),axis=0) for y in Y_idxs]).astype(np.int16) # NxSx2
        # Pad every sample to S rows with the pair (num_annotators, 0), i.e.
        # an annotator index one past the last real annotator.
        aux = np.array([self.num_annotators, 0])
        # NOTE(review): the int16 cast assumes every annotator and class index
        # fits in 16 bits - confirm for large annotator pools.
        Y_idxs_cr = np.array([
            np.concatenate((y, np.tile(aux, (S - y.shape[0], 1))), axis=0)
            for y in Y_idxs
        ]).astype(np.int16)  # NxSx2
        ###########################################

        # idxs_mb holds the row indices 0..num_data-1, wrapped with the same
        # batch size and seed as Y_idxs_cr.
        if minibatch_size is None:
            self.Y_idxs_cr = DataHolder(Y_idxs_cr)
            self.idxs_mb = DataHolder(np.arange(self.num_data))
        else:
            self.Y_idxs_cr = Minibatch(Y_idxs_cr,
                                       batch_size=minibatch_size,
                                       seed=0)
            self.idxs_mb = Minibatch(np.arange(self.num_data),
                                     batch_size=minibatch_size,
                                     seed=0)
        print("Time taken in Y_idxs creation:", time.time() - startTime)

        ########## Initializing q #####################################
        startTime = time.time()
        # Per-sample class counts, plus one for every class, normalised so
        # that each row sums to 1.
        q_unn = np.array(
            [np.bincount(y[:, 1], minlength=self.num_classes) for y in Y_idxs])
        q_unn = q_unn + np.ones(q_unn.shape)
        q_unn = q_unn / np.sum(q_unn, axis=1, keepdims=True)
        self.q_unn = Parameter(q_unn, transform=transforms.positive)  # N x K
        print("Time taken in q_unn initialization:", time.time() - startTime)

        ######## Initializing alpha (fixed) and alpha_tilde (trainable) ################
        #if alpha is None:
        #    self.alpha = tf.constant(np.ones((self.num_annotators,self.num_classes,self.num_classes), dtype=settings.float_type)) # A x K x K
        #else:
        #    self.alpha = tf.constant(alpha, dtype=settings.float_type) # A x K x K

        if alpha is None:
            alpha = np.ones(
                (self.num_annotators, self.num_classes, self.num_classes),
                dtype=settings.float_type)  # A x K x K
        self.alpha = Parameter(alpha,
                               transform=transforms.positive,
                               trainable=False)

        startTime = time.time()
        # Overwrites the alpha_tilde argument (see the docstring note).
        alpha_tilde = self._init_behaviors(q_unn, Y_idxs)
        print("Time taken in alpha_tilde initialization:",
              time.time() - startTime)
        self.alpha_tilde = Parameter(
            alpha_tilde, transform=transforms.positive)  # A x K x K
        ################################################################################
        ##### Initializing the variational parameters  ####################################
        self._init_variational_parameters(q_mu, q_sqrt)
Пример #17
0
    def __init__(self,
                 datasets=None,
                 inducing_locations=None,
                 kernels=None,
                 noise_sigmas=None,
                 minibatch_sizes=None,
                 mixing_weight=None,
                 parent_mixtures=None,
                 masks=None,
                 num_samples=1,
                 **kwargs):
        """
        Register every dataset (and optional mask) on the model, then call setup().

            datasets: an array of arrays [X_a, Y_a] ordered by 'trust', ie datasets[0] is the most reliable
            inducing_locations: an array of inducing locations for each of the datasets
            kernels: an array of kernels for each of the datasets
            noise_sigmas: an array of noise_sigmas for each of the datasets
            minibatch_sizes: per-dataset batch sizes; only read by the
                minibatch branch, which is currently switched off
            mixing_weight (MR_Mixing_Weight): an object that will combine the predictions from each of the local experts
            parent_mixtures: an array of parent mixture models (wrapped in a ParamList when given)
            masks: optional sequence with one mask (or None) per dataset
            num_samples: stored as ``self.num_samples``
            kwargs: forwarded to Model.__init__

        The list-valued arguments default to None, which means a fresh empty
        list for each model. Several of them are stored directly on the
        instance, so a shared ``[]`` default would leak between models.
        """
        datasets = [] if datasets is None else datasets
        inducing_locations = [] if inducing_locations is None else inducing_locations
        kernels = [] if kernels is None else kernels
        noise_sigmas = [] if noise_sigmas is None else noise_sigmas
        minibatch_sizes = [] if minibatch_sizes is None else minibatch_sizes

        Model.__init__(self, **kwargs)

        self.dataset_sizes = [d[0].shape[0] for d in datasets]

        self.num_datasets = len(datasets)
        self.X = []
        self.Y = []
        self.Z = inducing_locations
        self.masks = masks
        self.MASKS = []
        self.kernels = kernels
        self.noise_sigmas = noise_sigmas
        self.num_samples = num_samples

        #gpflow models are Parameterized objects
        # NOTE(review): debug output kept so that stdout is unchanged.
        print(parent_mixtures)
        self.parent_mixtures = ParamList(
            parent_mixtures) if parent_mixtures is not None else None

        self.mixing_weight = mixing_weight

        # Minibatching is hard-wired off, so minibatch_sizes is never read.
        minibatch = False
        for i, d in enumerate(datasets):
            #TODO: can we just wrap with a ParamList?
            if minibatch:
                _x = Minibatch(d[0], batch_size=minibatch_sizes[i], seed=0)
                _y = Minibatch(d[1], batch_size=minibatch_sizes[i], seed=0)
            else:
                _x = DataHolder(d[0])
                _y = DataHolder(d[1])

            # A mask is only built when masks were supplied at all and this
            # dataset has one; otherwise the slot stays None.
            _mask = None
            if self.masks and self.masks[i] is not None:
                if minibatch:
                    # Batch with this dataset's own size so the mask stays
                    # in step with x_i / y_i (index 0 was used before).
                    _mask = Minibatch(self.masks[i],
                                      batch_size=minibatch_sizes[i],
                                      seed=0)
                else:
                    _mask = DataHolder(self.masks[i])

            x_name = 'x_{i}'.format(i=i)
            y_name = 'y_{i}'.format(i=i)
            mask_name = 'mask_{i}'.format(i=i)

            #make it so GPFlow can find _x, _y
            setattr(self, x_name, _x)
            setattr(self, y_name, _y)
            if self.masks:
                setattr(self, mask_name, _mask)

            #save references (read back from the instance dict after setattr)
            self.X.append(self.__dict__[x_name])
            self.Y.append(self.__dict__[y_name])
            if self.masks:
                self.MASKS.append(self.__dict__[mask_name])

        self.setup()
Пример #18
0
    def __init__(self, X, Y, W1, W2, kern, likelihood,
                 idx=None, W1_idx=None, W2_idx=None, feat=None,
                 mean_function=None,
                 num_latent=None,
                 q_diag=False,
                 whiten=True,
                 minibatch_size=None,
                 Z=None,
                 num_data=None,
                 q_mu=None,
                 q_sqrt=None,
                 **kwargs):
        """
        - X is a data matrix, size N x D
        - Y is a data matrix, size N x P
        - W1, W2 are stored as non-trainable Parameters; their second
          dimensions are stored as K1 and K2
        - kern, likelihood, mean_function are appropriate GPflow objects
        - idx is stored as given unless mini-batching is on, in which case it
          is replaced by a Minibatch over the row indices of X
        - W1_idx, W2_idx, when given, are wrapped the same way as X and Y
        - Z is a matrix of pseudo inputs, size M x D
        - num_latent is accepted but always overwritten by
          W1.shape[1] * W2.shape[1]
        - q_diag is a boolean. If True, the covariance is approximated by a
          diagonal matrix.
        - whiten is a boolean. If True, we use the whitened representation of
          the inducing points.
        - minibatch_size, if not None, turns on mini-batching with that size.
        - num_data is the total number of observations, default to X.shape[0]
          (relevant when feeding in external minibatches)
        """
        # Row count of the X passed in. Kept apart from ``num_data`` so that a
        # caller-supplied total is no longer overwritten before it is used.
        num_rows = X.shape[0]

        # sort out the X, Y into MiniBatch objects if required.
        if minibatch_size is None:
            X = DataHolder(X)
            Y = DataHolder(Y)

            if W1_idx is not None:
                W1_idx = DataHolder(W1_idx, fix_shape=True)

            if W2_idx is not None:
                W2_idx = DataHolder(W2_idx, fix_shape=True)
        else:
            X = Minibatch(X, batch_size=minibatch_size, seed=0)
            Y = Minibatch(Y, batch_size=minibatch_size, seed=0)

            # Row indices use the same batch size and seed as X and Y.
            idx = Minibatch(np.arange(num_rows), batch_size=minibatch_size, seed=0, dtype=np.int32)
            if W1_idx is not None:
                W1_idx = Minibatch(
                    W1_idx, batch_size=minibatch_size, seed=0, dtype=np.int32)

            if W2_idx is not None:
                W2_idx = Minibatch(
                    W2_idx, batch_size=minibatch_size, seed=0, dtype=np.int32)

        # init the super class, accept args
        num_latent = W1.shape[1] * W2.shape[1]
        GPModel.__init__(self, X, Y, kern, likelihood, mean_function, num_latent, **kwargs)
        self.num_data = num_data or num_rows
        self.q_diag, self.whiten = q_diag, whiten
        self.feature = features.inducingpoint_wrapper(feat, Z)

        self.idx = idx
        self.W1_idx = W1_idx
        self.W2_idx = W2_idx

        self.K1 = W1.shape[1]
        self.W1 = Parameter(W1, trainable=False, dtype=settings.float_type)
        self.W1_prior = Parameter(np.ones(self.K1) / self.K1, trainable=False)

        self.K2 = W2.shape[1]
        self.W2 = Parameter(W2, trainable=False, dtype=settings.float_type)
        self.W2_prior = Parameter(np.ones(self.K2) / self.K2, trainable=False)

        # init variational parameters
        num_inducing = len(self.feature)
        self._init_variational_parameters(num_inducing, q_mu, q_sqrt, q_diag)