Example #1
# Imports assumed for this snippet (GPflow 1.x API). The Layer base class and the
# time_it decorator are not part of GPflow; they are assumed to come from the
# surrounding project (e.g. a Doubly-Stochastic-DGP-style codebase).
import numpy as np
import tensorflow as tf

from gpflow import settings, transforms
from gpflow.decors import params_as_tensors, params_as_tensors_for
from gpflow.features import InducingPoints
from gpflow.mean_functions import Zero
from gpflow.params import Parameter, ParamList


class SVGPG_Layer(Layer):
    def __init__(self, kern, Z, mean_function, num_nodes, dim_per_in, dim_per_out,
                 gmat, share_Z=False, nb_init=True, **kwargs):

        Layer.__init__(self, input_prop_dim=False, **kwargs)

        self.kern = kern
        self.num_nodes = num_nodes
        self.dim_per_in, self.dim_per_out = dim_per_in, dim_per_out
        self.gmat = gmat
        self.share_Z = share_Z
        self.nb_init = nb_init
        self.num_outputs = num_nodes * dim_per_out
        self.num_inducing = Z.shape[0]

        self.q_mu = Parameter(np.zeros((self.num_inducing, num_nodes * dim_per_out)))
        self.mean_function = ParamList([], trainable=False)
        self.q_sqrt_lst = ParamList([])
        transform = transforms.LowerTriangular(self.num_inducing, num_matrices=self.dim_per_out)

        if share_Z:
            self.feature = InducingPoints(Z)
        else:
            self.feature = ParamList([])  # filled below with one InducingPoints object per node

        for nd in range(num_nodes):
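            # For each node: pick its mean function and initialise q_sqrt at the prior
            # Cholesky of Kuu (tiled over the output dimensions), so that the initial
            # variational posterior matches the prior.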
            if mean_function:
                self.mean_function.append(mean_function[nd])
            else:
                self.mean_function.append(Zero())
            if share_Z:
                Ku_nd = self.kern[nd].compute_K_symm(Z)
                Lu_nd = np.linalg.cholesky(Ku_nd + np.eye(Z.shape[0]) * settings.jitter)
                q_sqrt = np.tile(Lu_nd[None, :, :], [dim_per_out, 1, 1])
                self.q_sqrt_lst.append(Parameter(q_sqrt, transform=transform))
            else:
                pa_nd = self.pa_idx(nd)
                Z_tmp = Z[:, pa_nd].copy()
                self.feature.append(InducingPoints(Z_tmp))
                Ku_nd = self.kern[nd].compute_K_symm(Z_tmp)
                Lu_nd = np.linalg.cholesky(Ku_nd + np.eye(Z_tmp.shape[0]) * settings.jitter)
                q_sqrt = np.tile(Lu_nd[None, :, :], [dim_per_out, 1, 1])
                self.q_sqrt_lst.append(Parameter(q_sqrt, transform=transform))

        self.needs_build_cholesky = True

    def pa_idx(self, nd):
        res = []
        for n in range(self.num_nodes):
            w = self.gmat[nd, n]
            if w > 0:
                res = res + list(range(n * self.dim_per_in, (n + 1) * self.dim_per_in))
        res = np.asarray(res)
        return res

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku, self.Lu = [None] * self.num_nodes, [None] * self.num_nodes
            self.Ku_tiled_lst, self.Lu_tiled_lst = [], []
            for nd in range(self.num_nodes):
                if self.share_Z:
                    Ku_nd = self.feature.Kuu(self.kern[nd], jitter=settings.jitter)
                else:
                    Ku_nd = self.feature[nd].Kuu(self.kern[nd], jitter=settings.jitter)
                Lu_nd = tf.cholesky(Ku_nd)
                self.Ku[nd] = Ku_nd
                self.Lu[nd] = Lu_nd
                self.Ku_tiled_lst.append(tf.tile(Ku_nd[None, :, :], [self.dim_per_out, 1, 1]))
                self.Lu_tiled_lst.append(tf.tile(Lu_nd[None, :, :], [self.dim_per_out, 1, 1]))
            self.needs_build_cholesky = False

    @time_it
    def conditional_ND(self, X, full_cov=False):
        self.build_cholesky_if_needed()

        # Only marginal (diagonal) predictive variances are returned, so full_cov is ignored.
        if self.share_Z:
            return self.conditional_ND_share_Z(X, full_cov=False)
        else:
            return self.conditional_ND_not_share_Z(X, full_cov=False)

    def conditional_ND_share_Z(self, X, full_cov=False):
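        # For each node nd: A = Kuu^{-1} Kuf (via two triangular solves),
        # predictive mean = A^T q_mu + mean_fn(inputs), and marginal variance
        # = Kff + diag(A^T (S - Kuu) A) with S = q_sqrt q_sqrt^T.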
        mean_lst, var_lst, A_tiled_lst = [], [], []
        for nd in range(self.num_nodes):
            pa_nd = self.pa_idx(nd)
            Kuf_nd = self.feature.Kuf(self.kern[nd], X)

            A_nd = tf.matrix_triangular_solve(self.Lu[nd], Kuf_nd, lower=True)
            A_nd = tf.matrix_triangular_solve(tf.transpose(self.Lu[nd]), A_nd, lower=False)
            mean_tmp = tf.matmul(A_nd, self.q_mu[:, nd * self.dim_per_out:(nd + 1) * self.dim_per_out],
                                 transpose_a=True)
            X_tmp = tf.gather(X, pa_nd, axis=1)
            if self.nb_init:
                mean_tmp += self.mean_function[nd](X_tmp)
            else:
                mean_tmp += self.mean_function[nd](X[:, nd * self.dim_per_in:(nd + 1) * self.dim_per_in])
            mean_lst.append(mean_tmp)

            A_tiled_lst.append(tf.tile(A_nd[None, :, :], [self.dim_per_out, 1, 1]))
            SK_nd = -self.Ku_tiled_lst[nd]
            q_sqrt_nd = self.q_sqrt_lst[nd]
            with params_as_tensors_for(q_sqrt_nd, convert=True):
                SK_nd += tf.matmul(q_sqrt_nd, q_sqrt_nd, transpose_b=True)

            B_nd = tf.matmul(SK_nd, A_tiled_lst[nd])
            # (num_latent, num_X)
            delta_cov_nd = tf.reduce_sum(A_tiled_lst[nd] * B_nd, 1)
            Kff_nd = self.kern[nd].Kdiag(X)

            # (1, num_X) + (num_latent, num_X)
            var_nd = tf.expand_dims(Kff_nd, 0) + delta_cov_nd
            var_nd = tf.transpose(var_nd)

            var_lst.append(var_nd)

        mean = tf.concat(mean_lst, axis=1)
        var = tf.concat(var_lst, axis=1)
        return mean, var

    def conditional_ND_not_share_Z(self, X, full_cov=False):
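        # Same computation as conditional_ND_share_Z, except each node has its own
        # inducing points restricted to the parent columns selected by pa_idx.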
        mean_lst, var_lst, A_tiled_lst = [], [], []
        for nd in range(self.num_nodes):
            pa_nd = self.pa_idx(nd)
            X_tmp = tf.gather(X, pa_nd, axis=1)
            Kuf_nd = self.feature[nd].Kuf(self.kern[nd], X_tmp)

            A_nd = tf.matrix_triangular_solve(self.Lu[nd], Kuf_nd, lower=True)
            A_nd = tf.matrix_triangular_solve(tf.transpose(self.Lu[nd]), A_nd, lower=False)

            mean_tmp = tf.matmul(A_nd, self.q_mu[:, nd * self.dim_per_out:(nd + 1) * self.dim_per_out],
                                 transpose_a=True)
            if self.nb_init:
                mean_tmp += self.mean_function[nd](X_tmp)
            else:
                mean_tmp += self.mean_function[nd](X[:, nd * self.dim_per_in:(nd + 1) * self.dim_per_in])
            mean_lst.append(mean_tmp)
            A_tiled_lst.append(tf.tile(A_nd[None, :, :], [self.dim_per_out, 1, 1]))

            SK_nd = -self.Ku_tiled_lst[nd]
            q_sqrt_nd = self.q_sqrt_lst[nd]
            with params_as_tensors_for(q_sqrt_nd, convert=True):
                SK_nd += tf.matmul(q_sqrt_nd, q_sqrt_nd, transpose_b=True)

            B_nd = tf.matmul(SK_nd, A_tiled_lst[nd])

            # (num_latent, num_X)
            delta_cov_nd = tf.reduce_sum(A_tiled_lst[nd] * B_nd, 1)
            Kff_nd = self.kern[nd].Kdiag(X_tmp)

            # (1, num_X) + (num_latent, num_X)
            var_nd = tf.expand_dims(Kff_nd, 0) + delta_cov_nd
            var_nd = tf.transpose(var_nd)

            var_lst.append(var_nd)

        mean = tf.concat(mean_lst, axis=1)
        var = tf.concat(var_lst, axis=1)
        return mean, var

    @time_it
    def KL(self):
        """
        The KL divergence from the variational distribution to the prior

        :return: KL divergence from N(q_mu, q_sqrt) to N(0, I), independently for each GP
        """

        self.build_cholesky_if_needed()

        KL = -0.5 * self.num_inducing * self.num_nodes * self.dim_per_out

        for nd in range(self.num_nodes):
            q_sqrt_nd = self.q_sqrt_lst[nd]
            with params_as_tensors_for(q_sqrt_nd, convert=True):
                KL -= 0.5 * tf.reduce_sum(tf.log(tf.matrix_diag_part(q_sqrt_nd) ** 2))

                KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(self.Lu[nd]))) * self.dim_per_out
                KL += 0.5 * tf.reduce_sum(
                    tf.square(tf.matrix_triangular_solve(self.Lu_tiled_lst[nd], q_sqrt_nd, lower=True)))
                q_mu_nd = self.q_mu[:, nd * self.dim_per_out:(nd + 1) * self.dim_per_out]
                Kinv_m_nd = tf.cholesky_solve(self.Lu[nd], q_mu_nd)
                KL += 0.5 * tf.reduce_sum(q_mu_nd * Kinv_m_nd)

        return KL
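
A minimal standalone sketch (not from the original source) of what pa_idx computes: for a node nd it gathers the input columns of every node n with gmat[nd, n] > 0, contributing dim_per_in columns per parent. The 3-node adjacency matrix below is purely illustrative.

import numpy as np

gmat = np.array([[1, 1, 0],
                 [0, 1, 1],
                 [1, 0, 1]])   # hypothetical adjacency: row nd marks the parents of node nd
dim_per_in = 2

def pa_idx(nd):
    res = []
    for n in range(gmat.shape[0]):
        if gmat[nd, n] > 0:
            res += list(range(n * dim_per_in, (n + 1) * dim_per_in))
    return np.asarray(res)

print(pa_idx(0))   # -> [0 1 2 3]: node 0 reads its own columns plus node 1's columns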
Example #2
# Imports assumed for this snippet (GPflow 1.x API); SVGPG_Layer is the class from
# Example #1 above.
import numpy as np
from gpflow.mean_functions import Identity, Linear
from gpflow.params import ParamList


def init_layers_graph(X, Y, Z, kernels, gmat,
                      num_layers=2,
                      num_nodes=None,
                      dim_per_node=5,
                      dim_per_X=5, dim_per_Y=5,
                      share_Z=False,
                      nb_init=True):
    layers = []

    def pa_idx(nd, dim_per_in):
        res = []
        for n in range(num_nodes):
            w = gmat[nd, n]
            if w > 0:
                res = res + list(range(n * dim_per_in, (n + 1) * dim_per_in))
        res = np.asarray(res)
        return res

    X_running, Z_running = X.copy(), Z.copy()
    for l in range(num_layers - 1):
        if l == 0:
            dim_in = dim_per_X
            dim_out = dim_per_node
        else:
            dim_in = dim_per_node
            dim_out = dim_per_node
        X_running_tmp = np.zeros((X.shape[0], dim_out * num_nodes))
        Z_running_tmp = np.zeros((Z.shape[0], dim_out * num_nodes))
        mf_lst = ParamList([], trainable=False)
        for nd in range(num_nodes):
            if nb_init:
                pa = pa_idx(nd, dim_in)
            else:
                pa = np.asarray(range(nd * dim_in, (nd + 1) * dim_in))
            agg_dim_in = len(pa)

            if agg_dim_in == dim_out:
                mf = Identity()

            else:
                if agg_dim_in > dim_out:  # stepping down, use the pca projection
                    # _, _, V = np.linalg.svd(X_running[:, nd*dim_in : (nd+1)*dim_in], full_matrices=False)
                    _, _, V = np.linalg.svd(X_running[:, pa], full_matrices=False)
                    W = V[:dim_out, :].T

                else:  # stepping up, use identity + padding
                    W = np.concatenate([np.eye(agg_dim_in), np.zeros((agg_dim_in, dim_out - agg_dim_in))], 1)

                mf = Linear(W)
                mf.set_trainable(False)
            mf_lst.append(mf)
            if agg_dim_in != dim_out:
                Z_running_tmp[:, nd * dim_out:(nd + 1) * dim_out] = Z_running[:, pa].dot(W)
                X_running_tmp[:, nd * dim_out:(nd + 1) * dim_out] = X_running[:, pa].dot(W)
            else:
                Z_running_tmp[:, nd * dim_out:(nd + 1) * dim_out] = Z_running[:, pa]
                X_running_tmp[:, nd * dim_out:(nd + 1) * dim_out] = X_running[:, pa]

        layers.append(
            SVGPG_Layer(kernels[l], Z_running, mf_lst, num_nodes, dim_in, dim_out,
                        gmat, share_Z=share_Z, nb_init=nb_init))
        Z_running = Z_running_tmp
        X_running = X_running_tmp

    # final layer
    if num_layers == 1:
        fin_dim_in = dim_per_X
    else:
        fin_dim_in = dim_per_node
    layers.append(
        SVGPG_Layer(kernels[-1], Z_running, None, num_nodes, fin_dim_in, dim_per_Y,
                    gmat, share_Z=share_Z, nb_init=nb_init))
    return layers
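
A minimal usage sketch, assuming GPflow 1.x RBF kernels and a fully connected two-node graph; all names, shapes, and values are illustrative and not from the source, and the SVGPG_Layer dependencies (the Layer base class, time_it) must resolve for this to run.

import numpy as np
from gpflow.kernels import RBF

num_nodes, dim_per_X, dim_per_node, dim_per_Y = 2, 1, 1, 1
gmat = np.ones((num_nodes, num_nodes))      # fully connected graph (illustrative)

N, M = 100, 20
X = np.random.randn(N, num_nodes * dim_per_X)
Y = np.random.randn(N, num_nodes * dim_per_Y)
Z = X[:M].copy()                            # inducing inputs taken from the training inputs

# With nb_init=True every node sees all parent columns, so each kernel here
# acts on len(pa_idx(nd)) = 2 input dimensions at every layer.
kernels = [[RBF(2) for _ in range(num_nodes)] for _ in range(2)]

layers = init_layers_graph(X, Y, Z, kernels, gmat,
                           num_layers=2, num_nodes=num_nodes,
                           dim_per_node=dim_per_node,
                           dim_per_X=dim_per_X, dim_per_Y=dim_per_Y)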