Example #1
def run_with_adam_and_nat(model, lr, iterations, callback=None, gamma=0.001):
    if gamma == 0:
        adam = AdamOptimizer(lr).make_optimize_action(model)
        actions = [adam]
        actions = actions if callback is None else actions + [callback]

        Loop(actions, stop=iterations)()
        model.anchor(model.enquire_session())
        return

    var_list = [(model.f_latent.q_mu, model.f_latent.q_sqrt)]

    # we don't want adam optimizing these
    model.f_latent.q_mu.set_trainable(False)
    model.f_latent.q_sqrt.set_trainable(False)

    adam = AdamOptimizer(lr).make_optimize_action(model)
    natgrad = NatGradOptimizer(gamma).make_optimize_action(model,
                                                           var_list=var_list)

    actions = [adam, natgrad]
    actions = actions if callback is None else actions + [callback]

    Loop(actions, stop=iterations)()
    model.anchor(model.enquire_session())
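A minimal usage sketch for the helper above, assuming GPflow 1.x (with AdamOptimizer, NatGradOptimizer and Loop importable in the helper's module) and a plain SVGP model; note that the gamma > 0 branch expects a model exposing f_latent.q_mu / f_latent.q_sqrt, so this sketch stays on the gamma == 0 path. Data and step sizes are illustrative.

import numpy as np
import gpflow

X = np.random.rand(200, 1)
Y = np.sin(6 * X) + 0.1 * np.random.randn(200, 1)
model = gpflow.models.SVGP(X, Y, gpflow.kernels.RBF(1),
                           gpflow.likelihoods.Gaussian(), Z=X[:20].copy())

# gamma=0 runs plain Adam via the action Loop and then anchors the model.
run_with_adam_and_nat(model, lr=0.01, iterations=1000, gamma=0.0)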
Example #2
    def train_model(self, dgp_model):


        ng_vars = [[dgp_model.layers[-1].q_mu, dgp_model.layers[-1].q_sqrt]]
        for v in ng_vars[0]:
            v.set_trainable(False)
        ng_action = NatGradOptimizer(gamma=0.1).make_optimize_action(dgp_model, var_list=ng_vars)
        adam_action = AdamOptimizer(0.01).make_optimize_action(dgp_model)

        iterations = 10000
        try:
            Loop([ng_action, adam_action], stop=iterations)()
        except tf.errors.InvalidArgumentError:
            print('Failure of Cholesky in Nat Gradient')

        # sess = dgp_model.enquire_session()
        #
        # gamma_start = 1e-2
        # gamma_max = 1e-1
        # gamma_step = 1e-2
        #
        # gamma = tf.Variable(gamma_start, dtype=tf.float64)
        # gamma_incremented = tf.where(tf.less(gamma, gamma_max), gamma + gamma_step, gamma_max)
        #
        # op_ng = NatGradOptimizer(gamma).make_optimize_tensor(dgp_model, var_list=[[dgp_model.layers[-1].q_mu,
        #                                                                            dgp_model.layers[-1].q_sqrt]])
        # op_adam = AdamOptimizer(0.001).make_optimize_tensor(dgp_model)
        # op_increment_gamma = tf.assign(gamma, gamma_incremented)
        #
        # gamma_fallback = 1e-1  # we'll reduce by this factor if there's a cholesky failure
        # op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)
        #
        # sess.run(tf.variables_initializer([gamma]))
        #
        # iterations = 10000
        # for it in range(iterations):
        #     try:
        #         sess.run(op_ng)
        #         sess.run(op_increment_gamma)
        #     except tf.errors.InvalidArgumentError:
        #         g = sess.run(gamma)
        #         print('gamma = {} on iteration {} is too big! Falling back to {}'.format(g, it, g * gamma_fallback))
        #         sess.run(op_fallback_gamma)
        #
        #     sess.run(op_adam)
        #
        #     if it % 1000 == 0:
        #         print('{} gamma={:.4f} ELBO={:.4f}'.format(it, *sess.run([gamma, dgp_model.likelihood_tensor])))
        #
        # dgp_model.anchor(sess)
        # # print(len(tf.all_variables()))
        # # print(len(tf.get_default_graph().get_operations()))
        sess = dgp_model.enquire_session()
        dgp_model.anchor(sess)
        print('ELBO={:.4f}'.format(*sess.run([dgp_model.likelihood_tensor])))
        return dgp_model
Example #3
    def test_2layer_vs_nat_grad(self):
        Ns, N, M = 5, 1, 50
        D_X, D_Y = 1, 1

        lik_var = 0.1

        X = np.random.uniform(size=(N, D_X))
        Y = np.random.uniform(size=(N, D_Y))
        Z = np.random.uniform(size=(M, D_X))  # inducing inputs live in input space (D_X == D_Y == 1 here)
        Xs = np.random.uniform(size=(Ns, D_X))

        Z[:N, :] = X[:M, :]

        def kerns():
            return [RBF(D_X, lengthscales=0.1), RBF(D_X, lengthscales=0.5)]

        layers_col = init_layers_linear(X, Y, Z, kerns())
        layers_ng = init_layers_linear(X, Y, Z, kerns())

        def lik():
            l = Gaussian()
            l.variance = lik_var
            return l

        last_layer = SGPR_Layer(layers_col[-1].kern,
                                layers_col[-1].feature.Z.read_value(), D_Y,
                                layers_col[-1].mean_function)

        layers_col = layers_col[:-1] + [last_layer]
        m_col = DGP_Collapsed(X, Y, lik(), layers_col)
        m_ng = DGP_Quad(X, Y, lik(), layers_ng, H=200)

        q_mu1 = np.random.randn(M, D_X)
        q_sqrt1 = np.random.randn(M, M)
        q_sqrt1 = np.tril(q_sqrt1)[None, :, :]

        for m in m_col, m_ng:
            m.layers[0].q_mu = q_mu1
            m.layers[0].q_sqrt = q_sqrt1

        p = [[m_ng.layers[-1].q_mu, m_ng.layers[-1].q_sqrt]]
        NatGradOptimizer(gamma=1.).minimize(m_ng, var_list=p, maxiter=1)

        assert_allclose(m_col.compute_log_likelihood(),
                        m_ng.compute_log_likelihood())
Example #4
    def fit(self, X, Y):
        """
        Optimize
        """
        if not self.model:
            self.init_model(ODVGP, X, Y)

        var_list = [[self.model.basis.a_beta, self.model.basis.L]]
        self.model.basis.a_beta.set_trainable(False)

        op_ng = NatGradOptimizer(SETTINGS.ng_stepsize).make_optimize_tensor(
            self.model, var_list=var_list)
        op_adam = AdamOptimizer(
            SETTINGS.adam_stepsize).make_optimize_tensor(self.model)
        for it in range(SETTINGS.iterations):
            self.sess.run(op_ng)
            self.sess.run(op_adam)

            if it % 50 == 0:
                print('Iter: {}, Loss:{:.4f}'.format(
                    it, self.sess.run(self.model.likelihood_tensor)))

        self.model.anchor(self.sess)
Example #5
def build_model(ARGS, X, Y, apply_name=True):

    if ARGS.mode == 'CVAE':

        layers = []
        for l in ARGS.configuration.split('_'):
            try:
                layers.append(int(l))
            except ValueError:
                pass  # skip non-numeric tokens in the configuration string

        with defer_build():
            name = 'CVAE' if apply_name else None
            model = CVAE(X, Y, 1, layers, batch_size=ARGS.minibatch_size, name=name)

        model.compile()

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, 0.98, staircase=True), dtype=tf.float64)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        model.train_op = lambda s: s.run([op_adam, op_increment])
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

        model.compile()

    else:
        N, D = X.shape

        # first layer inducing points
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            M_pad = ARGS.M - N
            Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

        #################################### layers
        P = np.linalg.svd(X, full_matrices=False)[2]
        # PX = P.copy()

        layers = []
        # quad_layers = []

        DX = D
        DY = 1

        D_in = D
        D_out = D
        with defer_build():
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for c, d in ARGS.configuration.split('_'):
                    if c == 'G':
                        num_gps = int(d)
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        def make_kern():
                            k = RBF(D_in, lengthscales=float(D_in) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX+1))

            kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))


            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X, Y, layers, lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'SGHMC':
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X, Y, layers, lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)


            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X, Y, layers, lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)



        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if ARGS.mode != 'SGHMC':
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization

            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay, staircase=True), dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)

            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            hmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    hmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            sghmc_optimizer = SGHMC(model, hmc_vars, hyper_train_op, 100)

            def train_op(s):
                s.run(op_increment)
                sghmc_optimizer.sghmc_step(s)
                sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = sghmc_optimizer
            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('hmc'):
                    sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='hmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            model.init_op = init_op
            model.global_step = global_step

    return model
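A hypothetical driver for the model returned above; ARGS, X and Y are as in the snippet, while the session and anchoring steps follow the usual GPflow 1.x pattern and are an assumption here, not part of the original code.

model = build_model(ARGS, X, Y)
sess = model.enquire_session()
model.init_op(sess)           # initialises global_step (and SGHMC state in that mode)
for _ in range(10000):        # iteration count is illustrative
    model.train_op(sess)      # train_op is a callable taking the session in this example
model.anchor(sess)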
Example #6
def build_model(ARGS, X, Y, apply_name=True):
    N, D = X.shape

    # first layer inducing points
    if N > ARGS.M:
        Z = kmeans2(X, ARGS.M, minit="points")[0]
    else:
        M_pad = ARGS.M - N
        Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

    #################################### layers
    P = np.linalg.svd(X, full_matrices=False)[2]

    layers = []

    DX = D
    DY = 1

    D_in = D
    D_out = D
    with defer_build():
        lik = Gaussian()
        lik.variance = ARGS.likelihood_variance

        if len(ARGS.configuration) > 0:
            for c, d in ARGS.configuration.split("_"):
                if c == "G":
                    num_gps = int(d)
                    A = np.zeros((D_in, D_out))
                    D_min = min(D_in, D_out)
                    A[:D_min, :D_min] = np.eye(D_min)
                    mf = Linear(A=A)
                    mf.b.set_trainable(False)

                    def make_kern():
                        k = RBF(D_in,
                                lengthscales=float(D_in)**0.5,
                                variance=1.0,
                                ARD=True)
                        k.variance.set_trainable(False)
                        return k

                    PP = np.zeros((D_out, num_gps))
                    PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                    ZZ = np.random.randn(ARGS.M, D_in)
                    ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                    kern = SharedMixedMok(make_kern(), W=PP)
                    inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                    l = GPLayer(kern,
                                inducing,
                                num_gps,
                                layer_num=len(layers),
                                mean_function=mf)
                    if ARGS.fix_linear is True:
                        kern.W.set_trainable(False)
                        mf.set_trainable(False)

                    layers.append(l)

                    D_in = D_out

                elif c == "L":
                    d = int(d)
                    D_in += d
                    encoder_dims = [
                        int(dim.strip())
                        for dim in ARGS.encoder_dims.split(",")
                    ]
                    layers.append(
                        LatentVariableLayer(d,
                                            XY_dim=DX + 1,
                                            encoder_dims=encoder_dims,
                                            qz_mode=ARGS.qz_mode))

        kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1.0, ARD=True)
        ZZ = np.random.randn(ARGS.M, D_in)
        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
        layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

        #################################### model
        name = "Model" if apply_name else None

        if ARGS.mode == "VI":
            model = DGP_VI(X,
                           Y,
                           layers,
                           lik,
                           minibatch_size=ARGS.minibatch_size,
                           name=name)

        elif ARGS.mode == "IWAE":
            model = DGP_IWVI(
                X=X,
                Y=Y,
                layers=layers,
                likelihood=lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                encoder_minibatch_size=ARGS.encoder_minibatch_size,
            )

        elif ARGS.mode == "CIWAE":
            model = DGP_CIWAE(
                X,
                Y,
                layers,
                lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                beta=ARGS.beta,
            )

        else:
            raise ValueError(f"Unknown mode {ARGS.mode}.")

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    for layer in model.layers[:-1]:
        if isinstance(layer, GPLayer):
            layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

    model.compile()

    #################################### optimization

    # Whether to train the final layer with the other parameters, using Adam, or by itself, using natural
    # gradients.
    if ARGS.use_nat_grad_for_final_layer:
        # Turn off training so the parameters are not optimised by Adam. We pass them directly to the natgrad
        # optimiser, which bypasses this flag.
        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(
            tf.train.exponential_decay(ARGS.gamma,
                                       global_step,
                                       1000,
                                       ARGS.gamma_decay,
                                       staircase=True),
            dtype=tf.float64,
        )
        final_layer_vars = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
        final_layer_opt_op = NatGradOptimizer(
            gamma=gamma).make_optimize_tensor(model, var_list=final_layer_vars)
    else:
        final_layer_opt_op = NoOp()

    lr = tf.cast(
        tf.train.exponential_decay(ARGS.lr,
                                   global_step,
                                   decay_steps=1000,
                                   decay_rate=ARGS.lr_decay,
                                   staircase=True),
        dtype=tf.float64,
    )

    encoder_lr = tf.cast(
        tf.train.exponential_decay(
            ARGS.encoder_lr,
            global_step,
            decay_steps=1000,
            decay_rate=ARGS.encoder_lr_decay,
            staircase=True,
        ),
        dtype=tf.float64,
    )

    dreg_optimizer = DregOptimizer(
        enable_dreg=ARGS.use_dreg,
        optimizer=ARGS.optimizer,
        encoder_optimizer=ARGS.encoder_optimizer,
        learning_rate=lr,
        encoder_learning_rate=encoder_lr,
        assert_no_nans=ARGS.assert_no_nans,
        encoder_grad_clip_value=ARGS.clip_encoder_grads,
    )
    other_layers_opt_op = dreg_optimizer.make_optimize_tensor(model)

    model.lr = lr
    model.train_op = tf.group(op_increment, final_layer_opt_op,
                              other_layers_opt_op)
    model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
    model.global_step = global_step

    return model
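Unlike the previous example, train_op here is a grouped TensorFlow op rather than a callable, so a hypothetical training loop would run it through the session directly (the loop itself is an assumption):

model = build_model(ARGS, X, Y)
sess = model.enquire_session()
model.init_op(sess)
for _ in range(10000):        # iteration count is illustrative
    sess.run(model.train_op)
model.anchor(sess)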
Example #7
    def build_model(self,
                    ARGS,
                    X,
                    Y,
                    conditioning=False,
                    apply_name=True,
                    noise_var=None,
                    mean_function=None):

        if conditioning == False:
            N, D = X.shape

            # first layer inducing points
            if N > ARGS.M:
                Z = kmeans2(X, ARGS.M, minit='points')[0]
            else:
                # This is the old way of initializing Zs
                # M_pad = ARGS.M - N
                # Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

                # This is the new way of initializing Zs
                min_x, max_x = self.bounds[0]
                min_x = (min_x - self.x_mean) / self.x_std
                max_x = (max_x - self.x_mean) / self.x_std

                Z = np.linspace(min_x, max_x, num=ARGS.M)  # * X.shape[1])
                Z = Z.reshape((-1, X.shape[1]))
                #print(min_x)
                #print(max_x)
                #print(Z)

            #################################### layers
            P = np.linalg.svd(X, full_matrices=False)[2]
            # PX = P.copy()

            layers = []
            # quad_layers = []

            DX = D
            DY = 1

            D_in = D
            D_out = D

            with defer_build():

                # variance initialization
                lik = Gaussian()
                lik.variance = ARGS.likelihood_variance

                if len(ARGS.configuration) > 0:
                    for c, d in ARGS.configuration.split('_'):
                        if c == 'G':
                            num_gps = int(d)
                            A = np.zeros((D_in, D_out))
                            D_min = min(D_in, D_out)
                            A[:D_min, :D_min] = np.eye(D_min)
                            mf = Linear(A=A)
                            mf.b.set_trainable(False)

                            def make_kern():
                                k = RBF(D_in,
                                        lengthscales=float(D_in)**0.5,
                                        variance=1.,
                                        ARD=True)
                                k.variance.set_trainable(False)
                                return k

                            PP = np.zeros((D_out, num_gps))
                            PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                            ZZ = np.random.randn(ARGS.M, D_in)
                            # print(Z.shape)
                            # print(ZZ.shape)
                            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                            kern = SharedMixedMok(make_kern(), W=PP)
                            inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                            l = GPLayer(kern,
                                        inducing,
                                        num_gps,
                                        mean_function=mf)
                            if ARGS.fix_linear is True:
                                kern.W.set_trainable(False)
                                mf.set_trainable(False)

                            layers.append(l)

                            D_in = D_out

                        elif c == 'L':
                            d = int(d)
                            D_in += d
                            layers.append(LatentVariableLayer(d,
                                                              XY_dim=DX + 1))

                # kernel initialization
                kern = RBF(D_in,
                           lengthscales=float(D_in)**0.5,
                           variance=1.,
                           ARD=True)
                ZZ = np.random.randn(ARGS.M, D_in)
                ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
                layers.append(GPLayer(kern, InducingPoints(ZZ), DY))
                self.layers = layers
                self.lik = lik

            # global_step = tf.Variable(0, dtype=tf.int32)
            # self.global_step = global_step
        else:
            lik = self._gp.likelihood
            layers = self._gp.layers._list
            # val = self.session.run(self.global_step)
            # global_step = tf.Variable(val, dtype=tf.int32)
            # self.global_step = global_step
            self._gp.clear()

        with defer_build():

            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X,
                               Y,
                               layers,
                               lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'SGHMC':
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X,
                               Y,
                               layers,
                               lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X,
                                 Y,
                                 layers,
                                 lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if ARGS.mode != 'SGHMC':
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization

            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma,
                                                       global_step,
                                                       1000,
                                                       ARGS.gamma_decay,
                                                       staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr,
                                                    global_step,
                                                    1000,
                                                    ARGS.lr_decay,
                                                    staircase=True),
                         dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(
                model, var_list=var_list)

            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(
                tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            sghmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    sghmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            self.sghmc_optimizer = SGHMC(model, sghmc_vars, hyper_train_op,
                                         100)

            def train_op(s):
                s.run(op_increment)
                self.sghmc_optimizer.sghmc_step(s)
                self.sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = self.sghmc_optimizer

            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('sghmc'):
                    self.sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='sghmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            # Added jitter due to input matrix invertibility problems.
            # Note: as written these settings are never applied; they would need
            # e.g. a gpflow.settings.temp_settings(custom_config) context to take effect.
            custom_config = gpflow.settings.get_settings()
            custom_config.numerics.jitter_level = 1e-8

            model.init_op = init_op
            model.global_step = global_step

        # build the computation graph for the gradient
        self.X_placeholder = tf.placeholder(tf.float64,
                                            shape=[None, X.shape[1]])
        self.Fs, Fmu, Fvar = model._build_predict(self.X_placeholder)
        self.mean_grad = tf.gradients(Fmu, self.X_placeholder)
        self.var_grad = tf.gradients(Fvar, self.X_placeholder)

        # calculate the gradient of the mean for the quantile-filtered distribution
        # print(Fs)
        # q = np.quantile(Fs, self.quantile, axis=0)
        # qFs = [f for f in Fs if f < q]
        # q_mean = np.mean(qFs, axis=0)
        # q_var = np.var(qFs, axis=0)
        # self.qmean_grad = tf.gradients(q_mean, self.X_placeholder)
        # self.qvar_grad = tf.gradients(q_var, self.X_placeholder)

        return model
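A hedged sketch of querying the gradient graph built at the end of build_model; obj (an instance of this class) and x_query (an array with the same number of columns as X) are hypothetical names.

model = obj.build_model(ARGS, X, Y)
sess = model.enquire_session()
dmean_dx = sess.run(obj.mean_grad, feed_dict={obj.X_placeholder: x_query})
dvar_dx = sess.run(obj.var_grad, feed_dict={obj.X_placeholder: x_query})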
Example #8
        def test_vs_DGP2(self):
            lik = Gaussian()
            lik_var = 0.1
            lik.variance = lik_var
            N, Ns, D_Y, D_X = self.X.shape[0], self.Xs.shape[
                0], self.D_Y, self.X.shape[1]

            q_mu = np.random.randn(N, D_X)

            Y = np.random.randn(N, D_Y)
            Ys = np.random.randn(Ns, D_Y)

            kern1 = Matern52(self.X.shape[1], lengthscales=0.5)
            kern2 = Matern52(self.X.shape[1], lengthscales=0.5)
            kerns = [kern1, kern2]
            # mf = Linear(A=np.random.randn(D_X, D_Y), b=np.random.randn(D_Y))

            mf = Zero()
            m_dgp = DGP(self.X,
                        Y,
                        self.X,
                        kerns,
                        lik,
                        mean_function=mf,
                        white=True)
            m_dgp.layers[0].q_mu = q_mu
            m_dgp.layers[0].q_sqrt = m_dgp.layers[0].q_sqrt.read_value(
            ) * 1e-24

            Fs, ms, vs = m_dgp.predict_all_layers(self.Xs, 1)
            Z = self.X.copy()
            Z[:len(self.Xs)] = ms[0][0]
            m_dgp.layers[
                1].feature.Z = Z  # need to put the inducing points in the right place

            var_list = [[m_dgp.layers[1].q_mu, m_dgp.layers[1].q_sqrt]]
            NatGradOptimizer(gamma=1).minimize(m_dgp,
                                               var_list=var_list,
                                               maxiter=1)

            mean_dgp, var_dgp = m_dgp.predict_y(self.Xs, 1)
            test_lik_dgp = m_dgp.predict_density(self.Xs, Ys, 1)
            pred_m_dgp, pred_v_gpr = m_dgp.predict_f(self.Xs, 1)
            pred_mfull_dgp, pred_vfull_dgp = m_dgp.predict_f_full_cov(
                self.Xs, 1)

            # mean_functions = [Identity(), mf]
            layer0 = GPMC_Layer(kerns[0], self.X.copy(), D_X, Identity())
            layer1 = GPR_Layer(kerns[1], mf, D_Y)

            m_heinonen = DGP_Heinonen(self.X, Y, lik, [layer0, layer1])

            m_heinonen.layers[0].q_mu = q_mu

            mean_heinonen, var_heinonen = m_heinonen.predict_y(self.Xs, 1)
            test_lik_heinonen = m_heinonen.predict_density(self.Xs, Ys, 1)
            pred_m_heinonen, pred_v_heinonen = m_heinonen.predict_f(self.Xs, 1)
            pred_mfull_heinonen, pred_vfull_heinonen = m_heinonen.predict_f_full_cov(
                self.Xs, 1)

            tol = 1e-4
            assert_allclose(mean_dgp, mean_heinonen, atol=tol, rtol=tol)
            assert_allclose(test_lik_dgp,
                            test_lik_heinonen,
                            atol=tol,
                            rtol=tol)
            assert_allclose(pred_m_dgp, pred_m_heinonen, atol=tol, rtol=tol)
            assert_allclose(pred_mfull_dgp,
                            pred_mfull_heinonen,
                            atol=tol,
                            rtol=tol)
            assert_allclose(pred_vfull_dgp,
                            pred_vfull_heinonen,
                            atol=tol,
                            rtol=tol)
Example #9
def train_with_nat(model,
                   gamma_start=1e-5,
                   gamma_add=1e-3,
                   gamma_mul=1.04,
                   gamma_max=0.1,
                   gamma_fallback=1e-1,
                   iterations=500,
                   var_list=None,
                   callback=None,
                   **kwargs):
    # we'll make use of this later when we use a XiTransform
    if var_list is None:
        var_list = [[model.q_mu, model.q_sqrt]]

    with tf.variable_scope("gamma"):

        gamma_start = tf.cast(gamma_start, tf.float64)
        gamma_max = tf.cast(gamma_max, tf.float64)
        mul_step = tf.cast(gamma_mul, tf.float64)
        add_step = tf.cast(gamma_add, tf.float64)
        gamma = tf.Variable(gamma_start, dtype=tf.float64, trainable=False)

        gamma_ref = tf.identity(gamma)

        gamma_fallback = tf.cast(
            gamma_fallback, tf.float64
        )  # we'll reduce by this factor if there's a cholesky failure
        op_fallback_gamma = tf.assign(gamma, gamma_ref * gamma_fallback)
        diff = tf.where(gamma_ref * mul_step < add_step, gamma_ref * mul_step,
                        add_step)
        op_gamma_inc = tf.assign(
            gamma,
            tf.where(gamma_ref + diff > gamma_max, gamma_max,
                     gamma_ref + diff))

    tf.summary.scalar("optimisation/gamma", gamma)
    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='gamma')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    natgrad = NatGradOptimizer(gamma_ref).make_optimize_action(
        model, var_list=var_list)

    actions = [natgrad, GammaSchedule(op_gamma_inc)]
    actions = actions if callback is None else actions + callback

    for c in (callback or []):
        try:
            c.init()
        except AttributeError:
            pass  # callbacks without an init() hook are simply skipped

    sess = model.enquire_session()
    it = 0
    while it < iterations:
        try:
            looper = Loop(actions, start=it, stop=iterations)
            looper()
            it = looper.iteration
        except tf.errors.InvalidArgumentError:
            it = looper.iteration
            g, gf = sess.run([gamma_ref, op_fallback_gamma])
            logging.info(
                'gamma = {} on iteration {} is too big! Falling back to {}'.
                format(g, it, gf))

    model.anchor(model.enquire_session())
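GammaSchedule is used above but not defined in this snippet. A minimal sketch of what it might look like, assuming it is a GPflow 1.x gpflow.actions.Action that advances gamma once per loop iteration:

from gpflow.actions import Action

class GammaSchedule(Action):
    def __init__(self, op_increment_gamma):
        self.op_increment_gamma = op_increment_gamma

    def run(self, ctx):
        # nudge gamma towards gamma_max once per optimisation step
        ctx.session.run(self.op_increment_gamma)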
Example #10
def train_with_nat_and_adam(model,
                            initial_learning_rate=0.03,
                            learning_rate_steps=2,
                            learning_rate_decay=1.5,
                            gamma_start=1e-5,
                            gamma_add=1e-3,
                            gamma_mul=1.1,
                            gamma_max=0.1,
                            gamma_fallback=1e-1,
                            iterations=500,
                            var_list=None,
                            callback=None,
                            **kwargs):
    # we'll make use of this later when we use a XiTransform
    if var_list is None:
        var_list = [[model.q_mu, model.q_sqrt]]

    # we don't want adam optimizing these
    model.q_mu.set_trainable(False)
    model.q_sqrt.set_trainable(False)

    with tf.variable_scope("learning_rate"):
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = initial_learning_rate
        decay_steps = int(iterations / learning_rate_steps)
        decay_rate = 1. / learning_rate_decay
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   tf.assign_add(
                                                       global_step, 1),
                                                   decay_steps,
                                                   decay_rate,
                                                   staircase=True)
    tf.summary.scalar("optimisation/learning_rate", learning_rate)
    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope='learning_rate')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    with tf.variable_scope("gamma"):

        #        gamma = tf.Variable(gamma_start, dtype=tf.float64)
        #        beta = tf.Variable(1.,dtype=tf.float64)

        gamma_start = tf.cast(gamma_start, tf.float64)
        gamma_max = tf.cast(gamma_max, tf.float64)
        mul_step = tf.cast(gamma_mul, tf.float64)
        add_step = tf.cast(gamma_add, tf.float64)
        gamma = tf.Variable(gamma_start, dtype=tf.float64)

        gamma_ref = tf.identity(gamma)

        gamma_fallback = tf.cast(
            gamma_fallback, tf.float64
        )  # we'll reduce by this factor if there's a cholesky failure
        op_fallback_gamma = tf.assign(gamma, gamma * gamma_fallback)
        diff = tf.where(gamma_ref * mul_step < add_step, gamma_ref * mul_step,
                        add_step)
        op_gamma_inc = tf.assign(
            gamma,
            tf.where(gamma_ref + diff > gamma_max, gamma_max,
                     gamma_ref + diff))

    tf.summary.scalar("optimisation/gamma", gamma)
    sess = model.enquire_session()
    tf_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='gamma')
    sess.run(tf.variables_initializer(var_list=tf_vars))

    natgrad = NatGradOptimizer(gamma_ref).make_optimize_action(
        model, var_list=var_list)
    adam = AdamOptimizer(learning_rate).make_optimize_action(model)

    actions = [adam, natgrad, GammaSchedule(op_gamma_inc)]
    actions = actions if callback is None else actions + callback
    for c in (callback or []):
        try:
            c.init()
        except AttributeError:
            pass  # callbacks without an init() hook are simply skipped

    sess = model.enquire_session()
    it = 0
    while it < iterations:
        try:
            looper = Loop(actions, start=it, stop=iterations)
            looper()
            it = looper.iteration
        except tf.errors.InvalidArgumentError:
            it = looper.iteration
            g, gf = sess.run([gamma_ref, op_fallback_gamma])
            logging.info(
                'gamma = {} on iteration {} is too big! Falling back to {}'.
                format(g, it, gf))

    model.anchor(model.enquire_session())
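A minimal usage sketch for train_with_nat_and_adam, assuming GPflow 1.x, a plain SVGP model, and a GammaSchedule action along the lines of the sketch after Example #9; the data and hyperparameters below are illustrative.

import numpy as np
import gpflow

X = np.random.rand(200, 1)
Y = np.sin(6 * X) + 0.1 * np.random.randn(200, 1)
model = gpflow.models.SVGP(X, Y, gpflow.kernels.RBF(1),
                           gpflow.likelihoods.Gaussian(), Z=X[:30].copy())

# q_mu / q_sqrt are handled by natural gradients; Adam trains the remaining
# hyperparameters with an exponentially decaying learning rate.
train_with_nat_and_adam(model, initial_learning_rate=0.01, iterations=1000)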