# ---- Example 1 ----
def build_model(ARGS, X, Y, apply_name=True):
    """Build a deep GP model (VI / IWAE / CIWAE) and attach training ops.

    Args:
        ARGS: hyperparameter namespace (M, configuration, mode, lr,
            likelihood_variance, gamma, encoder settings, ...).
        X: (N, D) array of training inputs.
        Y: training targets (single output; DY is fixed to 1 below).
        apply_name: if True, name the model "Model".

    Returns:
        The compiled model, with `train_op`, `init_op`, `global_step`
        and `lr` attributes attached for the training loop.

    Raises:
        ValueError: if ARGS.mode is not one of "VI", "IWAE", "CIWAE".
    """
    N, D = X.shape

    # First-layer inducing points: k-means centres when there is more data
    # than inducing points, otherwise the data itself padded with noise.
    if N > ARGS.M:
        Z = kmeans2(X, ARGS.M, minit="points")[0]
    else:
        M_pad = ARGS.M - N
        Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

    #################################### layers
    # Right singular vectors of X, used to initialise the multi-output
    # kernel's mixing matrix so GP outputs start in the data's subspace.
    P = np.linalg.svd(X, full_matrices=False)[2]

    layers = []

    DX = D  # original input dimension
    DY = 1  # output dimension

    D_in = D
    D_out = D
    with defer_build():
        lik = Gaussian()
        lik.variance = ARGS.likelihood_variance

        if len(ARGS.configuration) > 0:
            for spec in ARGS.configuration.split("_"):
                # First character selects the layer type ("G" = GP layer,
                # "L" = latent-variable layer); the remainder is its size.
                # Slicing (rather than `for c, d in ...` tuple unpacking)
                # also supports multi-digit sizes such as "G16".
                c, d = spec[0], spec[1:]
                if c == "G":
                    num_gps = int(d)
                    # Identity-like linear mean function so the layer is
                    # initialised close to a skip connection.
                    A = np.zeros((D_in, D_out))
                    D_min = min(D_in, D_out)
                    A[:D_min, :D_min] = np.eye(D_min)
                    mf = Linear(A=A)
                    mf.b.set_trainable(False)

                    # Bind D_in as a default argument to avoid the
                    # late-binding-closure pitfall for loop-defined
                    # functions (it is called immediately, but being
                    # explicit is safer).
                    def make_kern(dim=D_in):
                        k = RBF(dim,
                                lengthscales=float(dim)**0.5,
                                variance=1.0,
                                ARD=True)
                        k.variance.set_trainable(False)
                        return k

                    # Mixing matrix initialised from the SVD of X;
                    # inducing inputs initialised from the first-layer Z.
                    PP = np.zeros((D_out, num_gps))
                    PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                    ZZ = np.random.randn(ARGS.M, D_in)
                    ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                    kern = SharedMixedMok(make_kern(), W=PP)
                    inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                    l = GPLayer(kern,
                                inducing,
                                num_gps,
                                layer_num=len(layers),
                                mean_function=mf)
                    if ARGS.fix_linear is True:
                        kern.W.set_trainable(False)
                        mf.set_trainable(False)

                    layers.append(l)

                    D_in = D_out

                elif c == "L":
                    d = int(d)
                    # Latent variables are concatenated onto the input.
                    D_in += d
                    encoder_dims = [
                        int(dim.strip())
                        for dim in ARGS.encoder_dims.split(",")
                    ]
                    layers.append(
                        LatentVariableLayer(d,
                                            XY_dim=DX + 1,
                                            encoder_dims=encoder_dims,
                                            qz_mode=ARGS.qz_mode))

        # Final single-output GP layer.
        kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1.0, ARD=True)
        ZZ = np.random.randn(ARGS.M, D_in)
        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
        layers.append(GPLayer(kern, InducingPoints(ZZ), DY))

        #################################### model
        name = "Model" if apply_name else None

        if ARGS.mode == "VI":
            model = DGP_VI(X,
                           Y,
                           layers,
                           lik,
                           minibatch_size=ARGS.minibatch_size,
                           name=name)

        elif ARGS.mode == "IWAE":
            model = DGP_IWVI(
                X=X,
                Y=Y,
                layers=layers,
                likelihood=lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                encoder_minibatch_size=ARGS.encoder_minibatch_size,
            )

        elif ARGS.mode == "CIWAE":
            model = DGP_CIWAE(
                X,
                Y,
                layers,
                lik,
                minibatch_size=ARGS.minibatch_size,
                num_samples=ARGS.num_IW_samples,
                name=name,
                beta=ARGS.beta,
            )

        else:
            raise ValueError(f"Unknown mode {ARGS.mode}.")

    global_step = tf.Variable(0, dtype=tf.int32)
    op_increment = tf.assign_add(global_step, 1)

    # Shrink the initial q_sqrt of all but the final layer so the deep GP
    # starts close to a deterministic function.
    for layer in model.layers[:-1]:
        if isinstance(layer, GPLayer):
            layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

    model.compile()

    #################################### optimization

    # Whether to train the final layer with the other parameters, using Adam, or by itself, using natural
    # gradients.
    if ARGS.use_nat_grad_for_final_layer:
        # Turn off training so the parameters are not optimised by Adam. We pass them directly to the natgrad
        # optimiser, which bypasses this flag.
        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        gamma = tf.cast(
            tf.train.exponential_decay(ARGS.gamma,
                                       global_step,
                                       1000,
                                       ARGS.gamma_decay,
                                       staircase=True),
            dtype=tf.float64,
        )
        final_layer_vars = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]
        final_layer_opt_op = NatGradOptimizer(
            gamma=gamma).make_optimize_tensor(model, var_list=final_layer_vars)
    else:
        final_layer_opt_op = NoOp()

    # Step-decayed learning rates for the model and (separately) the encoder.
    lr = tf.cast(
        tf.train.exponential_decay(ARGS.lr,
                                   global_step,
                                   decay_steps=1000,
                                   decay_rate=ARGS.lr_decay,
                                   staircase=True),
        dtype=tf.float64,
    )

    encoder_lr = tf.cast(
        tf.train.exponential_decay(
            ARGS.encoder_lr,
            global_step,
            decay_steps=1000,
            decay_rate=ARGS.encoder_lr_decay,
            staircase=True,
        ),
        dtype=tf.float64,
    )

    dreg_optimizer = DregOptimizer(
        enable_dreg=ARGS.use_dreg,
        optimizer=ARGS.optimizer,
        encoder_optimizer=ARGS.encoder_optimizer,
        learning_rate=lr,
        encoder_learning_rate=encoder_lr,
        assert_no_nans=ARGS.assert_no_nans,
        encoder_grad_clip_value=ARGS.clip_encoder_grads,
    )
    other_layers_opt_op = dreg_optimizer.make_optimize_tensor(model)

    model.lr = lr
    model.train_op = tf.group(op_increment, final_layer_opt_op,
                              other_layers_opt_op)
    model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
    model.global_step = global_step

    return model
# ---- Example 2 ----
def build_model(ARGS, X, Y, apply_name=True):
    """Build a CVAE baseline or a deep GP model (VI / SGHMC / IWAE).

    Args:
        ARGS: hyperparameter namespace (mode, M, configuration, lr,
            likelihood_variance, gamma, ...).
        X: (N, D) array of training inputs.
        Y: training targets (single output; DY is fixed to 1 below).
        apply_name: if True, give the model a fixed name.

    Returns:
        The compiled model, with `train_op`, `init_op` and `global_step`
        attributes attached for the training loop.

    Raises:
        ValueError: if ARGS.mode is not one of 'CVAE', 'VI', 'SGHMC',
            'IWAE'.
    """
    if ARGS.mode == 'CVAE':

        # Hidden-layer widths, e.g. "100_100" -> [100, 100]; tokens that
        # are not integers are skipped deliberately.
        layers = []
        for l in ARGS.configuration.split('_'):
            try:
                layers.append(int(l))
            except ValueError:
                pass

        with defer_build():
            name = 'CVAE' if apply_name else None
            model = CVAE(X, Y, 1, layers, batch_size=ARGS.minibatch_size, name=name)

        model.compile()

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, 0.98, staircase=True), dtype=tf.float64)
        op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

        model.train_op = lambda s: s.run([op_adam, op_increment])
        model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
        model.global_step = global_step

        # NOTE(review): the model was already compiled above; this second
        # compile() looks redundant — confirm it has no required side
        # effect before removing.
        model.compile()

    else:
        N, D = X.shape

        # First-layer inducing points: k-means centres when there is more
        # data than inducing points, otherwise the data padded with noise.
        if N > ARGS.M:
            Z = kmeans2(X, ARGS.M, minit='points')[0]
        else:
            M_pad = ARGS.M - N
            Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

        #################################### layers
        # Right singular vectors of X, used to initialise the multi-output
        # kernel's mixing matrix.
        P = np.linalg.svd(X, full_matrices=False)[2]

        layers = []

        DX = D  # original input dimension
        DY = 1  # output dimension

        D_in = D
        D_out = D
        with defer_build():
            lik = Gaussian()
            lik.variance = ARGS.likelihood_variance

            if len(ARGS.configuration) > 0:
                for spec in ARGS.configuration.split('_'):
                    # First character selects the layer type ('G' = GP,
                    # 'L' = latent variable); the remainder is its size.
                    # Slicing supports multi-digit sizes such as "G16",
                    # which tuple unpacking could not.
                    c, d = spec[0], spec[1:]
                    if c == 'G':
                        num_gps = int(d)
                        # Identity-like linear mean function so the layer
                        # starts close to a skip connection.
                        A = np.zeros((D_in, D_out))
                        D_min = min(D_in, D_out)
                        A[:D_min, :D_min] = np.eye(D_min)
                        mf = Linear(A=A)
                        mf.b.set_trainable(False)

                        # D_in bound as a default to avoid the late-binding
                        # closure pitfall for loop-defined functions.
                        def make_kern(dim=D_in):
                            k = RBF(dim, lengthscales=float(dim) ** 0.5, variance=1., ARD=True)
                            k.variance.set_trainable(False)
                            return k

                        PP = np.zeros((D_out, num_gps))
                        PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                        ZZ = np.random.randn(ARGS.M, D_in)
                        ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                        kern = SharedMixedMok(make_kern(), W=PP)
                        inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                        l = GPLayer(kern, inducing, num_gps, mean_function=mf)
                        if ARGS.fix_linear is True:
                            kern.W.set_trainable(False)
                            mf.set_trainable(False)

                        layers.append(l)

                        D_in = D_out

                    elif c == 'L':
                        d = int(d)
                        # Latent variables are concatenated onto the input.
                        D_in += d
                        layers.append(LatentVariableLayer(d, XY_dim=DX+1))

            # Final single-output GP layer.
            kern = RBF(D_in, lengthscales=float(D_in)**0.5, variance=1., ARD=True)
            ZZ = np.random.randn(ARGS.M, D_in)
            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
            layers.append(GPLayer(kern, InducingPoints(ZZ), DY))


            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X, Y, layers, lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'SGHMC':
                # Remove the variational covariance: SGHMC samples q_mu
                # directly, so only the mean is kept (untrainable by Adam).
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X, Y, layers, lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)


            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X, Y, layers, lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)

            else:
                # Previously an unknown mode fell through to a NameError on
                # `model`; fail fast with a clear message instead.
                raise ValueError(f"Unknown mode {ARGS.mode}.")

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if not ('SGHMC' == ARGS.mode):
            # Shrink initial q_sqrt of all but the final layer so the deep
            # GP starts close to a deterministic function.
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization

            # Final-layer variational parameters are trained by natural
            # gradients, so hide them from Adam.
            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma, global_step, 1000, ARGS.gamma_decay, staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr, global_step, 1000, ARGS.lr_decay, staircase=True), dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(model, var_list=var_list)

            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                # Order matters: natural-gradient step before the Adam step.
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            # Unconstrained q_mu tensors are the variables SGHMC samples.
            hmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    hmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            sghmc_optimizer = SGHMC(model, hmc_vars, hyper_train_op, 100)

            def train_op(s):
                s.run(op_increment)
                sghmc_optimizer.sghmc_step(s)
                sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = sghmc_optimizer

            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('hmc'):
                    sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='hmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            model.init_op = init_op
            model.global_step = global_step

    return model
# ---- Example 3 ----
    def build_model(self,
                    ARGS,
                    X,
                    Y,
                    conditioning=False,
                    apply_name=True,
                    noise_var=None,
                    mean_function=None):
        """Build (or rebuild) a deep GP model for data (X, Y).

        When `conditioning` is False the layers are constructed from
        scratch; when True, the existing layers/likelihood are taken from
        `self._gp` and only the model wrapper and training ops are rebuilt.

        Args:
            ARGS: hyperparameter namespace (mode, M, configuration, lr, ...).
            X: (N, D) array of training inputs.
            Y: training targets (single output; DY is fixed to 1 below).
            conditioning: reuse layers from `self._gp` instead of building.
            apply_name: if True, name the model 'Model'.
            noise_var: unused here — TODO confirm whether callers rely on it.
            mean_function: unused here — TODO confirm whether callers rely
                on it.

        Returns:
            The compiled model with `train_op`, `init_op` and `global_step`
            attributes attached; also sets `self.X_placeholder`, `self.Fs`,
            `self.mean_grad` and `self.var_grad` gradient graph nodes.

        Raises:
            ValueError: if ARGS.mode is not one of 'VI', 'SGHMC', 'IWAE'.
        """
        if not conditioning:
            N, D = X.shape

            # First-layer inducing points: k-means centres when there is
            # more data than inducing points.
            if N > ARGS.M:
                Z = kmeans2(X, ARGS.M, minit='points')[0]
            else:
                # This is the old way of initializing Zs
                # M_pad = ARGS.M - N
                # Z = np.concatenate([X.copy(), np.random.randn(M_pad, D)], 0)

                # New way: spread Z evenly over the (normalised) input bounds.
                min_x, max_x = self.bounds[0]
                min_x = (min_x - self.x_mean) / self.x_std
                max_x = (max_x - self.x_mean) / self.x_std

                Z = np.linspace(min_x, max_x, num=ARGS.M)  # * X.shape[1])
                Z = Z.reshape((-1, X.shape[1]))

            #################################### layers
            # Right singular vectors of X, used to initialise the
            # multi-output kernel's mixing matrix.
            P = np.linalg.svd(X, full_matrices=False)[2]

            layers = []

            DX = D  # original input dimension
            DY = 1  # output dimension

            D_in = D
            D_out = D

            with defer_build():

                # likelihood variance initialization
                lik = Gaussian()
                lik.variance = ARGS.likelihood_variance

                if len(ARGS.configuration) > 0:
                    for spec in ARGS.configuration.split('_'):
                        # First character selects the layer type ('G' = GP,
                        # 'L' = latent variable); the rest is its size.
                        # Slicing supports multi-digit sizes such as "G16".
                        c, d = spec[0], spec[1:]
                        if c == 'G':
                            num_gps = int(d)
                            # Identity-like linear mean function so the
                            # layer starts close to a skip connection.
                            A = np.zeros((D_in, D_out))
                            D_min = min(D_in, D_out)
                            A[:D_min, :D_min] = np.eye(D_min)
                            mf = Linear(A=A)
                            mf.b.set_trainable(False)

                            # D_in bound as a default to avoid the
                            # late-binding closure pitfall.
                            def make_kern(dim=D_in):
                                k = RBF(dim,
                                        lengthscales=float(dim)**0.5,
                                        variance=1.,
                                        ARD=True)
                                k.variance.set_trainable(False)
                                return k

                            PP = np.zeros((D_out, num_gps))
                            PP[:, :min(num_gps, DX)] = P[:, :min(num_gps, DX)]
                            ZZ = np.random.randn(ARGS.M, D_in)
                            ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]

                            kern = SharedMixedMok(make_kern(), W=PP)
                            inducing = MixedKernelSharedMof(InducingPoints(ZZ))

                            l = GPLayer(kern,
                                        inducing,
                                        num_gps,
                                        mean_function=mf)
                            if ARGS.fix_linear is True:
                                kern.W.set_trainable(False)
                                mf.set_trainable(False)

                            layers.append(l)

                            D_in = D_out

                        elif c == 'L':
                            d = int(d)
                            # Latent variables are concatenated onto input.
                            D_in += d
                            layers.append(LatentVariableLayer(d,
                                                              XY_dim=DX + 1))

                # Final single-output GP layer.
                kern = RBF(D_in,
                           lengthscales=float(D_in)**0.5,
                           variance=1.,
                           ARD=True)
                ZZ = np.random.randn(ARGS.M, D_in)
                ZZ[:, :min(D_in, DX)] = Z[:, :min(D_in, DX)]
                layers.append(GPLayer(kern, InducingPoints(ZZ), DY))
                self.layers = layers
                self.lik = lik

        else:
            # Reuse the previous model's likelihood and layers, then clear
            # its graph so they can be rebuilt into a new model.
            lik = self._gp.likelihood
            layers = self._gp.layers._list
            self._gp.clear()

        with defer_build():

            #################################### model
            name = 'Model' if apply_name else None

            if ARGS.mode == 'VI':
                model = DGP_VI(X,
                               Y,
                               layers,
                               lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'SGHMC':
                # Remove the variational covariance: SGHMC samples q_mu
                # directly, so only the mean is kept (untrainable by Adam).
                for layer in layers:
                    if hasattr(layer, 'q_sqrt'):
                        del layer.q_sqrt
                        layer.q_sqrt = None
                        layer.q_mu.set_trainable(False)

                model = DGP_VI(X,
                               Y,
                               layers,
                               lik,
                               minibatch_size=ARGS.minibatch_size,
                               name=name)

            elif ARGS.mode == 'IWAE':
                model = DGP_IWVI(X,
                                 Y,
                                 layers,
                                 lik,
                                 minibatch_size=ARGS.minibatch_size,
                                 num_samples=ARGS.num_IW_samples,
                                 name=name)

            else:
                # Previously an unknown mode fell through to a NameError on
                # `model`; fail fast with a clear message instead.
                raise ValueError(f"Unknown mode {ARGS.mode}.")

        global_step = tf.Variable(0, dtype=tf.int32)
        op_increment = tf.assign_add(global_step, 1)

        if not ('SGHMC' == ARGS.mode):
            # Shrink initial q_sqrt of all but the final layer so the deep
            # GP starts close to a deterministic function.
            for layer in model.layers[:-1]:
                if isinstance(layer, GPLayer):
                    layer.q_sqrt = layer.q_sqrt.read_value() * 1e-5

            model.compile()

            #################################### optimization

            # Final-layer variational parameters are trained by natural
            # gradients, so hide them from Adam.
            var_list = [[model.layers[-1].q_mu, model.layers[-1].q_sqrt]]

            model.layers[-1].q_mu.set_trainable(False)
            model.layers[-1].q_sqrt.set_trainable(False)

            gamma = tf.cast(tf.train.exponential_decay(ARGS.gamma,
                                                       global_step,
                                                       1000,
                                                       ARGS.gamma_decay,
                                                       staircase=True),
                            dtype=tf.float64)
            lr = tf.cast(tf.train.exponential_decay(ARGS.lr,
                                                    global_step,
                                                    1000,
                                                    ARGS.lr_decay,
                                                    staircase=True),
                         dtype=tf.float64)

            op_ng = NatGradOptimizer(gamma=gamma).make_optimize_tensor(
                model, var_list=var_list)

            op_adam = AdamOptimizer(lr).make_optimize_tensor(model)

            def train(s):
                # Order matters: natural-gradient step before the Adam step.
                s.run(op_increment)
                s.run(op_ng)
                s.run(op_adam)

            model.train_op = train
            model.init_op = lambda s: s.run(
                tf.variables_initializer([global_step]))
            model.global_step = global_step

        else:
            model.compile()

            # Unconstrained q_mu tensors are the variables SGHMC samples.
            sghmc_vars = []
            for layer in layers:
                if hasattr(layer, 'q_mu'):
                    sghmc_vars.append(layer.q_mu.unconstrained_tensor)

            hyper_train_op = AdamOptimizer(ARGS.lr).make_optimize_tensor(model)

            self.sghmc_optimizer = SGHMC(model, sghmc_vars, hyper_train_op,
                                         100)

            def train_op(s):
                s.run(op_increment)
                self.sghmc_optimizer.sghmc_step(s)
                self.sghmc_optimizer.train_hypers(s)

            model.train_op = train_op
            model.sghmc_optimizer = self.sghmc_optimizer

            def init_op(s):
                epsilon = 0.01
                mdecay = 0.05
                with tf.variable_scope('sghmc'):
                    self.sghmc_optimizer.generate_update_step(epsilon, mdecay)
                v = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='sghmc')
                s.run(tf.variables_initializer(v))
                s.run(tf.variables_initializer([global_step]))

            # Added jitter due to input matrix invertibility problems.
            # NOTE(review): this settings object is created but never
            # applied (e.g. via gpflow.settings.temp_settings) — confirm
            # whether it is dead code or should wrap the session.
            custom_config = gpflow.settings.get_settings()
            custom_config.numerics.jitter_level = 1e-8

            model.init_op = init_op
            model.global_step = global_step

        # Build the computation graph for predictive mean/variance
        # gradients w.r.t. the inputs.
        self.X_placeholder = tf.placeholder(tf.float64,
                                            shape=[None, X.shape[1]])
        self.Fs, Fmu, Fvar = model._build_predict(self.X_placeholder)
        self.mean_grad = tf.gradients(Fmu, self.X_placeholder)
        self.var_grad = tf.gradients(Fvar, self.X_placeholder)

        return model