Пример #1
0
 def sample_from_chain(self, X_d, X_c=None, X_m=None, loop_iters=5, \
         sigma_scale=None):
     """
     Sample for several rounds through the I<->G loop, initialized with the
     the "data variable" samples in X_d.
     """
     data_samples = []
     prior_samples = []
     if X_c is None:
         X_c = 0.0 * X_d
     if X_m is None:
         X_m = 0.0 * X_d
     if sigma_scale is None:
         sigma_scale = 1.0
     # set sigma_scale on our InfNet
     old_scale = self.q_z_given_x.sigma_scale.get_value(borrow=False)
     self.q_z_given_x.set_sigma_scale(sigma_scale)
     for i in range(loop_iters):
         # apply mask, mixing foreground and background data
         X_d = apply_mask(Xd=X_d, Xc=X_c, Xm=X_m)
         # record the data samples for this iteration
         data_samples.append(1.0 * X_d)
         # sample from their inferred posteriors
         X_p = self.q_z_given_x.sample_posterior(X_d)
         # record the sampled points (in the "prior space")
         prior_samples.append(1.0 * X_p)
         # get next data samples by transforming the prior-space points
         X_d = self.transform_z_to_x(X_p)
     # reset sigma_scale on our InfNet
     self.q_z_given_x.set_sigma_scale(old_scale[0])
     result = {"data samples": data_samples, "prior samples": prior_samples}
     return result
Пример #2
0
 def sample_from_chain(self, X_d, X_c=None, X_m=None, loop_iters=5, \
         sigma_scale=None):
     """
     Sample for several rounds through the I<->G loop, initialized with the
     the "data variable" samples in X_d.
     """
     data_samples = []
     prior_samples = []
     if X_c is None:
         X_c = 0.0 * X_d
     if X_m is None:
         X_m = 0.0 * X_d
     if sigma_scale is None:
         sigma_scale = 1.0
     # set sigma_scale on our InfNet
     old_scale = self.q_z_given_x.sigma_scale.get_value(borrow=False)
     self.q_z_given_x.set_sigma_scale(sigma_scale)
     for i in range(loop_iters):
         # apply mask, mixing foreground and background data
         X_d = apply_mask(Xd=X_d, Xc=X_c, Xm=X_m)
         # record the data samples for this iteration
         data_samples.append(1.0 * X_d)
         # sample from their inferred posteriors
         X_p = self.q_z_given_x.sample_posterior(X_d)
         # record the sampled points (in the "prior space")
         prior_samples.append(1.0 * X_p)
         # get next data samples by transforming the prior-space points
         X_d = self.transform_z_to_x(X_p)
     # reset sigma_scale on our InfNet
     self.q_z_given_x.set_sigma_scale(old_scale[0])
     result = {"data samples": data_samples, "prior samples": prior_samples}
     return result
Пример #3
0
    def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
                 i_net=None, g_net=None, d_net=None, chain_len=None, \
                 data_dim=None, prior_dim=None, params=None):
        # Do some stuff!
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        self.prior_mean = 0.0
        self.prior_logvar = 0.0
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'cost_decay' in self.params:
            self.cost_decay = self.params['cost_decay']
        else:
            self.cost_decay = 0.1
        if 'chain_type' in self.params:
            assert((self.params['chain_type'] == 'walkback') or \
                (self.params['chain_type'] == 'walkout'))
            self.chain_type = self.params['chain_type']
        else:
            self.chain_type = 'walkout'
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # symbolic var for inputting samples for initializing the VAE chain
        self.Xd = Xd
        # symbolic var for masking subsets of the state variables
        self.Xm = Xm
        # symbolic var for controlling subsets of the state variables
        self.Xc = Xc
        # symbolic var for inputting samples from the target distribution
        self.Xt = Xt
        # integer number of times to cycle the VAE loop
        self.chain_len = chain_len

        # symbolic matrix of indices for data inputs
        self.It = T.arange(self.Xt.shape[0])
        # symbolic matrix of indices for noise/generated inputs
        self.Id = T.arange(self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
                p_x_given_z=g_net, q_z_given_x=i_net, x_dim=self.data_dim, \
                z_dim=self.prior_dim, params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar
        # self-loop some clones of the main VAE into a chain.
        # ** All VAEs in the chain share the same Xc and Xm, which are the
        #    symbolic inputs for providing the observed portion of the input
        #    and a mask indicating which part of the input is "observed".
        #    These inputs are used for training "reconstruction" policies.
        self.IN_chain = []
        self.GN_chain = []
        self.Xg_chain = []
        _Xd = self.Xd
        print("Unrolling chain...")
        for i in range(self.chain_len):
            # create a VAE infer/generate pair with _Xd as input and with
            # masking variables shared by all VAEs in this chain
            _IN = self.IN.shared_param_clone(rng=rng, \
                    Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm), \
                    build_funcs=False)
            _GN = self.GN.shared_param_clone(rng=rng, Xd=_IN.output, \
                    build_funcs=False)
            _Xd = self.xt_transform(_GN.output_mean)
            self.IN_chain.append(_IN)
            self.GN_chain.append(_GN)
            self.Xg_chain.append(_Xd)
            print("    step {}...".format(i))

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain.
        self.DN = d_net.shared_param_clone(rng=rng, \
                Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='vcg_it_count')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        
        self.set_disc_weights()  # init adversarial cost weights for GN/DN
        # set a shared var for regularizing the output of the discriminator
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the VCGair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this VCGair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.in_params.append(self.OSM.output_logvar)
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based 
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat indepedently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
                self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # discriminator networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        print("Computing VCGLoop DN cost gradients...")
        grad_list = T.grad(self.dn_cost, self.dn_params, disconnected_inputs='warn')
        for i, p in enumerate(self.dn_params):
            self.joint_grads[p] = grad_list[i]
        print("Computing VCGLoop IN cost gradients...")
        grad_list = T.grad(self.osm_cost, self.in_params, disconnected_inputs='warn')
        for i, p in enumerate(self.in_params):
            self.joint_grads[p] = grad_list[i]
        print("Computing VCGLoop GN cost gradients...")
        grad_list = T.grad(self.osm_cost, self.gn_params, disconnected_inputs='warn')
        for i, p in enumerate(self.gn_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the discriminator, generator and 
        # inferencer networks. all networks share the same first/second
        # moment momentum and iteration count. the networks each have their
        # own learning rates, which lets you turn their learning on/off.
        self.dn_updates = get_param_updates(params=self.dn_params, \
                grads=self.joint_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.gn_updates = get_param_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.in_updates = get_param_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

        # bag up all the updates required for training
        self.joint_updates = OrderedDict()
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        # construct an update for tracking the mean KL divergence of
        # approximate posteriors for this chain
        new_kld_mean = (0.98 * self.IN.kld_mean) + ((0.02 / self.chain_len) * \
            sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain]))
        self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

        # construct the function for training on training data
        print("Compiling VCGLoop theano functions....")
        self.train_joint = self._construct_train_joint()
        return
Пример #4
0
    def __init__(self, rng=None, \
            Xd=None, Xc=None, Xm=None, \
            p_x_given_z=None, q_z_given_x=None, \
            x_dim=None, z_dim=None, \
            params=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        self.x_type = self.params['x_type']
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # record the dimensions of various spaces relevant to this model
        self.z_dim = z_dim
        self.x_dim = x_dim

        # set parameters for the isotropic Gaussian prior over z
        self.prior_mean = 0.0
        self.prior_logvar = 0.0

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this OneStageModel
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        self.batch_reps = T.lscalar()
        self.x = apply_mask(self.Xd, self.Xc, self.Xm)

        #####################################################################
        # Setup the computation graph that provides values in our objective #
        #####################################################################
        # inferencer model for latent prototypes given instances
        self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
        self.z = self.q_z_given_x.output
        self.z_mean = self.q_z_given_x.output_mean
        self.z_logvar = self.q_z_given_x.output_logvar
        # generator model for prototypes given latent prototypes
        self.p_x_given_z = p_x_given_z.shared_param_clone(rng=rng, Xd=self.z)
        self.xt = self.p_x_given_z.output_mean  # use deterministic output
        # construct the final output of generator, conditioned on z
        if self.x_type == 'bernoulli':
            self.xg = T.nnet.sigmoid(self.xt)
        else:
            self.xg = self.xt_transform(self.xt)

        # self.output_logvar modifies the output distribution
        self.output_logvar = self.p_x_given_z.sigma_layers[-1].b
        self.bounded_logvar = self.logvar_bound * \
                    T.tanh(self.output_logvar[0] / self.logvar_bound)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        self.lr_1 = theano.shared(value=zero_ary, name='osm_lr_1')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='osm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='osm_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='osm_it_count')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='osm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_1 = theano.shared(value=zero_ary, name='osm_lam_kld_1')
        self.lam_kld_2 = theano.shared(value=zero_ary, name='osm_lam_kld_2')
        self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='osm_lam_l2w')
        self.set_lam_l2w(1e-4)

        # Grab all of the "optimizable" parameters in "group 1"
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.p_x_given_z.mlp_params)
        # Make a joint list of parameters
        self.joint_params = self.group_1_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
        self.nll_cost = T.mean(self.nll_costs)
        self.kld_costs_1, self.kld_costs_2 = self._construct_kld_costs()
        self.kld_costs = (self.lam_kld_1[0] * self.kld_costs_1) + \
                (self.lam_kld_2[0] * self.kld_costs_2)
        self.kld_cost = T.mean(self.kld_costs)
        act_reg_cost, param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing OneStageModel cost gradients...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_param_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

        # Construct a function for jointly training the generator/inferencer
        print("Compiling OneStageModel theano functions...")
        self.train_joint = self._construct_train_joint()
        self.compute_fe_terms = self._construct_compute_fe_terms()
        self.compute_post_klds = self._construct_compute_post_klds()
        self.sample_from_prior = self._construct_sample_from_prior()
        self.transform_x_to_z = theano.function([self.q_z_given_x.Xd], \
                outputs=self.q_z_given_x.output_mean)
        self.transform_z_to_x = theano.function([self.p_x_given_z.Xd], \
                outputs=self.xt_transform(self.p_x_given_z.output_mean))
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_weights = self.p_x_given_z.mu_layers[-1].W
        return
Пример #5
0
    def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
                 i_net=None, g_net=None, d_net=None, chain_len=None, \
                 data_dim=None, prior_dim=None, params=None):
        # Do some stuff!
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        self.prior_mean = 0.0
        self.prior_logvar = 0.0
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'cost_decay' in self.params:
            self.cost_decay = self.params['cost_decay']
        else:
            self.cost_decay = 0.1
        if 'chain_type' in self.params:
            assert((self.params['chain_type'] == 'walkback') or \
                (self.params['chain_type'] == 'walkout'))
            self.chain_type = self.params['chain_type']
        else:
            self.chain_type = 'walkout'
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        self.x_type = self.params['x_type']
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # symbolic var for inputting samples for initializing the VAE chain
        self.Xd = Xd
        # symbolic var for masking subsets of the state variables
        self.Xm = Xm
        # symbolic var for controlling subsets of the state variables
        self.Xc = Xc
        # symbolic var for inputting samples from the target distribution
        self.Xt = Xt
        # integer number of times to cycle the VAE loop
        self.chain_len = chain_len

        # symbolic matrix of indices for data inputs
        self.It = T.arange(self.Xt.shape[0])
        # symbolic matrix of indices for noise/generated inputs
        self.Id = T.arange(
            self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
                p_x_given_z=g_net, q_z_given_x=i_net, x_dim=self.data_dim, \
                z_dim=self.prior_dim, params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar
        # self-loop some clones of the main VAE into a chain.
        # ** All VAEs in the chain share the same Xc and Xm, which are the
        #    symbolic inputs for providing the observed portion of the input
        #    and a mask indicating which part of the input is "observed".
        #    These inputs are used for training "reconstruction" policies.
        self.IN_chain = []
        self.GN_chain = []
        self.Xg_chain = []
        _Xd = self.Xd
        print("Unrolling chain...")
        for i in range(self.chain_len):
            # create a VAE infer/generate pair with _Xd as input and with
            # masking variables shared by all VAEs in this chain
            _IN = self.IN.shared_param_clone(rng=rng, \
                    Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm), \
                    build_funcs=False)
            _GN = self.GN.shared_param_clone(rng=rng, Xd=_IN.output, \
                    build_funcs=False)
            _Xd = self.xt_transform(_GN.output_mean)
            self.IN_chain.append(_IN)
            self.GN_chain.append(_GN)
            self.Xg_chain.append(_Xd)
            print("    step {}...".format(i))

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain.
        self.DN = d_net.shared_param_clone(rng=rng, \
                Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary,
                                           name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary,
                                           name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='vcg_it_count')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()

        self.set_disc_weights()  # init adversarial cost weights for GN/DN
        # set a shared var for regularizing the output of the discriminator
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the VCGair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this VCGair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.in_params.append(self.OSM.output_logvar)
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat indepedently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
                self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # discriminator networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        print("Computing VCGLoop DN cost gradients...")
        grad_list = T.grad(self.dn_cost,
                           self.dn_params,
                           disconnected_inputs='warn')
        for i, p in enumerate(self.dn_params):
            self.joint_grads[p] = grad_list[i]
        print("Computing VCGLoop IN cost gradients...")
        grad_list = T.grad(self.osm_cost,
                           self.in_params,
                           disconnected_inputs='warn')
        for i, p in enumerate(self.in_params):
            self.joint_grads[p] = grad_list[i]
        print("Computing VCGLoop GN cost gradients...")
        grad_list = T.grad(self.osm_cost,
                           self.gn_params,
                           disconnected_inputs='warn')
        for i, p in enumerate(self.gn_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the discriminator, generator and
        # inferencer networks. all networks share the same first/second
        # moment momentum and iteration count. the networks each have their
        # own learning rates, which lets you turn their learning on/off.
        self.dn_updates = get_param_updates(params=self.dn_params, \
                grads=self.joint_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.gn_updates = get_param_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.in_updates = get_param_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

        # bag up all the updates required for training
        self.joint_updates = OrderedDict()
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        # construct an update for tracking the mean KL divergence of
        # approximate posteriors for this chain
        new_kld_mean = (0.98 * self.IN.kld_mean) + ((0.02 / self.chain_len) * \
            sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain]))
        self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

        # construct the function for training on training data
        print("Compiling VCGLoop theano functions....")
        self.train_joint = self._construct_train_joint()
        return
Пример #6
0
    def __init__(self, rng=None, \
            Xd=None, Xc=None, Xm=None, \
            p_x_given_z=None, q_z_given_x=None, \
            x_dim=None, z_dim=None, \
            params=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # record the dimensions of various spaces relevant to this model
        self.z_dim = z_dim
        self.x_dim = x_dim

        # set parameters for the isotropic Gaussian prior over z
        self.prior_mean = 0.0
        self.prior_logvar = 0.0

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this OneStageModel
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        self.batch_reps = T.lscalar()
        self.x = apply_mask(self.Xd, self.Xc, self.Xm)

        #####################################################################
        # Setup the computation graph that provides values in our objective #
        #####################################################################
        # inferencer model for latent prototypes given instances
        self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
        self.z = self.q_z_given_x.output
        self.z_mean = self.q_z_given_x.output_mean
        self.z_logvar = self.q_z_given_x.output_logvar
        # generator model for prototypes given latent prototypes
        self.p_x_given_z = p_x_given_z.shared_param_clone(rng=rng, Xd=self.z)
        self.xt = self.p_x_given_z.output_mean # use deterministic output
        # construct the final output of generator, conditioned on z
        if self.x_type == 'bernoulli':
            self.xg = T.nnet.sigmoid(self.xt)
        else:
            self.xg = self.xt_transform(self.xt)

        # self.output_logvar modifies the output distribution
        self.output_logvar = self.p_x_given_z.sigma_layers[-1].b
        self.bounded_logvar = self.logvar_bound * \
                    T.tanh(self.output_logvar[0] / self.logvar_bound)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lr_1 = theano.shared(value=zero_ary, name='osm_lr_1')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='osm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='osm_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='osm_it_count')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='osm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_1 = theano.shared(value=zero_ary, name='osm_lam_kld_1')
        self.lam_kld_2 = theano.shared(value=zero_ary, name='osm_lam_kld_2')
        self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='osm_lam_l2w')
        self.set_lam_l2w(1e-4)

        # Grab all of the "optimizable" parameters in "group 1"
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.p_x_given_z.mlp_params)
        # Make a joint list of parameters
        self.joint_params = self.group_1_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
        self.nll_cost = T.mean(self.nll_costs)
        self.kld_costs_1, self.kld_costs_2 = self._construct_kld_costs()
        self.kld_costs = (self.lam_kld_1[0] * self.kld_costs_1) + \
                (self.lam_kld_2[0] * self.kld_costs_2)
        self.kld_cost = T.mean(self.kld_costs)
        act_reg_cost, param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing OneStageModel cost gradients...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_param_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

        # Construct a function for jointly training the generator/inferencer
        print("Compiling OneStageModel theano functions...")
        self.train_joint = self._construct_train_joint()
        self.compute_fe_terms = self._construct_compute_fe_terms()
        self.compute_post_klds = self._construct_compute_post_klds()
        self.sample_from_prior = self._construct_sample_from_prior()
        self.transform_x_to_z = theano.function([self.q_z_given_x.Xd], \
                outputs=self.q_z_given_x.output_mean)
        self.transform_z_to_x = theano.function([self.p_x_given_z.Xd], \
                outputs=self.xt_transform(self.p_x_given_z.output_mean))
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_weights = self.p_x_given_z.mu_layers[-1].W
        return
Пример #7
0
    def __init__(self, rng=None, Xd=None, \
                 i_net=None, g_net=None, chain_len=None, \
                 data_dim=None, prior_dim=None):
        # Do some stuff!
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.prior_dim = prior_dim

        # symbolic var for inputting samples for initializing the VAE chain
        self.Xd = Xd
        # symbolic var for masking subsets of the state variables
        self.Xm = T.zeros_like(self.Xd)
        # symbolic var for controlling subsets of the state variables
        self.Xc = T.zeros_like(self.Xd)
        # integer number of times to cycle the VAE loop
        self.chain_len = chain_len

        # get a clone of the desired VAE, for easy access
        self.GIP = GIPair(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
                g_net=g_net, i_net=i_net, data_dim=self.data_dim, \
                prior_dim=self.prior_dim, params=None, shared_param_dicts=None)
        self.IN = self.GIP.IN
        self.GN = self.GIP.GN
        self.use_encoder = self.IN.use_encoder
        assert(self.use_encoder == self.GN.use_decoder)
        # self-loop some clones of the main VAE into a chain.
        # ** All VAEs in the chain share the same Xc and Xm, which are the
        #    symbolic inputs for providing the observed portion of the input
        #    and a mask indicating which part of the input is "observed".
        #    These inputs are used for training "reconstruction" policies.
        self.IN_chain = []
        self.GN_chain = []
        self.Xg_chain = []
        _Xd = self.Xd
        for i in range(self.chain_len):
            if (i == 0):
                # start the chain with data provided by used
                _IN = self.IN.shared_param_clone(rng=rng, \
                        Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm))
                _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output)
            else:
                # continue the chain with samples from previous VAE
                _IN = self.IN.shared_param_clone(rng=rng, \
                        Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm))
                _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output)
            if self.use_encoder:
                # use the "decoded" output of the previous generator as input
                # to the next inferencer, which will re-encode it prior to
                # inference
                _Xd = _GN.output_decoded
            else:
                # use the "encoded" output of the previous generator as input
                # to the next inferencer, as the inferencer won't try to 
                # re-encode it prior to inference
                _Xd = _GN.output
            self.IN_chain.append(_IN)
            self.GN_chain.append(_GN)
            self.Xg_chain.append(_Xd)

        # construct the function for training on training data
        self.sample_from_chain = self._construct_sample_from_chain()
        return
Пример #8
0
    def __init__(self, rng=None, \
            Xd=None, Xc=None, Xm=None, \
            g_net=None, i_net=None, \
            data_dim=None, prior_dim=None, \
            g_net_2=None, i_net_2=None, \
            prior_dim_2=None, \
            params=None, shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))
        if params is None:
            self.params = {}
        else:
            self.params = params

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this GIPair
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        # check whether we'll be working with "encoded" inputs
        self.use_encoder = i_net.use_encoder
        print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format( \
                str(i_net.use_encoder), str(g_net.use_decoder)))
        assert(self.use_encoder == g_net.use_decoder)
        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=apply_mask(Xd=self.Xd, Xc=self.Xc, Xm=self.Xm))
        self.posterior_means = self.IN.output_mean
        self.posterior_sigmas = self.IN.output_sigma
        self.posterior_norms = T.sqrt(T.sum(self.posterior_means**2.0, axis=1, keepdims=1))
        self.posterior_klds = self.IN.kld_cost
        self.kld2_scale = self.IN.kld2_scale
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        # capture a handle for sampled reconstructions from the generator
        self.Xg = self.GN.output

        # construct a second GIPair stacked on top of the first GIPair, which
        # learns to model the posterior samples emitted by the inferencer in
        # the first GIPair
        self.IN2 = i_net_2.shared_param_clone(rng=rng, Xd=apply_mask(Xd=self.Xp, \
                Xc=T.zeros_like(self.Xp), Xm=T.zeros_like(self.Xp)))
        # capture a handle for samples from the top's variational posterior
        self.Xp2 = self.IN2.output
        # feed these variational posterior samples into the top's generator
        self.GN2 = g_net_2.shared_param_clone(rng=rng, Xp=self.Xp2)
        # capture a handle for sampled (latent) reconstructions from GN2
        self.Xg2 = self.GN2.output

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        self.prior_dim_2 = prior_dim_2
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim_2 == self.GN2.mlp_layers[0].in_dim)
        assert(self.prior_dim_2 == self.IN2.mu_layers[-1].out_dim)
        assert(self.prior_dim_2 == self.IN2.sigma_layers[-1].out_dim)

        # determine whether this GIPair is a clone or an original
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        if not self.is_clone:
            # shared var learning rate for generator and inferencer
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.lr_gn = theano.shared(value=zero_ary, name='gip_lr_gn')
            self.lr_in = theano.shared(value=zero_ary, name='gip_lr_in')
            # shared var momentum parameters for generator and inferencer
            self.mom_1 = theano.shared(value=zero_ary, name='gip_mom_1')
            self.mom_2 = theano.shared(value=zero_ary, name='gip_mom_2')
            self.it_count_bot = theano.shared(value=zero_ary, name='gip_it_count_bot')
            self.it_count_top = theano.shared(value=zero_ary, name='gip_it_count_top')
            self.it_count_joint = theano.shared(value=zero_ary, name='gip_it_count_joint')
            # init parameters for controlling learning dynamics
            self.set_all_sgd_params()
            # init shared var for weighting nll of data given posterior sample
            self.lam_nll = theano.shared(value=zero_ary, name='gip_lam_nll')
            self.set_lam_nll(lam_nll=1.0)
            # init shared var for weighting prior kld against reconstruction
            self.lam_kld = theano.shared(value=zero_ary, name='gip_lam_kld')
            self.set_lam_kld(lam_kld=1.0)
            # init shared var for controlling l2 regularization on params
            self.lam_l2w = theano.shared(value=zero_ary, name='gip_lam_l2w')
            self.set_lam_l2w(1e-4)
            # record shared parameters that are to be shared among clones
            self.shared_param_dicts['gip_lr_gn'] = self.lr_gn
            self.shared_param_dicts['gip_lr_in'] = self.lr_in
            self.shared_param_dicts['gip_mom_1'] = self.mom_1
            self.shared_param_dicts['gip_mom_2'] = self.mom_2
            self.shared_param_dicts['gip_it_count_bot'] = self.it_count_bot
            self.shared_param_dicts['gip_it_count_top'] = self.it_count_top
            self.shared_param_dicts['gip_it_count_joint'] = self.it_count_joint
            self.shared_param_dicts['gip_lam_nll'] = self.lam_nll
            self.shared_param_dicts['gip_lam_kld'] = self.lam_kld
            self.shared_param_dicts['gip_lam_l2w'] = self.lam_l2w
        else:
            # use some shared parameters that are shared among all clones of
            # some "base" GIPair
            self.lr_gn = self.shared_param_dicts['gip_lr_gn']
            self.lr_in = self.shared_param_dicts['gip_lr_in']
            self.mom_1 = self.shared_param_dicts['gip_mom_1']
            self.mom_2 = self.shared_param_dicts['gip_mom_2']
            self.it_count_bot = self.shared_param_dicts['gip_it_count_bot']
            self.it_count_top = self.shared_param_dicts['gip_it_count_top']
            self.it_count_joint = self.shared_param_dicts['gip_it_count_joint']
            self.lam_nll = self.shared_param_dicts['gip_lam_nll']
            self.lam_kld = self.shared_param_dicts['gip_lam_kld']
            self.lam_l2w = self.shared_param_dicts['gip_lam_l2w']

        # grab the optimizable parameters in the bottom GIPair
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.bot_params = self.in_params + self.gn_params
        # grab the optimizable parameters in the top GIPair
        self.in2_params = [p for p in self.IN2.mlp_params]
        self.gn2_params = [p for p in self.GN2.mlp_params]
        self.top_params = self.in2_params + self.gn2_params
        # get the optimizable parameters of bottom + top GIPair
        self.joint_params = self.top_params + self.bot_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.data_nll_cost_bot = self.lam_nll[0] * \
                self._construct_data_nll_cost(which_gip='bot')
        self.data_nll_cost_top = self.lam_nll[0] * \
                self._construct_data_nll_cost(which_gip='top')
        self.post_kld_cost_bot = self.lam_kld[0] * \
                self._construct_post_kld_cost(which_gip='bot', kld2_scale=self.kld2_scale)
        self.post_kld_cost_top = self.lam_kld[0] * \
                self._construct_post_kld_cost(which_gip='top', kld2_scale=self.kld2_scale)
        self.other_reg_cost_bot = \
                self._construct_other_reg_cost(which_gip='bot')
        self.other_reg_cost_top = \
                self._construct_other_reg_cost(which_gip='top')
        # summed costs for bottom, top, and joint objectives
        self.bot_cost = self.data_nll_cost_bot + self.post_kld_cost_bot + \
                self.other_reg_cost_bot
        self.top_cost = self.data_nll_cost_top + self.post_kld_cost_top + \
                self.other_reg_cost_top
        self.joint_cost = self.bot_cost + self.top_cost

        #########################################
        # CONSTRUCT THE GRADIENTS FOR THE COSTS #
        #########################################
        self.bot_grads = OrderedDict()
        for p in self.bot_params:
            self.bot_grads[p] = T.grad(self.bot_cost, p).clip(-0.1, 0.1)
        # Get the gradient of the top cost for all relevant parameters
        self.top_grads = OrderedDict()
        for p in self.top_params:
            self.top_grads[p] = T.grad(self.top_cost, p).clip(-0.1, 0.1)
        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.1, 0.1)

        #######################################
        # CONSTRUCT THE UPDATES FOR THE COSTS #
        #######################################
        # construct updates for the bottom GIPair, for the bottom cost
        self.gn_updates_bot = get_adam_updates(params=self.gn_params, \
                grads=self.bot_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_bot, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates_bot = get_adam_updates(params=self.in_params, \
                grads=self.bot_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_bot, \
                mom2_init=1e-3, smoothing=1e-8)
        # construct updates for the top GIPair, for the top cost
        self.gn2_updates_top = get_adam_updates(params=self.gn2_params, \
                grads=self.top_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_top, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in2_updates_top = get_adam_updates(params=self.in2_params, \
                grads=self.top_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_top, \
                mom2_init=1e-3, smoothing=1e-8)
        # construct updates for the bottom GIPair, for the joint cost
        self.gn_updates_joint = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_joint, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates_joint = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_joint, \
                mom2_init=1e-3, smoothing=1e-8)
        # construct updates for the top GIPair, for the joint cost
        self.gn2_updates_joint = get_adam_updates(params=self.gn2_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_joint, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in2_updates_joint = get_adam_updates(params=self.in2_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                it_count=self.it_count_joint, \
                mom2_init=1e-3, smoothing=1e-8)


        # Merge the bottom updates for easier application
        self.bot_updates = OrderedDict()
        for k in self.gn_updates_bot:
            self.bot_updates[k] = self.gn_updates_bot[k]
        for k in self.in_updates_bot:
            self.bot_updates[k] = self.in_updates_bot[k]
        self.bot_updates[self.IN.kld_mean] = self.IN.kld_mean_update
        # Merge the top updates for easier application
        self.top_updates = OrderedDict()
        for k in self.gn2_updates_top:
            self.top_updates[k] = self.gn2_updates_top[k]
        for k in self.in2_updates_top:
            self.top_updates[k] = self.in2_updates_top[k]
        self.top_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update
        # Merge the joint updates for easier application
        self.joint_updates = OrderedDict()
        for k in self.gn_updates_joint:
            self.joint_updates[k] = self.gn_updates_joint[k]
        for k in self.in_updates_joint:
            self.joint_updates[k] = self.in_updates_joint[k]
        for k in self.gn2_updates_joint:
            self.joint_updates[k] = self.gn2_updates_joint[k]
        for k in self.in2_updates_joint:
            self.joint_updates[k] = self.in2_updates_joint[k]
        self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update
        self.joint_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update
        # Construct a function for jointly training the generator/inferencer
        self.train_bot = self._construct_train_bot()
        self.train_top = self._construct_train_top()
        self.train_joint = self._construct_train_joint()
        self.compute_costs = self._construct_compute_costs()
        return
Пример #9
0
    def __init__(self, rng=None, \
            Xd=None, Xc=None, Xm=None, \
            g_net=None, i_net=None, \
            data_dim=None, prior_dim=None, \
            params=None, shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))
        if params is None:
            self.params = {}
        else:
            self.params = params

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this GIPair
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        # check whether we'll be working with "encoded" inputs
        self.use_encoder = i_net.use_encoder
        print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format( \
                str(i_net.use_encoder), str(g_net.use_decoder)))
        assert(self.use_encoder == g_net.use_decoder)
        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=apply_mask(self.Xd, self.Xc, self.Xm))
        self.posterior_means = self.IN.output_mean
        self.posterior_sigmas = self.IN.output_sigma
        self.posterior_norms = T.sqrt(T.sum(self.posterior_means**2.0, axis=1, keepdims=1))
        self.posterior_klds = self.IN.kld_cost
        self.kld2_scale = self.IN.kld2_scale
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        # capture a handle for sampled reconstructions from the generator
        self.Xg = self.GN.output

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

        # determine whether this GIPair is a clone or an original
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True

        if not self.is_clone:
            # shared var learning rate for generator and inferencer
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.lr_gn = theano.shared(value=zero_ary, name='gip_lr_gn')
            self.lr_in = theano.shared(value=zero_ary, name='gip_lr_in')
            # shared var momentum parameters for generator and inferencer
            self.mom_1 = theano.shared(value=zero_ary, name='gip_mom_1')
            self.mom_2 = theano.shared(value=zero_ary, name='gip_mom_2')
            self.it_count = theano.shared(value=zero_ary, name='gip_it_count')
            # init parameters for controlling learning dynamics
            self.set_all_sgd_params()
            # init shared var for weighting nll of data given posterior sample
            self.lam_nll = theano.shared(value=zero_ary, name='gip_lam_nll')
            self.set_lam_nll(lam_nll=1.0)
            # init shared var for weighting prior kld against reconstruction
            self.lam_kld = theano.shared(value=zero_ary, name='gip_lam_kld')
            self.set_lam_kld(lam_kld=1.0)
            # init shared var for controlling l2 regularization on params
            self.lam_l2w = theano.shared(value=zero_ary, name='gip_lam_l2w')
            self.set_lam_l2w(1e-4)
            # record shared parameters that are to be shared among clones
            self.shared_param_dicts['gip_lr_gn'] = self.lr_gn
            self.shared_param_dicts['gip_lr_in'] = self.lr_in
            self.shared_param_dicts['gip_mom_1'] = self.mom_1
            self.shared_param_dicts['gip_mom_2'] = self.mom_2
            self.shared_param_dicts['gip_it_count'] = self.it_count
            self.shared_param_dicts['gip_lam_nll'] = self.lam_nll
            self.shared_param_dicts['gip_lam_kld'] = self.lam_kld
            self.shared_param_dicts['gip_lam_l2w'] = self.lam_l2w
        else:
            # use some shared parameters that are shared among all clones of
            # some "base" GIPair
            self.lr_gn = self.shared_param_dicts['gip_lr_gn']
            self.lr_in = self.shared_param_dicts['gip_lr_in']
            self.mom_1 = self.shared_param_dicts['gip_mom_1']
            self.mom_2 = self.shared_param_dicts['gip_mom_2']
            self.it_count = self.shared_param_dicts['gip_it_count']
            self.lam_nll = self.shared_param_dicts['gip_lam_nll']
            self.lam_kld = self.shared_param_dicts['gip_lam_kld']
            self.lam_l2w = self.shared_param_dicts['gip_lam_l2w']

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
        self.post_kld_cost = self.lam_kld[0] * \
                self._construct_post_kld_cost(kld2_scale=self.kld2_scale)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
                self.other_reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p)

        # Construct the updates for the generator and inferencer networks
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update

        # Construct a function for jointly training the generator/inferencer
        self.train_joint = self._construct_train_joint()
        self.compute_costs = self._construct_compute_costs()
        self.compute_ll_bound = self._construct_compute_ll_bound()
        self.compute_post_stats = self._construct_compute_post_stats()
        return