def sample_from_chain(self, X_d, X_c=None, X_m=None, loop_iters=5,
                      sigma_scale=None):
    """
    Sample for several rounds through the I<->G loop, initialized with the
    "data variable" samples in X_d.

    Parameters:
        X_d: initial data-space samples that seed the loop.
        X_c: control values passed to apply_mask; defaults to 0.0 * X_d.
        X_m: mask values passed to apply_mask; defaults to 0.0 * X_d.
        loop_iters: number of infer -> generate rounds to run.
        sigma_scale: value installed on the inference network via
            set_sigma_scale() while sampling; defaults to 1.0.

    Returns:
        dict with keys "data samples" and "prior samples", each a list with
        one entry per loop iteration.

    The inference network's previous sigma_scale is restored before this
    method exits, including when sampling raises part-way through.
    """
    data_samples = []
    prior_samples = []
    if X_c is None:
        X_c = 0.0 * X_d
    if X_m is None:
        X_m = 0.0 * X_d
    if sigma_scale is None:
        sigma_scale = 1.0
    inf_net = self.q_z_given_x
    # remember the current sigma_scale so it can be put back afterwards
    old_scale = inf_net.sigma_scale.get_value(borrow=False)
    inf_net.set_sigma_scale(sigma_scale)
    try:
        for _ in range(loop_iters):
            # apply mask, mixing foreground and background data
            X_d = apply_mask(Xd=X_d, Xc=X_c, Xm=X_m)
            # record the data samples for this iteration (1.0 * x makes a
            # fresh value rather than storing a reference to X_d itself)
            data_samples.append(1.0 * X_d)
            # sample from their inferred posteriors
            X_p = inf_net.sample_posterior(X_d)
            # record the sampled points (in the "prior space")
            prior_samples.append(1.0 * X_p)
            # get next data samples by transforming the prior-space points
            X_d = self.transform_z_to_x(X_p)
    finally:
        # always reset sigma_scale, so a failure mid-chain cannot leave the
        # inference network stuck with the temporary sampling scale
        inf_net.set_sigma_scale(old_scale[0])
    return {"data samples": data_samples, "prior samples": prior_samples}
def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None,
             i_net=None, g_net=None, d_net=None, chain_len=None,
             data_dim=None, prior_dim=None, params=None):
    """
    Build the symbolic graph, costs, updates and training function for a
    self-looped VAE chain that is trained against a discriminator.

    Parameters:
        rng: random source; must provide randint(). It seeds this object's
            RandStream and is passed on to every network clone.
        Xd: symbolic var for inputting samples that initialize the chain.
        Xc: symbolic var for controlling subsets of the state variables.
        Xm: symbolic var for masking subsets of the state variables.
        Xt: symbolic var for inputting samples from the target distribution.
        i_net: inferencer network, handed to OneStageModel as q_z_given_x.
        g_net: generator network, handed to OneStageModel as p_x_given_z.
        d_net: discriminator network; cloned with Xt stacked on top of the
            chain outputs as its input.
        chain_len: integer number of times to cycle the VAE loop.
        data_dim: forwarded to OneStageModel as x_dim.
        prior_dim: forwarded to OneStageModel as z_dim.
        params: dict of options. 'x_type' ('bernoulli' or 'gaussian') and
            'lam_l2d' must be present. Optional keys: 'cost_decay'
            (default 0.1), 'chain_type' ('walkback' or 'walkout', default
            'walkout'), 'xt_transform' ('sigmoid' or 'none', default
            sigmoid) and 'logvar_bound' (default 10).
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # ---- user-provided options, falling back to defaults ----
    self.params = {} if params is None else params
    self.cost_decay = self.params.get('cost_decay', 0.1)
    chain_type = self.params.get('chain_type', 'walkout')
    assert chain_type in ('walkback', 'walkout')
    self.chain_type = chain_type
    xt_mode = self.params.get('xt_transform', 'sigmoid')
    assert xt_mode in ('sigmoid', 'none')
    if xt_mode == 'sigmoid':
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x
    self.logvar_bound = self.params.get('logvar_bound', 10)
    # x_type selects a bernoulli or gaussian model for the observations;
    # it has no default, so a missing key raises KeyError here
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # ---- symbolic inputs ----
    self.Xd = Xd
    self.Xm = Xm
    self.Xc = Xc
    self.Xt = Xt
    self.chain_len = chain_len
    # indices for the target-distribution rows ...
    self.It = T.arange(self.Xt.shape[0])
    # ... and for the chain_len * batch generated rows that follow them
    self.Id = T.arange(
        self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

    # ---- base VAE, kept around for easy access to its parts ----
    self.OSM = OneStageModel(
        rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm,
        p_x_given_z=g_net, q_z_given_x=i_net,
        x_dim=self.data_dim, z_dim=self.prior_dim, params=self.params)
    base_model = self.OSM
    self.IN = base_model.q_z_given_x
    self.GN = base_model.p_x_given_z
    self.transform_x_to_z = base_model.transform_x_to_z
    self.transform_z_to_x = base_model.transform_z_to_x
    self.bounded_logvar = base_model.bounded_logvar

    # ---- unroll shared-parameter clones of the VAE into a chain ----
    # Every step uses the same Xc and Xm: the symbolic inputs providing the
    # observed portion of the input and the mask marking which part of the
    # input is "observed".
    self.IN_chain = []
    self.GN_chain = []
    self.Xg_chain = []
    chain_input = self.Xd
    print("Unrolling chain...")
    for step in range(self.chain_len):
        masked_input = apply_mask(Xd=chain_input, Xc=self.Xc, Xm=self.Xm)
        inf_clone = self.IN.shared_param_clone(
            rng=rng, Xd=masked_input, build_funcs=False)
        gen_clone = self.GN.shared_param_clone(
            rng=rng, Xd=inf_clone.output, build_funcs=False)
        # this step's generated output feeds the next step of the chain
        chain_input = self.xt_transform(gen_clone.output_mean)
        self.IN_chain.append(inf_clone)
        self.GN_chain.append(gen_clone)
        self.Xg_chain.append(chain_input)
        print(" step {}...".format(step))

    # ---- discriminator over [target samples; chain samples] ----
    self.DN = d_net.shared_param_clone(
        rng=rng, Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

    # ---- shared variables for cost weights and optimiser settings ----
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)

    def zero_shared(name):
        # shared variable initialised from zero_ary under the given name
        return theano.shared(value=zero_ary, name=name)

    # weight on nll of data given posterior sample
    self.lam_chain_nll = zero_shared('vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # weight on posterior KL-div from prior
    self.lam_chain_kld = zero_shared('vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # weight on l2 regularization of params
    self.lam_l2w = zero_shared('vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # one learning rate per network
    self.lr_dn = zero_shared('vcg_lr_dn')
    self.lr_gn = zero_shared('vcg_lr_gn')
    self.lr_in = zero_shared('vcg_lr_in')
    # momentum parameters and iteration counter, common to all networks
    self.mom_1 = zero_shared('vcg_mom_1')
    self.mom_2 = zero_shared('vcg_mom_2')
    self.it_count = zero_shared('vcg_it_count')
    # weights for the adversarial classification objective
    self.dw_dn = zero_shared('vcg_dw_dn')
    self.dw_gn = zero_shared('vcg_dw_gn')
    # fill in the learning-dynamics and adversarial-weight values
    self.set_all_sgd_params()
    self.set_disc_weights()
    # weight for regularizing the output of the discriminator
    self.lam_l2d = theano.shared(
        value=(zero_ary + params['lam_l2d']), name='vcg_lam_l2d')

    # ---- collect the optimizable parameters ----
    # The final layer of each discriminator proto-net is left out: those
    # layers are bypassed in favor of the binary classification layers
    # built by _construct_disc_layers() below.
    self.dn_params = []
    for proto_net in self.DN.proto_nets:
        for layer in proto_net[:-1]:
            self.dn_params.extend(layer.params)
    self.in_params = list(self.IN.mlp_params)
    self.in_params.append(self.OSM.output_logvar)
    self.gn_params = list(self.GN.mlp_params)
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # ---- discriminator layers and adversarial costs ----
    self._construct_disc_layers(rng)
    disc_act_l2 = T.sum([dl.act_l2_sum for dl in self.disc_layers])
    self.disc_reg_cost = self.lam_l2d[0] * disc_act_l2
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # cost optimized by the discriminator network
    self.dn_cost = (self.disc_cost_dn + self.DN.act_reg_cost +
                    self.disc_reg_cost)

    # costs optimized by the generator and inferencer networks
    self.chain_nll_cost = self.lam_chain_nll[0] * \
        self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
        self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = (self.disc_cost_gn + self.chain_nll_cost +
                     self.chain_kld_cost + self.other_reg_cost)

    # total cost on the discriminator and generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    # ---- gradients: dn_cost for DN params, osm_cost for IN/GN params ----
    self.joint_grads = OrderedDict()
    print("Computing VCGLoop DN cost gradients...")
    dn_grads = T.grad(self.dn_cost, self.dn_params,
                      disconnected_inputs='warn')
    for param, grad in zip(self.dn_params, dn_grads):
        self.joint_grads[param] = grad
    print("Computing VCGLoop IN cost gradients...")
    in_grads = T.grad(self.osm_cost, self.in_params,
                      disconnected_inputs='warn')
    for param, grad in zip(self.in_params, in_grads):
        self.joint_grads[param] = grad
    print("Computing VCGLoop GN cost gradients...")
    gn_grads = T.grad(self.osm_cost, self.gn_params,
                      disconnected_inputs='warn')
    for param, grad in zip(self.gn_params, gn_grads):
        self.joint_grads[param] = grad

    # ---- parameter updates ----
    # All networks share the momentum terms and iteration count; each has
    # its own learning rate, which lets its learning be turned on/off.
    common_update_args = dict(
        grads=self.joint_grads,
        beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
        mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.dn_updates = get_param_updates(
        params=self.dn_params, alpha=self.lr_dn, **common_update_args)
    self.gn_updates = get_param_updates(
        params=self.gn_params, alpha=self.lr_gn, **common_update_args)
    self.in_updates = get_param_updates(
        params=self.in_params, alpha=self.lr_in, **common_update_args)

    # bag up all the updates required for training (later entries win)
    self.joint_updates = OrderedDict()
    for net_updates in (self.dn_updates, self.gn_updates, self.in_updates):
        for key in net_updates:
            self.joint_updates[key] = net_updates[key]

    # running mean (0.98 old / 0.02 new) of the chain-averaged KL
    # divergence of the approximate posteriors
    chain_kld_sum = sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain])
    new_kld_mean = (0.98 * self.IN.kld_mean) + \
        ((0.02 / self.chain_len) * chain_kld_sum)
    self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

    # ---- compile the training function ----
    print("Compiling VCGLoop theano functions....")
    self.train_joint = self._construct_train_joint()
def __init__(self, rng=None,
             Xd=None, Xc=None, Xm=None,
             p_x_given_z=None, q_z_given_x=None,
             x_dim=None, z_dim=None,
             params=None):
    """
    Build the symbolic graph, costs, updates and compiled functions for a
    single inferencer/generator pair.

    Parameters:
        rng: random source; must provide randint(). It seeds this object's
            RandStream and is passed on to both network clones.
        Xd, Xc, Xm: symbolic inputs; they are combined by apply_mask() to
            form the inferencer's input.
        p_x_given_z: generator network, cloned with the latent sample as
            its input.
        q_z_given_x: inferencer network, cloned with the masked input as
            its input.
        x_dim, z_dim: recorded dimensions of the data and latent spaces.
        params: dict of options. 'x_type' ('bernoulli' or 'gaussian') must
            be present. Optional keys: 'xt_transform' ('sigmoid' or
            'none', default sigmoid) and 'logvar_bound' (default 10).
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))

    # ---- user-provided options, falling back to defaults ----
    self.params = {} if params is None else params
    xt_mode = self.params.get('xt_transform', 'sigmoid')
    assert xt_mode in ('sigmoid', 'none')
    if xt_mode == 'sigmoid':
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x
    self.logvar_bound = self.params.get('logvar_bound', 10)
    # x_type selects a bernoulli or gaussian model for the observations;
    # it has no default, so a missing key raises KeyError here
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # dimensions of the spaces relevant to this model
    self.z_dim = z_dim
    self.x_dim = x_dim
    # parameters for the isotropic Gaussian prior over z
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # ---- symbolic inputs to the computation graph ----
    self.Xd = Xd
    self.Xc = Xc
    self.Xm = Xm
    self.batch_reps = T.lscalar()
    self.x = apply_mask(self.Xd, self.Xc, self.Xm)

    # ---- inferencer: masked input -> latent sample ----
    self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
    inferencer = self.q_z_given_x
    self.z = inferencer.output
    self.z_mean = inferencer.output_mean
    self.z_logvar = inferencer.output_logvar

    # ---- generator: latent sample -> data space ----
    self.p_x_given_z = p_x_given_z.shared_param_clone(rng=rng, Xd=self.z)
    generator = self.p_x_given_z
    # use the generator's deterministic (mean) output
    self.xt = generator.output_mean
    # final generator output: bernoulli always goes through a sigmoid,
    # otherwise the configured xt_transform is applied
    if self.x_type == 'bernoulli':
        self.xg = T.nnet.sigmoid(self.xt)
    else:
        self.xg = self.xt_transform(self.xt)
    # output_logvar is the bias of the generator's last sigma layer; its
    # first entry is squashed into (-logvar_bound, logvar_bound) by tanh
    self.output_logvar = generator.sigma_layers[-1].b
    self.bounded_logvar = self.logvar_bound * \
        T.tanh(self.output_logvar[0] / self.logvar_bound)

    # ---- shared variables for optimiser settings and cost weights ----
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)

    def zero_shared(name):
        # shared variable initialised from zero_ary under the given name
        return theano.shared(value=zero_ary, name=name)

    # learning rate for generator and inferencer
    self.lr_1 = zero_shared('osm_lr_1')
    # momentum parameters and iteration counter
    self.mom_1 = zero_shared('osm_mom_1')
    self.mom_2 = zero_shared('osm_mom_2')
    self.it_count = zero_shared('osm_it_count')
    # fill in the learning-dynamics values
    self.set_sgd_params()
    # weight on nll of data given posterior sample
    self.lam_nll = zero_shared('osm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL terms
    self.lam_kld_1 = zero_shared('osm_lam_kld_1')
    self.lam_kld_2 = zero_shared('osm_lam_kld_2')
    self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
    # weight on l2 regularization of params
    self.lam_l2w = zero_shared('osm_lam_l2w')
    self.set_lam_l2w(1e-4)

    # ---- optimizable parameters ("group 1") ----
    self.group_1_params = (list(inferencer.mlp_params) +
                           list(generator.mlp_params))
    # joint_params is deliberately the same list object as group_1_params
    self.joint_params = self.group_1_params

    # ---- costs ----
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
    self.nll_cost = T.mean(self.nll_costs)
    self.kld_costs_1, self.kld_costs_2 = self._construct_kld_costs()
    weighted_kld_1 = self.lam_kld_1[0] * self.kld_costs_1
    weighted_kld_2 = self.lam_kld_2[0] * self.kld_costs_2
    self.kld_costs = weighted_kld_1 + weighted_kld_2
    self.kld_cost = T.mean(self.kld_costs)
    # only the parameter regularizer enters the objective here
    _, param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # ---- gradients of the joint cost ----
    print("Computing OneStageModel cost gradients...")
    self.joint_grads = OrderedDict()
    all_grads = T.grad(self.joint_cost, self.joint_params)
    for param, grad in zip(self.joint_params, all_grads):
        self.joint_grads[param] = grad

    # ---- updates for the generator and inferencer networks ----
    self.joint_updates = get_param_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr_1, beta1=self.mom_1, beta2=self.mom_2,
        it_count=self.it_count, mom2_init=1e-3, smoothing=1e-8,
        max_grad_norm=10.0)

    # ---- compiled functions ----
    print("Compiling OneStageModel theano functions...")
    self.train_joint = self._construct_train_joint()
    self.compute_fe_terms = self._construct_compute_fe_terms()
    self.compute_post_klds = self._construct_compute_post_klds()
    self.sample_from_prior = self._construct_sample_from_prior()
    # inferencer input -> posterior mean
    self.transform_x_to_z = theano.function(
        [inferencer.Xd], outputs=inferencer.output_mean)
    # generator input -> transformed generator mean
    self.transform_z_to_x = theano.function(
        [generator.Xd],
        outputs=self.xt_transform(generator.output_mean))
    # handles on the first inferencer and last generator weight matrices
    self.inf_weights = inferencer.shared_layers[0].W
    self.gen_weights = generator.mu_layers[-1].W
def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None,
             i_net=None, g_net=None, d_net=None, chain_len=None,
             data_dim=None, prior_dim=None, params=None):
    """
    Build the symbolic graph, costs, updates and training function for a
    self-looped VAE chain that is trained against a discriminator.

    Parameters:
        rng: random source; must provide randint(). It seeds this object's
            RandStream and is passed on to every network clone.
        Xd: symbolic var for inputting samples that initialize the chain.
        Xc: symbolic var for controlling subsets of the state variables.
        Xm: symbolic var for masking subsets of the state variables.
        Xt: symbolic var for inputting samples from the target distribution.
        i_net: inferencer network, handed to OneStageModel as q_z_given_x.
        g_net: generator network, handed to OneStageModel as p_x_given_z.
        d_net: discriminator network; cloned with Xt stacked on top of the
            chain outputs as its input.
        chain_len: integer number of times to cycle the VAE loop.
        data_dim: forwarded to OneStageModel as x_dim.
        prior_dim: forwarded to OneStageModel as z_dim.
        params: dict of options. 'x_type' ('bernoulli' or 'gaussian') and
            'lam_l2d' must be present. Optional keys: 'cost_decay'
            (default 0.1), 'chain_type' ('walkback' or 'walkout', default
            'walkout'), 'xt_transform' ('sigmoid' or 'none', default
            sigmoid) and 'logvar_bound' (default 10).
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # ---- user-provided options, falling back to defaults ----
    self.params = {} if params is None else params
    self.cost_decay = self.params.get('cost_decay', 0.1)
    chain_type = self.params.get('chain_type', 'walkout')
    assert chain_type in ('walkback', 'walkout')
    self.chain_type = chain_type
    xt_mode = self.params.get('xt_transform', 'sigmoid')
    assert xt_mode in ('sigmoid', 'none')
    if xt_mode == 'sigmoid':
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x
    self.logvar_bound = self.params.get('logvar_bound', 10)
    # x_type selects a bernoulli or gaussian model for the observations;
    # it has no default, so a missing key raises KeyError here
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # ---- symbolic inputs ----
    self.Xd = Xd
    self.Xm = Xm
    self.Xc = Xc
    self.Xt = Xt
    self.chain_len = chain_len
    # indices for the target-distribution rows ...
    self.It = T.arange(self.Xt.shape[0])
    # ... and for the chain_len * batch generated rows that follow them
    self.Id = T.arange(
        self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

    # ---- base VAE, kept around for easy access to its parts ----
    self.OSM = OneStageModel(
        rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm,
        p_x_given_z=g_net, q_z_given_x=i_net,
        x_dim=self.data_dim, z_dim=self.prior_dim, params=self.params)
    base_model = self.OSM
    self.IN = base_model.q_z_given_x
    self.GN = base_model.p_x_given_z
    self.transform_x_to_z = base_model.transform_x_to_z
    self.transform_z_to_x = base_model.transform_z_to_x
    self.bounded_logvar = base_model.bounded_logvar

    # ---- unroll shared-parameter clones of the VAE into a chain ----
    # Every step uses the same Xc and Xm: the symbolic inputs providing the
    # observed portion of the input and the mask marking which part of the
    # input is "observed".
    self.IN_chain = []
    self.GN_chain = []
    self.Xg_chain = []
    chain_input = self.Xd
    print("Unrolling chain...")
    for step in range(self.chain_len):
        masked_input = apply_mask(Xd=chain_input, Xc=self.Xc, Xm=self.Xm)
        inf_clone = self.IN.shared_param_clone(
            rng=rng, Xd=masked_input, build_funcs=False)
        gen_clone = self.GN.shared_param_clone(
            rng=rng, Xd=inf_clone.output, build_funcs=False)
        # this step's generated output feeds the next step of the chain
        chain_input = self.xt_transform(gen_clone.output_mean)
        self.IN_chain.append(inf_clone)
        self.GN_chain.append(gen_clone)
        self.Xg_chain.append(chain_input)
        print(" step {}...".format(step))

    # ---- discriminator over [target samples; chain samples] ----
    self.DN = d_net.shared_param_clone(
        rng=rng, Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

    # ---- shared variables for cost weights and optimiser settings ----
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)

    def zero_shared(name):
        # shared variable initialised from zero_ary under the given name
        return theano.shared(value=zero_ary, name=name)

    # weight on nll of data given posterior sample
    self.lam_chain_nll = zero_shared('vcg_lam_chain_nll')
    self.set_lam_chain_nll(lam_chain_nll=1.0)
    # weight on posterior KL-div from prior
    self.lam_chain_kld = zero_shared('vcg_lam_chain_kld')
    self.set_lam_chain_kld(lam_chain_kld=1.0)
    # weight on l2 regularization of params
    self.lam_l2w = zero_shared('vcg_lam_l2w')
    self.set_lam_l2w(lam_l2w=1e-4)
    # one learning rate per network
    self.lr_dn = zero_shared('vcg_lr_dn')
    self.lr_gn = zero_shared('vcg_lr_gn')
    self.lr_in = zero_shared('vcg_lr_in')
    # momentum parameters and iteration counter, common to all networks
    self.mom_1 = zero_shared('vcg_mom_1')
    self.mom_2 = zero_shared('vcg_mom_2')
    self.it_count = zero_shared('vcg_it_count')
    # weights for the adversarial classification objective
    self.dw_dn = zero_shared('vcg_dw_dn')
    self.dw_gn = zero_shared('vcg_dw_gn')
    # fill in the learning-dynamics and adversarial-weight values
    self.set_all_sgd_params()
    self.set_disc_weights()
    # weight for regularizing the output of the discriminator
    self.lam_l2d = theano.shared(
        value=(zero_ary + params['lam_l2d']), name='vcg_lam_l2d')

    # ---- collect the optimizable parameters ----
    # The final layer of each discriminator proto-net is left out: those
    # layers are bypassed in favor of the binary classification layers
    # built by _construct_disc_layers() below.
    self.dn_params = []
    for proto_net in self.DN.proto_nets:
        for layer in proto_net[:-1]:
            self.dn_params.extend(layer.params)
    self.in_params = list(self.IN.mlp_params)
    self.in_params.append(self.OSM.output_logvar)
    self.gn_params = list(self.GN.mlp_params)
    self.joint_params = self.in_params + self.gn_params + self.dn_params

    # ---- discriminator layers and adversarial costs ----
    self._construct_disc_layers(rng)
    disc_act_l2 = T.sum([dl.act_l2_sum for dl in self.disc_layers])
    self.disc_reg_cost = self.lam_l2d[0] * disc_act_l2
    self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

    # cost optimized by the discriminator network
    self.dn_cost = (self.disc_cost_dn + self.DN.act_reg_cost +
                    self.disc_reg_cost)

    # costs optimized by the generator and inferencer networks
    self.chain_nll_cost = self.lam_chain_nll[0] * \
        self._construct_chain_nll_cost(cost_decay=self.cost_decay)
    self.chain_kld_cost = self.lam_chain_kld[0] * \
        self._construct_chain_kld_cost(cost_decay=self.cost_decay)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.osm_cost = (self.disc_cost_gn + self.chain_nll_cost +
                     self.chain_kld_cost + self.other_reg_cost)

    # total cost on the discriminator and generator/inferencer
    self.joint_cost = self.dn_cost + self.osm_cost

    # ---- gradients: dn_cost for DN params, osm_cost for IN/GN params ----
    self.joint_grads = OrderedDict()
    print("Computing VCGLoop DN cost gradients...")
    dn_grads = T.grad(self.dn_cost, self.dn_params,
                      disconnected_inputs='warn')
    for param, grad in zip(self.dn_params, dn_grads):
        self.joint_grads[param] = grad
    print("Computing VCGLoop IN cost gradients...")
    in_grads = T.grad(self.osm_cost, self.in_params,
                      disconnected_inputs='warn')
    for param, grad in zip(self.in_params, in_grads):
        self.joint_grads[param] = grad
    print("Computing VCGLoop GN cost gradients...")
    gn_grads = T.grad(self.osm_cost, self.gn_params,
                      disconnected_inputs='warn')
    for param, grad in zip(self.gn_params, gn_grads):
        self.joint_grads[param] = grad

    # ---- parameter updates ----
    # All networks share the momentum terms and iteration count; each has
    # its own learning rate, which lets its learning be turned on/off.
    common_update_args = dict(
        grads=self.joint_grads,
        beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
        mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.dn_updates = get_param_updates(
        params=self.dn_params, alpha=self.lr_dn, **common_update_args)
    self.gn_updates = get_param_updates(
        params=self.gn_params, alpha=self.lr_gn, **common_update_args)
    self.in_updates = get_param_updates(
        params=self.in_params, alpha=self.lr_in, **common_update_args)

    # bag up all the updates required for training (later entries win)
    self.joint_updates = OrderedDict()
    for net_updates in (self.dn_updates, self.gn_updates, self.in_updates):
        for key in net_updates:
            self.joint_updates[key] = net_updates[key]

    # running mean (0.98 old / 0.02 new) of the chain-averaged KL
    # divergence of the approximate posteriors
    chain_kld_sum = sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain])
    new_kld_mean = (0.98 * self.IN.kld_mean) + \
        ((0.02 / self.chain_len) * chain_kld_sum)
    self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

    # ---- compile the training function ----
    print("Compiling VCGLoop theano functions....")
    self.train_joint = self._construct_train_joint()
def __init__(self, rng=None,
             Xd=None, Xc=None, Xm=None,
             p_x_given_z=None, q_z_given_x=None,
             x_dim=None, z_dim=None,
             params=None):
    """
    Build the symbolic graph, costs, updates and compiled functions for a
    single inferencer/generator pair.

    Parameters:
        rng: random source; must provide randint(). It seeds this object's
            RandStream and is passed on to both network clones.
        Xd, Xc, Xm: symbolic inputs; they are combined by apply_mask() to
            form the inferencer's input.
        p_x_given_z: generator network, cloned with the latent sample as
            its input.
        q_z_given_x: inferencer network, cloned with the masked input as
            its input.
        x_dim, z_dim: recorded dimensions of the data and latent spaces.
        params: dict of options. 'x_type' ('bernoulli' or 'gaussian') must
            be present. Optional keys: 'xt_transform' ('sigmoid' or
            'none', default sigmoid) and 'logvar_bound' (default 10).
    """
    # private random stream seeded from the caller's rng
    self.rng = RandStream(rng.randint(100000))

    # ---- user-provided options, falling back to defaults ----
    self.params = {} if params is None else params
    xt_mode = self.params.get('xt_transform', 'sigmoid')
    assert xt_mode in ('sigmoid', 'none')
    if xt_mode == 'sigmoid':
        self.xt_transform = lambda x: T.nnet.sigmoid(x)
    else:
        self.xt_transform = lambda x: x
    self.logvar_bound = self.params.get('logvar_bound', 10)
    # x_type selects a bernoulli or gaussian model for the observations;
    # it has no default, so a missing key raises KeyError here
    self.x_type = self.params['x_type']
    assert self.x_type in ('bernoulli', 'gaussian')

    # dimensions of the spaces relevant to this model
    self.z_dim = z_dim
    self.x_dim = x_dim
    # parameters for the isotropic Gaussian prior over z
    self.prior_mean = 0.0
    self.prior_logvar = 0.0

    # ---- symbolic inputs to the computation graph ----
    self.Xd = Xd
    self.Xc = Xc
    self.Xm = Xm
    self.batch_reps = T.lscalar()
    self.x = apply_mask(self.Xd, self.Xc, self.Xm)

    # ---- inferencer: masked input -> latent sample ----
    self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
    inferencer = self.q_z_given_x
    self.z = inferencer.output
    self.z_mean = inferencer.output_mean
    self.z_logvar = inferencer.output_logvar

    # ---- generator: latent sample -> data space ----
    self.p_x_given_z = p_x_given_z.shared_param_clone(rng=rng, Xd=self.z)
    generator = self.p_x_given_z
    # use the generator's deterministic (mean) output
    self.xt = generator.output_mean
    # final generator output: bernoulli always goes through a sigmoid,
    # otherwise the configured xt_transform is applied
    if self.x_type == 'bernoulli':
        self.xg = T.nnet.sigmoid(self.xt)
    else:
        self.xg = self.xt_transform(self.xt)
    # output_logvar is the bias of the generator's last sigma layer; its
    # first entry is squashed into (-logvar_bound, logvar_bound) by tanh
    self.output_logvar = generator.sigma_layers[-1].b
    self.bounded_logvar = self.logvar_bound * \
        T.tanh(self.output_logvar[0] / self.logvar_bound)

    # ---- shared variables for optimiser settings and cost weights ----
    zero_ary = np.zeros((1,)).astype(theano.config.floatX)

    def zero_shared(name):
        # shared variable initialised from zero_ary under the given name
        return theano.shared(value=zero_ary, name=name)

    # learning rate for generator and inferencer
    self.lr_1 = zero_shared('osm_lr_1')
    # momentum parameters and iteration counter
    self.mom_1 = zero_shared('osm_mom_1')
    self.mom_2 = zero_shared('osm_mom_2')
    self.it_count = zero_shared('osm_it_count')
    # fill in the learning-dynamics values
    self.set_sgd_params()
    # weight on nll of data given posterior sample
    self.lam_nll = zero_shared('osm_lam_nll')
    self.set_lam_nll(lam_nll=1.0)
    # weights on the two KL terms
    self.lam_kld_1 = zero_shared('osm_lam_kld_1')
    self.lam_kld_2 = zero_shared('osm_lam_kld_2')
    self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
    # weight on l2 regularization of params
    self.lam_l2w = zero_shared('osm_lam_l2w')
    self.set_lam_l2w(1e-4)

    # ---- optimizable parameters ("group 1") ----
    self.group_1_params = (list(inferencer.mlp_params) +
                           list(generator.mlp_params))
    # joint_params is deliberately the same list object as group_1_params
    self.joint_params = self.group_1_params

    # ---- costs ----
    self.nll_costs = self.lam_nll[0] * self._construct_nll_costs()
    self.nll_cost = T.mean(self.nll_costs)
    self.kld_costs_1, self.kld_costs_2 = self._construct_kld_costs()
    weighted_kld_1 = self.lam_kld_1[0] * self.kld_costs_1
    weighted_kld_2 = self.lam_kld_2[0] * self.kld_costs_2
    self.kld_costs = weighted_kld_1 + weighted_kld_2
    self.kld_cost = T.mean(self.kld_costs)
    # only the parameter regularizer enters the objective here
    _, param_reg_cost = self._construct_reg_costs()
    self.reg_cost = self.lam_l2w[0] * param_reg_cost
    self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

    # ---- gradients of the joint cost ----
    print("Computing OneStageModel cost gradients...")
    self.joint_grads = OrderedDict()
    all_grads = T.grad(self.joint_cost, self.joint_params)
    for param, grad in zip(self.joint_params, all_grads):
        self.joint_grads[param] = grad

    # ---- updates for the generator and inferencer networks ----
    self.joint_updates = get_param_updates(
        params=self.joint_params, grads=self.joint_grads,
        alpha=self.lr_1, beta1=self.mom_1, beta2=self.mom_2,
        it_count=self.it_count, mom2_init=1e-3, smoothing=1e-8,
        max_grad_norm=10.0)

    # ---- compiled functions ----
    print("Compiling OneStageModel theano functions...")
    self.train_joint = self._construct_train_joint()
    self.compute_fe_terms = self._construct_compute_fe_terms()
    self.compute_post_klds = self._construct_compute_post_klds()
    self.sample_from_prior = self._construct_sample_from_prior()
    # inferencer input -> posterior mean
    self.transform_x_to_z = theano.function(
        [inferencer.Xd], outputs=inferencer.output_mean)
    # generator input -> transformed generator mean
    self.transform_z_to_x = theano.function(
        [generator.Xd],
        outputs=self.xt_transform(generator.output_mean))
    # handles on the first inferencer and last generator weight matrices
    self.inf_weights = inferencer.shared_layers[0].W
    self.gen_weights = generator.mu_layers[-1].W
def __init__(self, rng=None, Xd=None,
        i_net=None, g_net=None, chain_len=None,
        data_dim=None, prior_dim=None):
    """
    Build a self-looped chain of shared-parameter inferencer/generator
    clones, starting from the samples provided through Xd.

    Parameters:
        rng: random generator; rng.randint seeds this object's RandStream
             and rng is also handed to GIPair and to every clone
        Xd: symbolic variable providing the samples that start the chain
        i_net: inferencer network passed to GIPair
        g_net: generator network passed to GIPair
        chain_len: number of inferencer -> generator steps in the chain
        data_dim: data-space dimension, forwarded to GIPair
        prior_dim: prior-space dimension, forwarded to GIPair

    Note: rng and chain_len are used unconditionally (rng.randint(...) and
    range(chain_len)), so their None defaults are not usable values.
    """
    # private random stream, seeded with one draw from the caller's rng
    self.rng = RandStream(rng.randint(100000))
    self.data_dim, self.prior_dim = data_dim, prior_dim
    # symbolic input holding the samples that initialize the chain
    self.Xd = Xd
    # the mask (Xm) and control (Xc) inputs are fixed here to all-zero
    # tensors shaped like Xd
    self.Xm = T.zeros_like(self.Xd)
    self.Xc = T.zeros_like(self.Xd)
    # number of times the inferencer -> generator loop is unrolled
    self.chain_len = chain_len
    # base generator/inferencer pair; its networks are the templates that
    # every link of the chain is cloned from
    self.GIP = GIPair(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm,
            g_net=g_net, i_net=i_net, data_dim=self.data_dim,
            prior_dim=self.prior_dim, params=None, shared_param_dicts=None)
    self.IN, self.GN = self.GIP.IN, self.GIP.GN
    self.use_encoder = self.IN.use_encoder
    assert(self.use_encoder == self.GN.use_decoder)
    # Unroll the chain. Every link shares the same Xc and Xm and is built
    # identically; only the data fed into its inferencer changes.
    self.IN_chain, self.GN_chain, self.Xg_chain = [], [], []
    link_input = self.Xd
    for _ in range(self.chain_len):
        masked_input = apply_mask(Xd=link_input, Xc=self.Xc, Xm=self.Xm)
        link_in = self.IN.shared_param_clone(rng=rng, Xd=masked_input)
        link_gn = self.GN.shared_param_clone(rng=rng, Xp=link_in.output)
        # With an encoder in use, the next inferencer re-encodes its input,
        # so it is handed the generator's "decoded" output; otherwise it is
        # handed the generator's raw ("encoded") output.
        link_input = link_gn.output_decoded if self.use_encoder \
                else link_gn.output
        self.IN_chain.append(link_in)
        self.GN_chain.append(link_gn)
        self.Xg_chain.append(link_input)
    # build the function that samples from the unrolled chain
    self.sample_from_chain = self._construct_sample_from_chain()
    return
def __init__(self, rng=None,
        Xd=None, Xc=None, Xm=None,
        g_net=None, i_net=None,
        data_dim=None, prior_dim=None,
        g_net_2=None, i_net_2=None,
        prior_dim_2=None,
        params=None, shared_param_dicts=None):
    """
    Stack two generator/inferencer pairs and build their training graphs.

    The bottom pair (i_net / g_net) works on the masked data input. The top
    pair (i_net_2 / g_net_2) is fed the posterior samples emitted by the
    bottom inferencer. Costs, clipped gradients and Adam updates are built
    for three objectives: bottom only, top only, and their sum ("joint").

    Parameters:
        rng: random generator; rng.randint seeds this object's RandStream
             and rng is also handed to every network clone
        Xd, Xc, Xm: symbolic data, control and mask inputs, combined by
             apply_mask before entering the bottom inferencer
        g_net, i_net: bottom generator and inferencer networks
        data_dim: expected data-space dimension (validated by assert)
        prior_dim: expected bottom prior-space dimension (validated)
        g_net_2, i_net_2: top generator and inferencer networks
        prior_dim_2: expected top prior-space dimension (validated)
        params: optional dict stored on self.params ({} when None)
        shared_param_dicts: None for an original object; otherwise the
             dict of shared variables of the object this one is a clone of
    """
    # setup a rng for this object
    self.rng = RandStream(rng.randint(100000))
    self.params = {} if params is None else params
    # symbolic variables that feed the computation graph built below
    self.Xd, self.Xc, self.Xm = Xd, Xc, Xm
    # check whether we'll be working with "encoded" inputs
    self.use_encoder = i_net.use_encoder
    print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format(
            str(i_net.use_encoder), str(g_net.use_decoder)))
    assert(self.use_encoder == g_net.use_decoder)

    ##################################
    # BOTTOM INFERENCER / GENERATOR  #
    ##################################
    # shared-parameter clone of the inferencer, reading the masked input
    masked_input = apply_mask(Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
    self.IN = i_net.shared_param_clone(rng=rng, Xd=masked_input)
    self.posterior_means = self.IN.output_mean
    self.posterior_sigmas = self.IN.output_sigma
    # L2 norm of the posterior means, taken along axis 1
    squared_means = self.posterior_means**2.0
    self.posterior_norms = T.sqrt(T.sum(squared_means, axis=1, keepdims=1))
    self.posterior_klds = self.IN.kld_cost
    self.kld2_scale = self.IN.kld2_scale
    # handle for samples from the variational posterior
    self.Xp = self.IN.output
    # shared-parameter clone of the generator, fed with posterior samples
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    # handle for sampled reconstructions from the generator
    self.Xg = self.GN.output

    ###############################
    # TOP INFERENCER / GENERATOR  #
    ###############################
    # The top pair models the bottom posterior samples; its control and
    # mask inputs are all-zero tensors shaped like Xp.
    zero_control = T.zeros_like(self.Xp)
    zero_mask = T.zeros_like(self.Xp)
    self.IN2 = i_net_2.shared_param_clone(rng=rng,
            Xd=apply_mask(Xd=self.Xp, Xc=zero_control, Xm=zero_mask))
    # handle for samples from the top's variational posterior
    self.Xp2 = self.IN2.output
    # the top generator maps those samples back to the bottom latent space
    self.GN2 = g_net_2.shared_param_clone(rng=rng, Xp=self.Xp2)
    # handle for sampled (latent) reconstructions from GN2
    self.Xg2 = self.GN2.output

    # record and validate the dimensionality parameters
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    self.prior_dim_2 = prior_dim_2
    # generator output and inferencer input must both match data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # generator input and inferencer mu/sigma outputs must match prior_dim
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
    # top generator input and top inferencer mu/sigma outputs must match
    # prior_dim_2
    assert(self.prior_dim_2 == self.GN2.mlp_layers[0].in_dim)
    assert(self.prior_dim_2 == self.IN2.mu_layers[-1].out_dim)
    assert(self.prior_dim_2 == self.IN2.sigma_layers[-1].out_dim)

    # A given shared_param_dicts marks this object as a clone that reuses
    # the shared variables of some "base" object; otherwise start a fresh
    # dict to publish our own shared variables in.
    self.is_clone = shared_param_dicts is not None
    self.shared_param_dicts = shared_param_dicts if self.is_clone else {}
    if self.is_clone:
        shared = self.shared_param_dicts
        self.lr_gn = shared['gip_lr_gn']
        self.lr_in = shared['gip_lr_in']
        self.mom_1 = shared['gip_mom_1']
        self.mom_2 = shared['gip_mom_2']
        self.it_count_bot = shared['gip_it_count_bot']
        self.it_count_top = shared['gip_it_count_top']
        self.it_count_joint = shared['gip_it_count_joint']
        self.lam_nll = shared['gip_lam_nll']
        self.lam_kld = shared['gip_lam_kld']
        self.lam_l2w = shared['gip_lam_l2w']
    else:
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)

        def _new_shared(name):
            # one-element shared variable, initialized from zero_ary
            return theano.shared(value=zero_ary, name=name)

        # learning rates for generator and inferencer
        self.lr_gn = _new_shared('gip_lr_gn')
        self.lr_in = _new_shared('gip_lr_in')
        # momentum parameters and one iteration counter per objective
        self.mom_1 = _new_shared('gip_mom_1')
        self.mom_2 = _new_shared('gip_mom_2')
        self.it_count_bot = _new_shared('gip_it_count_bot')
        self.it_count_top = _new_shared('gip_it_count_top')
        self.it_count_joint = _new_shared('gip_it_count_joint')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # weight on the nll of data given a posterior sample
        self.lam_nll = _new_shared('gip_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weight on the prior kld, traded against reconstruction
        self.lam_kld = _new_shared('gip_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # weight on l2 regularization of the parameters
        self.lam_l2w = _new_shared('gip_lam_l2w')
        self.set_lam_l2w(1e-4)
        # publish the variables that clones of this object will reuse
        published = (
            ('gip_lr_gn', self.lr_gn),
            ('gip_lr_in', self.lr_in),
            ('gip_mom_1', self.mom_1),
            ('gip_mom_2', self.mom_2),
            ('gip_it_count_bot', self.it_count_bot),
            ('gip_it_count_top', self.it_count_top),
            ('gip_it_count_joint', self.it_count_joint),
            ('gip_lam_nll', self.lam_nll),
            ('gip_lam_kld', self.lam_kld),
            ('gip_lam_l2w', self.lam_l2w),
        )
        for key, shared_var in published:
            self.shared_param_dicts[key] = shared_var

    # optimizable parameters of the bottom pair
    self.in_params = list(self.IN.mlp_params)
    self.gn_params = list(self.GN.mlp_params)
    self.bot_params = self.in_params + self.gn_params
    # optimizable parameters of the top pair
    self.in2_params = list(self.IN2.mlp_params)
    self.gn2_params = list(self.GN2.mlp_params)
    self.top_params = self.in2_params + self.gn2_params
    # optimizable parameters of top + bottom
    self.joint_params = self.top_params + self.bot_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.data_nll_cost_bot = self.lam_nll[0] * \
            self._construct_data_nll_cost(which_gip='bot')
    self.data_nll_cost_top = self.lam_nll[0] * \
            self._construct_data_nll_cost(which_gip='top')
    # NOTE(review): both KLD costs are given self.kld2_scale, which was
    # read from the bottom inferencer (self.IN) -- confirm this is the
    # intended scale for the top pair as well.
    self.post_kld_cost_bot = self.lam_kld[0] * \
            self._construct_post_kld_cost(which_gip='bot',
                    kld2_scale=self.kld2_scale)
    self.post_kld_cost_top = self.lam_kld[0] * \
            self._construct_post_kld_cost(which_gip='top',
                    kld2_scale=self.kld2_scale)
    self.other_reg_cost_bot = \
            self._construct_other_reg_cost(which_gip='bot')
    self.other_reg_cost_top = \
            self._construct_other_reg_cost(which_gip='top')
    # summed costs for the bottom, top, and joint objectives
    self.bot_cost = self.data_nll_cost_bot + self.post_kld_cost_bot + \
            self.other_reg_cost_bot
    self.top_cost = self.data_nll_cost_top + self.post_kld_cost_top + \
            self.other_reg_cost_top
    self.joint_cost = self.bot_cost + self.top_cost

    #########################################
    # CONSTRUCT THE GRADIENTS FOR THE COSTS #
    #########################################
    def _clipped_grads(cost, wrt_params):
        # gradient of cost for each parameter, clipped to [-0.1, 0.1]
        return OrderedDict(
            (p, T.grad(cost, p).clip(-0.1, 0.1)) for p in wrt_params)

    self.bot_grads = _clipped_grads(self.bot_cost, self.bot_params)
    self.top_grads = _clipped_grads(self.top_cost, self.top_params)
    self.joint_grads = _clipped_grads(self.joint_cost, self.joint_params)

    #######################################
    # CONSTRUCT THE UPDATES FOR THE COSTS #
    #######################################
    def _adam(wrt_params, grads, alpha, it_count):
        # Adam updates with the momentum settings shared by all objectives
        return get_adam_updates(params=wrt_params,
                grads=grads, alpha=alpha,
                beta1=self.mom_1, beta2=self.mom_2,
                it_count=it_count,
                mom2_init=1e-3, smoothing=1e-8)

    def _merged(update_dicts):
        # copy several update dicts, in order, into one OrderedDict
        combined = OrderedDict()
        for part in update_dicts:
            for k in part:
                combined[k] = part[k]
        return combined

    # updates for the bottom pair, for the bottom cost
    self.gn_updates_bot = _adam(self.gn_params, self.bot_grads,
            self.lr_gn, self.it_count_bot)
    self.in_updates_bot = _adam(self.in_params, self.bot_grads,
            self.lr_in, self.it_count_bot)
    # updates for the top pair, for the top cost
    self.gn2_updates_top = _adam(self.gn2_params, self.top_grads,
            self.lr_gn, self.it_count_top)
    self.in2_updates_top = _adam(self.in2_params, self.top_grads,
            self.lr_in, self.it_count_top)
    # updates for the bottom pair, for the joint cost
    self.gn_updates_joint = _adam(self.gn_params, self.joint_grads,
            self.lr_gn, self.it_count_joint)
    self.in_updates_joint = _adam(self.in_params, self.joint_grads,
            self.lr_in, self.it_count_joint)
    # updates for the top pair, for the joint cost
    self.gn2_updates_joint = _adam(self.gn2_params, self.joint_grads,
            self.lr_gn, self.it_count_joint)
    self.in2_updates_joint = _adam(self.in2_params, self.joint_grads,
            self.lr_in, self.it_count_joint)

    # Merge the per-network updates of each objective, then add the
    # kld_mean update(s) of the inferencer(s) that objective involves.
    self.bot_updates = _merged((self.gn_updates_bot, self.in_updates_bot))
    self.bot_updates[self.IN.kld_mean] = self.IN.kld_mean_update
    self.top_updates = _merged((self.gn2_updates_top,
            self.in2_updates_top))
    self.top_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update
    self.joint_updates = _merged((self.gn_updates_joint,
            self.in_updates_joint, self.gn2_updates_joint,
            self.in2_updates_joint))
    self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update
    self.joint_updates[self.IN2.kld_mean] = self.IN2.kld_mean_update

    # construct the training and cost-computing functions
    self.train_bot = self._construct_train_bot()
    self.train_top = self._construct_train_top()
    self.train_joint = self._construct_train_joint()
    self.compute_costs = self._construct_compute_costs()
    return
def __init__(self, rng=None,
        Xd=None, Xc=None, Xm=None,
        g_net=None, i_net=None,
        data_dim=None, prior_dim=None,
        params=None, shared_param_dicts=None):
    """
    Wire an inferencer and a generator into one jointly trainable pair.

    The inferencer reads the masked data input and emits posterior samples.
    The generator is fed those samples. A joint cost (weighted data nll +
    weighted posterior kld + other regularization) is built, together with
    its gradients, Adam updates and the compiled helper functions.

    Parameters:
        rng: random generator; rng.randint seeds this object's RandStream
             and rng is also handed to both network clones
        Xd, Xc, Xm: symbolic data, control and mask inputs, combined by
             apply_mask before entering the inferencer
        g_net, i_net: generator and inferencer networks to clone
        data_dim: expected data-space dimension (validated by assert)
        prior_dim: expected prior-space dimension (validated by assert)
        params: optional dict stored on self.params ({} when None)
        shared_param_dicts: None for an original object; otherwise the
             dict of shared variables of the object this one is a clone of
    """
    # setup a rng for this object
    self.rng = RandStream(rng.randint(100000))
    self.params = {} if params is None else params
    # symbolic variables that feed the computation graph built below
    self.Xd, self.Xc, self.Xm = Xd, Xc, Xm
    # check whether we'll be working with "encoded" inputs
    self.use_encoder = i_net.use_encoder
    print("i_net.use_encoder: {0:s}, g_net.use_decoder: {1:s}".format(
            str(i_net.use_encoder), str(g_net.use_decoder)))
    assert(self.use_encoder == g_net.use_decoder)

    # shared-parameter clone of the inferencer, reading the masked input
    masked_input = apply_mask(self.Xd, self.Xc, self.Xm)
    self.IN = i_net.shared_param_clone(rng=rng, Xd=masked_input)
    self.posterior_means = self.IN.output_mean
    self.posterior_sigmas = self.IN.output_sigma
    # L2 norm of the posterior means, taken along axis 1
    squared_means = self.posterior_means**2.0
    self.posterior_norms = T.sqrt(T.sum(squared_means, axis=1, keepdims=1))
    self.posterior_klds = self.IN.kld_cost
    self.kld2_scale = self.IN.kld2_scale
    # handle for samples from the variational posterior
    self.Xp = self.IN.output
    # shared-parameter clone of the generator, fed with posterior samples
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
    # handle for sampled reconstructions from the generator
    self.Xg = self.GN.output

    # record and validate the dimensionality parameters
    self.data_dim = data_dim
    self.prior_dim = prior_dim
    # generator output and inferencer input must both match data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    # generator input and inferencer mu/sigma outputs must match prior_dim
    assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

    # A given shared_param_dicts marks this object as a clone that reuses
    # the shared variables of some "base" object; otherwise start a fresh
    # dict to publish our own shared variables in.
    self.is_clone = shared_param_dicts is not None
    self.shared_param_dicts = shared_param_dicts if self.is_clone else {}
    if self.is_clone:
        shared = self.shared_param_dicts
        self.lr_gn = shared['gip_lr_gn']
        self.lr_in = shared['gip_lr_in']
        self.mom_1 = shared['gip_mom_1']
        self.mom_2 = shared['gip_mom_2']
        self.it_count = shared['gip_it_count']
        self.lam_nll = shared['gip_lam_nll']
        self.lam_kld = shared['gip_lam_kld']
        self.lam_l2w = shared['gip_lam_l2w']
    else:
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)

        def _new_shared(name):
            # one-element shared variable, initialized from zero_ary
            return theano.shared(value=zero_ary, name=name)

        # learning rates for generator and inferencer
        self.lr_gn = _new_shared('gip_lr_gn')
        self.lr_in = _new_shared('gip_lr_in')
        # momentum parameters and the iteration counter
        self.mom_1 = _new_shared('gip_mom_1')
        self.mom_2 = _new_shared('gip_mom_2')
        self.it_count = _new_shared('gip_it_count')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # weight on the nll of data given a posterior sample
        self.lam_nll = _new_shared('gip_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # weight on the prior kld, traded against reconstruction
        self.lam_kld = _new_shared('gip_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # weight on l2 regularization of the parameters
        self.lam_l2w = _new_shared('gip_lam_l2w')
        self.set_lam_l2w(1e-4)
        # publish the variables that clones of this object will reuse
        published = (
            ('gip_lr_gn', self.lr_gn),
            ('gip_lr_in', self.lr_in),
            ('gip_mom_1', self.mom_1),
            ('gip_mom_2', self.mom_2),
            ('gip_it_count', self.it_count),
            ('gip_lam_nll', self.lam_nll),
            ('gip_lam_kld', self.lam_kld),
            ('gip_lam_l2w', self.lam_l2w),
        )
        for key, shared_var in published:
            self.shared_param_dicts[key] = shared_var

    # full set of "optimizable" parameters: inferencer first, then generator
    self.in_params = list(self.IN.mlp_params)
    self.gn_params = list(self.GN.mlp_params)
    self.joint_params = self.in_params + self.gn_params

    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
    self.post_kld_cost = self.lam_kld[0] * \
            self._construct_post_kld_cost(kld2_scale=self.kld2_scale)
    self.other_reg_cost = self._construct_other_reg_cost()
    self.joint_cost = self.data_nll_cost + self.post_kld_cost + \
            self.other_reg_cost

    # gradient of the joint cost for every optimizable parameter
    self.joint_grads = OrderedDict(
        (p, T.grad(self.joint_cost, p)) for p in self.joint_params)

    # Adam updates for the generator and inferencer; the two calls differ
    # only in the parameter list and the learning rate they are given
    adam_settings = dict(grads=self.joint_grads,
            beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
            mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
    self.gn_updates = get_adam_updates(params=self.gn_params,
            alpha=self.lr_gn, **adam_settings)
    self.in_updates = get_adam_updates(params=self.in_params,
            alpha=self.lr_in, **adam_settings)
    # merge both update dicts (generator first), then add the update of
    # the inferencer's kld_mean
    self.joint_updates = OrderedDict()
    for part in (self.gn_updates, self.in_updates):
        for k in part:
            self.joint_updates[k] = part[k]
    self.joint_updates[self.IN.kld_mean] = self.IN.kld_mean_update

    # construct the training function and the evaluation helpers
    self.train_joint = self._construct_train_joint()
    self.compute_costs = self._construct_compute_costs()
    self.compute_ll_bound = self._construct_compute_ll_bound()
    self.compute_post_stats = self._construct_compute_post_stats()
    return