def a(t):
    """Return ``c / (1 - t)`` for the module-level scalar ``c``."""
    remaining = 1 - t
    return c / remaining


def log_a(t):
    """Return ``log(c / (1 - t))``, i.e. the logarithm of ``a(t)``."""
    ratio = c / (1 - t)
    return T.log(ratio)


def A(t):
    """Return ``-c * log(1 - t)``.

    Mathematically this is the integral of ``c / (1 - s)`` for ``s`` from 0
    to ``t``.
    """
    scale = -c
    return scale * T.log(1 - t)


def create_harmonic(M):
    """Return the first ``M`` harmonic numbers as a float32 NumPy array.

    Entry ``k`` (0-based) holds ``1 + 1/2 + ... + 1/(k + 1)``.
    """
    reciprocals = 1.0 / np.arange(1, M + 1)
    partial_sums = np.cumsum(reciprocals)
    return partial_sums.astype(np.float32)


# Select '/cpu:0' as the backend's default device before any tensor is made.
T.set_default_device('/cpu:0')

# Scalar referenced by a(), log_a() and A() above.
c = T.scalar(name='c')

# int32 matrix; its first five columns are split out under separate names.
segments = T.matrix(dtype='int32', name='segments')
a_idx, b_idx, leaf_segment, m, log_fac = (
    segments[:, column] for column in range(5)
)

x = T.matrix(name='x')
e = T.matrix(name='e')

# Shared trunk fed by ``x``: two repeated Tanh(200) layers.
q_network = (
    Vector(X.shape[1], placeholder=x, is_input=False)
    >> Repeat(Tanh(200), 2)
)

# Two separate Linear(D) heads are stacked on the same trunk.
q_mu_network = q_network >> Linear(D)
q_mu = q_mu_network.get_outputs()[0].get_placeholder()

q_sigma_network = q_network >> Linear(D)
# sqrt(exp(.)) of the second head's output.
# NOTE(review): this reads like the head emits a log-variance -- confirm.
q_sigma = tf.sqrt(
    tf.exp(q_sigma_network.get_outputs()[0].get_placeholder())
)
def initialize(self):
    """Build the model graph and its training ops, then open a session.

    Inside a freshly created graph this method:

    * instantiates ``self.prior`` and ``self.cost`` from ``PRIOR_MAP`` /
      ``COST_MAP`` using copies of ``self.prior_params`` / ``self.cost_params``;
    * declares the input placeholders (``O``, ``U``, ``C``, ``S``, ``A``,
      ``t``, ``state``, ``action``, ``num_data``, ``beta`` and the two
      learning rates);
    * maps the state/action encoders over ``O`` / ``U`` and, depending on
      what the prior reports about itself, adds the dynamics-posterior and
      filtering sub-graphs (their attributes only exist in those cases);
    * builds the posterior, its samples, the decoders, the ELBO and the
      beta-weighted ELBO that the network parameters are trained on;
    * registers scalar summaries and merges them into ``self.summary``;
    * creates ``neural_op``, ``cost_op`` and ``dynamics_op`` and groups them
      into ``self.train_op``.

    Finally an interactive session bound to the graph is stored in
    ``self.session``.
    """
    def float_placeholder(shape):
        # Placeholder using the backend's default float type.
        return T.placeholder(T.floatx(), shape)

    self.graph = T.core.Graph()
    with self.graph.as_default():
        # Work on copies so popping the type key leaves the stored
        # configuration dictionaries untouched.
        prior_kwargs = self.prior_params.copy()
        prior_name = prior_kwargs.pop('prior_type')
        self.prior = PRIOR_MAP[prior_name](
            self.ds, self.da, self.horizon, **prior_kwargs
        )

        cost_kwargs = self.cost_params.copy()
        cost_name = cost_kwargs.pop('cost_type')
        self.cost = COST_MAP[cost_name](self.ds, self.da, **cost_kwargs)

        # Input placeholders; the two leading axes are left unspecified
        # (presumably batch and time -- TODO confirm).
        self.O = float_placeholder([None, None, self.do])
        self.U = float_placeholder([None, None, self.du])
        self.C = float_placeholder([None, None])
        self.S = float_placeholder([None, None, self.ds])
        self.A = float_placeholder([None, None, self.da])

        # Single-step inputs for querying the prior's dynamics.
        self.t = T.placeholder(T.int32, [])
        self.state = float_placeholder([None, self.ds])
        self.action = float_placeholder([None, self.da])
        if self.prior.has_dynamics():
            self.next_state = self.prior.next_state(
                self.state, self.action, self.t
            )
            self.prior_dynamics = self.prior.get_dynamics()

        self.num_data = T.scalar()
        self.beta = float_placeholder([])
        self.learning_rate = float_placeholder([])
        self.model_learning_rate = float_placeholder([])

        # Encoders applied across the observation / control inputs.
        self.S_potentials = util.map_network(self.state_encoder)(self.O)
        self.A_potentials = util.map_network(self.action_encoder)(self.U)

        if self.prior.is_dynamics_prior():
            self.data_strength = float_placeholder([])
            self.max_iter = T.placeholder(T.int32, [])
            dynamics, (state_posterior, action_posterior) = (
                self.prior.posterior_dynamics(
                    self.S_potentials, self.A_potentials,
                    data_strength=self.data_strength,
                    max_iter=self.max_iter,
                )
            )
            self.posterior_dynamics_ = (
                dynamics,
                (
                    state_posterior.expected_value(),
                    action_posterior.expected_value(),
                ),
            )

        if self.prior.is_filtering_prior():
            self.prior_dynamics_stats = self.prior.sufficient_statistics()
            joint_dim = self.ds + self.da
            # Four placeholders through which dynamics statistics are fed.
            self.dynamics_stats = (
                float_placeholder([None, self.ds, self.ds]),
                float_placeholder([None, self.ds, joint_dim]),
                float_placeholder([None, joint_dim, joint_dim]),
                float_placeholder([None]),
            )

            state_natural = self.S_potentials.get_parameters('natural')
            observed_steps = T.shape(state_natural)[1]

            def pad_time_axis(tensor, trailing_axes):
                # Pad only the end of axis 1, by horizon - observed_steps
                # entries; every other axis gets [0, 0].
                paddings = (
                    [[0, 0], [0, self.horizon - observed_steps]]
                    + [[0, 0]] * trailing_axes
                )
                return T.core.pad(tensor, paddings)

            self.padded_S = stats.Gaussian(
                pad_time_axis(
                    self.S_potentials.get_parameters('natural'), 2
                ),
                'natural',
            )
            self.padded_A = stats.GaussianScaleDiag(
                [
                    pad_time_axis(
                        self.A_potentials.get_parameters('regular')[0], 1
                    ),
                    pad_time_axis(
                        self.A_potentials.get_parameters('regular')[1], 1
                    ),
                ],
                'regular',
            )
            self.q_S_padded, self.q_A_padded = self.prior.encode(
                self.padded_S, self.padded_A,
                dynamics_stats=self.dynamics_stats,
            )

            # Cut the padded results back to the steps actually provided.
            self.q_S_filter = self.q_S_padded.filter(max_steps=observed_steps)
            self.q_A_filter = self.q_A_padded.__class__(
                self.q_A_padded.get_parameters('natural')[:, :observed_steps],
                'natural',
            )
            self.e_q_S_filter = self.q_S_filter.expected_value()
            self.e_q_A_filter = self.q_A_filter.expected_value()

        posterior_outputs = self.prior.posterior_kl_grads(
            self.S_potentials, self.A_potentials, self.num_data
        )
        (self.q_S, self.q_A), self.prior_kl, self.kl_grads, self.info = (
            posterior_outputs
        )

        self.q_S_sample = self.q_S.sample()[0]
        self.q_A_sample = self.q_A.sample()[0]

        # Decoders applied to posterior samples ...
        self.q_O = util.map_network(self.state_decoder)(self.q_S_sample)
        self.q_U = util.map_network(self.action_decoder)(self.q_A_sample)
        self.q_O_sample = self.q_O.sample()[0]
        self.q_U_sample = self.q_U.sample()[0]

        # ... and to externally supplied states / actions.
        self.q_O_ = util.map_network(self.state_decoder)(self.S)
        self.q_U_ = util.map_network(self.action_decoder)(self.A)
        self.q_O__sample = self.q_O_.sample()[0]
        self.q_U__sample = self.q_U_.sample()[0]

        self.cost_likelihood = self.cost.log_likelihood(
            self.q_S_sample, self.C
        )
        if self.cost.is_cost_function():
            self.evaluated_cost = self.cost.evaluate(self.S)

        self.log_likelihood = T.sum(self.q_O.log_likelihood(self.O), axis=1)
        self.elbo = T.mean(
            self.log_likelihood + self.cost_likelihood - self.prior_kl
        )
        # Training objective: beta scales the cost-likelihood and KL terms.
        beta_elbo = T.mean(
            self.log_likelihood
            + self.beta * (self.cost_likelihood - self.prior_kl)
        )

        log_scalar = T.core.summary.scalar
        log_scalar(
            "encoder-stdev",
            T.mean(self.S_potentials.get_parameters('regular')[0]),
        )
        log_scalar("log-likelihood", T.mean(self.log_likelihood))
        log_scalar("cost-likelihood", T.mean(self.cost_likelihood))
        log_scalar("prior-kl", T.mean(self.prior_kl))
        log_scalar("beta", self.beta)
        log_scalar("elbo", self.elbo)
        log_scalar("beta-elbo", beta_elbo)
        for tag, value in self.info.items():
            log_scalar(tag, T.mean(value))
        self.summary = T.core.summary.merge_all()

        network_variables = (
            self.state_encoder.get_parameters()
            + self.state_decoder.get_parameters()
            + self.action_encoder.get_parameters()
            + self.action_decoder.get_parameters()
        )
        cost_variables = self.cost.get_parameters()

        # Encoder/decoder update: Adam on -beta_elbo, with gradients
        # clipped to a global norm of 5.0.
        if len(network_variables) == 0:
            self.neural_op = T.core.no_op()
        else:
            adam = T.core.train.AdamOptimizer(self.learning_rate)
            grads, variables = zip(
                *adam.compute_gradients(-beta_elbo, var_list=network_variables)
            )
            clipped, _ = tf.clip_by_global_norm(grads, 5.0)
            self.neural_op = adam.apply_gradients(zip(clipped, variables))

        # Cost update: Adam on the unweighted -elbo.
        if len(cost_variables) == 0:
            self.cost_op = T.core.no_op()
        else:
            self.cost_op = T.core.train.AdamOptimizer(
                self.learning_rate
            ).minimize(-self.elbo, var_list=cost_variables)

        # Dynamics update driven by the pairs returned with the prior KL.
        if len(self.kl_grads) == 0:
            self.dynamics_op = T.core.no_op()
        else:
            if self.prior.is_dynamics_prior():
                # Plain gradient descent; a MomentumOptimizer(x, 0.5)
                # alternative is recorded here only as a note.
                make_optimizer = T.core.train.GradientDescentOptimizer
            else:
                make_optimizer = T.core.train.AdamOptimizer
            # Each kl_grads pair is swapped before being applied.
            # NOTE(review): presumably kl_grads holds (variable, gradient)
            # while apply_gradients wants (gradient, variable) -- confirm.
            self.dynamics_op = make_optimizer(
                self.model_learning_rate
            ).apply_gradients(
                [(second, first) for first, second in self.kl_grads]
            )

        self.train_op = T.core.group(
            self.neural_op, self.dynamics_op, self.cost_op
        )

    self.session = T.interactive_session(
        graph=self.graph,
        allow_soft_placement=True,
        log_device_placement=False,
    )