def unroll(self, seed_states):
    assert seed_states.shape.as_list() == [None, self.state_dim]
    no_samples = self.no_samples
    unroll_steps = self.unroll_steps
    self.reward_model = real_env_pendulum_reward()  # Use the true reward model.

    # Duplicate each seed state no_samples times: [B, D] -> [B * no_samples, D].
    states = tf.expand_dims(seed_states, axis=1)
    states = tf.tile(states, [1, no_samples, 1])
    states = tf.reshape(states, shape=[-1, self.state_dim])

    costs = []
    for unroll_step in range(unroll_steps):
        actions = self.build_policy(states)

        # Discounted reward for this step; negate to obtain a cost.
        rewards = (self.discount_factor ** unroll_step) * \
                  self.reward_model.build(states, actions)
        rewards = tf.reshape(tf.squeeze(rewards, axis=-1),
                             shape=[-1, no_samples])
        costs.append(-rewards)

        states_actions = tf.concat([states, actions], axis=-1)
        next_states = self.get_next_states(states_actions)
        states = next_states

    costs = tf.stack(costs, axis=-1)
    # Average over particles, sum over time, then average over the batch.
    self.loss = tf.reduce_mean(
        tf.reduce_sum(tf.reduce_mean(costs, axis=1), axis=-1))
    self.opt = tf.train.AdamOptimizer().minimize(
        self.loss,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   'policy_scope'))
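# A minimal, self-contained NumPy sketch of the particle-duplication
# bookkeeping at the top of unroll() above. The shapes (batch of 4,
# 3 particles, 2-d state) are hypothetical, chosen only for illustration.
def _sketch_particle_duplication():
    import numpy as np
    seed_states = np.arange(8.).reshape(4, 2)            # [batch, state_dim]
    tiled = np.tile(seed_states[:, None, :], [1, 3, 1])  # [batch, no_samples, state_dim]
    flat = tiled.reshape(-1, 2)                          # [batch * no_samples, state_dim]
    assert flat.shape == (12, 2)
    return flat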
def build_loss(self, trajectories):
    no_samples = 2
    self.reward_model = real_env_pendulum_reward()  # Use the true reward model.
    costs = []
    for i in range(len(trajectories)):
        # Draw standard-normal noise, one draw per sample particle.
        samples_standard_normal = tf.random_normal(
            shape=([self.batch_size] + trajectories[i].shape.as_list()[1:-1] +
                   [no_samples]),
            dtype=tf.float64)
        #samples_standard_normal = tf.random_normal(shape=tf.shape(tf.placeholder(shape=(trajectories[i].shape.as_list()[:-1] + [no_samples]), dtype=tf.float64)), dtype=tf.float64)

        # Reparameterised sampling: mean + sqrt(variance) * noise, where
        # trajectories[i][..., 0] holds means and [..., 1] holds variances.
        samples = trajectories[i][..., 0:1] + tf.sqrt(
            trajectories[i][..., 1:2]) * samples_standard_normal

        # Move the sample axis ahead of the state axis and flatten the
        # particles into the batch: [B, D, S] -> [B * S, D].
        samples_transposed = tf.transpose(samples, perm=[0, 2, 1])
        samples_transposed_reshaped = tf.reshape(samples_transposed,
                                                 shape=[-1, self.state_dim])

        rewards = (self.discount_factor ** i) * self.reward_model.build(
            samples_transposed_reshaped,
            self.build_policy(samples_transposed_reshaped))
        rewards_reshaped = tf.reshape(rewards, shape=[-1, no_samples, 1])
        # Average the negated rewards over the sample particles.
        costs.append(-tf.reduce_mean(tf.squeeze(rewards_reshaped, axis=-1),
                                     axis=-1))

    # Sum costs over time steps, then average over the batch.
    loss = tf.reduce_mean(tf.reduce_sum(tf.stack(costs, axis=-1), axis=-1))
    return loss
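# A hedged NumPy sketch of the reparameterised sampling used in build_loss():
# given per-state means and variances, draw no_samples Gaussian samples as
# mean + sqrt(var) * eps. All names and sizes here are hypothetical.
def _sketch_reparameterised_sampling():
    import numpy as np
    batch, state_dim, no_samples = 4, 3, 2
    mean = np.zeros([batch, state_dim, 1])
    var = np.ones([batch, state_dim, 1])
    eps = np.random.randn(batch, state_dim, no_samples)
    samples = mean + np.sqrt(var) * eps  # [batch, state_dim, no_samples]
    # Move the sample axis ahead of the state axis, then flatten the
    # particles into the batch, mirroring the TF code above.
    flat = samples.transpose(0, 2, 1).reshape(-1, state_dim)
    assert flat.shape == (batch * no_samples, state_dim)
    return flat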
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default='MountainCarContinuous-v0')
    parser.add_argument("--data-size", type=int, default=10000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--iterations", type=int, default=5000)
    #parser.add_argument("--goal-position", type=float, default=-.4)
    args = parser.parse_args()
    print(args)

    env = gym.make(args.env)

    # Network trained to imitate the true reward function.
    ann = ANN(env.observation_space.shape[0] + env.action_space.shape[0], 1,
              train_weights=True)
    reward_function = real_env_pendulum_reward()
    #state_function = real_env_pendulum_state()

    # Sample pendulum states uniformly in (angle, angular velocity) and
    # represent the angle as (cos, sin).
    high = np.array([np.pi, 1.])
    states = np.random.uniform(low=-high, high=high,
                               size=[args.data_size, len(high)])
    states = np.stack(
        [np.cos(states[:, 0]), np.sin(states[:, 0]), states[:, 1]], axis=-1)
    actions = np.random.uniform(env.action_space.low, env.action_space.high,
                                size=[args.data_size, env.action_space.shape[0]])
    rewards = reward_function.step_np(states, actions)

    #saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for it in range(args.iterations):
            for i in range(0, args.data_size, args.batch_size):
                inputs = np.concatenate([states[i:i + args.batch_size, ...],
                                         actions[i:i + args.batch_size, ...]],
                                        axis=-1)
                targets = rewards[i:i + args.batch_size, ...]
                loss, _ = sess.run([ann.loss, ann.opt],
                                   feed_dict={ann.inputs: inputs,
                                              ann.targets: targets})
                if it % 1000 == 0:
                    print('iterations:', it, 'i:', i, 'loss:', loss)
        #print(sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        #saver.save(sess, './weights/pendulum_reward.ckpt')
        pickle.dump(
            sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)),
            open('./weights/pendulum_reward.p', 'wb'))
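# A hypothetical counterpart to the pickle.dump() in main(): reloading the
# saved trainable variables into a live session. This is a sketch, assuming
# the variable ordering of tf.get_collection matches the ordering at save
# time; it is not part of the original code.
def _sketch_load_pendulum_reward_weights(sess):
    import pickle
    values = pickle.load(open('./weights/pendulum_reward.p', 'rb'))
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    for var, val in zip(variables, values):
        sess.run(var.assign(val))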
def __init__(self, env, state_dim, action_dim, action_space_high,
             action_space_low, batch_size, unroll_steps, discount_factor):
    #self.X = np.linspace(-2., 2., self.batch_size)
    #self.y = np.sin(self.X) + 5e-5 * np.random.randn(self.batch_size)
    #self.Xin = np.concatenate([self.X[..., np.newaxis], np.ones([self.batch_size, 1])], axis=-1)
    assert len(action_space_low.shape) == 1
    np.testing.assert_equal(-action_space_low, action_space_high)
    self.action_space_high = action_space_high

    self.env = env
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.hidden_dim = 32

    # Policy network parameters; h2 and o each carry an extra bias row,
    # initialised uniformly in [-3e-3, 3e-3].
    self.h1 = np.random.normal(size=[self.state_dim, self.hidden_dim])
    self.h2 = np.concatenate([
        np.random.normal(size=[self.hidden_dim, self.hidden_dim]),
        np.random.uniform(-3e-3, 3e-3, size=[1, self.hidden_dim])
    ], axis=0)
    self.o = np.concatenate([
        np.random.normal(size=[self.hidden_dim, self.action_dim]),
        np.random.uniform(-3e-3, 3e-3, size=[1, self.action_dim])
    ], axis=0)
    # Flat parameter vector holding all policy weights.
    self.thetas = np.concatenate(
        [self.h1.flatten(), self.h2.flatten(), self.o.flatten()])

    self.uuid = str(uuid.uuid4())

    self.batch_size = batch_size
    self.unroll_steps = unroll_steps
    self.discount_factor = discount_factor

    if self.env == 'MountainCarContinuous-v0':
        self.reward_function = mountain_car_continuous_reward_function(
            goal_position=.45)
        self.state_function = mountain_car_continuous_state_function()
    elif self.env == 'Pendulum-v0':
        self.reward_function = real_env_pendulum_reward()
        self.state_function = real_env_pendulum_state()

    self.it = 0
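# A hedged sketch of how a flat parameter vector like self.thetas above could
# be unpacked back into the three weight blocks (h1, h2, o). The sizes mirror
# the constructor; the function itself is illustrative, not part of the
# original code.
def _sketch_unpack_thetas(thetas, state_dim, hidden_dim, action_dim):
    s1 = state_dim * hidden_dim
    s2 = (hidden_dim + 1) * hidden_dim  # hidden weights plus a bias row
    h1 = thetas[:s1].reshape(state_dim, hidden_dim)
    h2 = thetas[s1:s1 + s2].reshape(hidden_dim + 1, hidden_dim)
    o = thetas[s1 + s2:].reshape(hidden_dim + 1, action_dim)
    return h1, h2, o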
def __init__(self, state_dim, action_dim, action_bound_high,
             action_bound_low, unroll_length, discount_factor,
             gradient_descent_steps, scope):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_bound_high = action_bound_high
    self.action_bound_low = action_bound_low
    self.unroll_length = unroll_length
    self.discount_factor = discount_factor
    self.gradient_descent_steps = gradient_descent_steps
    self.scope = scope

    # Make sure the bounds are symmetric (this assumption can be relaxed later).
    np.testing.assert_array_equal(-self.action_bound_low,
                                  self.action_bound_high)

    # Flags
    self.policy_reuse_vars = None

    # Build the computational graph (i.e., unroll the policy).
    self.states = tf.placeholder(shape=[None, self.state_dim],
                                 dtype=tf.float32)
    self.policy = self.build_policy(self.states)

    # True environment models.
    self.state_model = real_env_pendulum_state()
    self.reward_model = real_env_pendulum_reward()

    self.action = self.build_policy(self.states)

    # Unroll the policy through the true models, accumulating discounted rewards.
    state = self.states
    action = self.build_policy(state)
    rewards = []
    for i in range(self.unroll_length):
        reward = pow(self.discount_factor, i) * self.reward_model.build(
            state, action)
        rewards.append(reward)
        state = self.state_model.build(state, action)
        action = self.build_policy(state)

    rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
    self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
    self.opt = tf.train.AdamOptimizer().minimize(self.loss)
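# A compact NumPy sketch of the discounted sum the unrolling loop above
# builds symbolically: sum_i gamma^i * r_i. Purely illustrative; the reward
# values passed in are hypothetical.
def _sketch_discounted_return(rewards, discount_factor):
    import numpy as np
    discounts = discount_factor ** np.arange(len(rewards))
    return float(np.sum(discounts * np.asarray(rewards)))

# Example: _sketch_discounted_return([1., 1., 1.], .9) == 1. + .9 + .81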
def __init__(self, environment, state_size, action_size, hidden_size,
             it_tloop, it_dyn, bs_dyn, it_policy, bs_policy, K, T,
             action_bound_high, action_bound_low, discount_factor,
             moment_matching=True, scope='pai'):
    self.environment = environment
    self.state_size = state_size
    self.action_size = action_size
    self.hidden_size = hidden_size

    self.it_tloop = it_tloop
    self.it_dyn = it_dyn
    self.bs_dyn = bs_dyn
    self.it_policy = it_policy
    self.bs_policy = bs_policy

    self.K = K  # Number of particles
    assert self.bs_policy == self.K  # Does this have to be true?
    self.T = T  # Time horizon

    self.action_bound_high = action_bound_high
    self.action_bound_low = action_bound_low
    self.discount_factor = discount_factor
    self.moment_matching = moment_matching
    self.scope = scope
    self.policy_reuse_vars = None

    # Assertion: action bounds must be symmetric.
    np.testing.assert_array_equal(-self.action_bound_low,
                                  self.action_bound_high)

    # Initialize the Bayesian neural network that models the dynamics.
    self.bnn = bayesian_dynamics_model(self.state_size + self.action_size,
                                       self.state_size)
    self.bnn.initialize_inference(n_iter=self.it_tloop * self.it_dyn * 300,
                                  n_samples=10)

    # Declare variables and assignment operators for each sampled weight set W_k.
    self.assign_op = []
    for k in range(K):
        self.declare_vars_and_assign_op(scope='W_' + str(k) + '_')

    # True reward model
    self.reward_model = real_env_pendulum_reward()

    rewards = []

    # Predict x_t for t = 1, ..., T by propagating K particles.
    self.particles = tf.placeholder(shape=[self.K, self.state_size],
                                    dtype=tf.float32)
    self.action = self.build_policy(self.particles)
    particles = self.particles
    for t in range(T):
        actions = self.build_policy(particles)
        rewards.append((self.discount_factor ** t) *
                       self.reward_model.build(particles, actions))
        states_actions = tf.concat([particles, actions], axis=-1)

        # Push each particle through its own sampled dynamics model W_k.
        next_states = []
        for k in range(K):
            W_k = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    'W_' + str(k) + '_')
            next_state = self.bnn.build(
                *([tf.expand_dims(states_actions[k, :], axis=0)] + W_k))
            next_states.append(next_state)
        next_states = tf.concat(next_states, axis=0)

        # Perform moment matching: fit a Gaussian to the particle cloud
        # and resample K particles from it.
        mu, cov = self.mu_and_cov(next_states)
        cov = cov + 5e-5 * np.eye(self.state_size)  # To prevent a singular matrix
        particles = tfd.MultivariateNormalFullCovariance(
            loc=mu, covariance_matrix=cov).sample(self.K)

    # Maximize cumulative rewards over the horizon T.
    rewards = tf.reduce_sum(tf.stack(rewards, axis=-1), axis=-1)
    self.loss = -tf.reduce_mean(tf.reduce_sum(rewards, axis=-1))
    self.opt = tf.train.AdamOptimizer().minimize(self.loss)
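# A hedged NumPy sketch of the moment-matching step above: fit a single
# Gaussian to the K particle predictions via their empirical mean and
# covariance. The original uses self.mu_and_cov, whose internals are not
# shown here, so this is only an assumed equivalent.
def _sketch_moment_matching(particles):
    import numpy as np
    particles = np.asarray(particles)               # [K, state_size]
    mu = particles.mean(axis=0)                     # [state_size]
    centered = particles - mu
    cov = centered.T.dot(centered) / (len(particles) - 1.)  # [state_size, state_size]
    return mu, cov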
def unroll2(self, states):
    assert states.shape.as_list() == [None, self.state_dim]
    self.reward_model = real_env_pendulum_reward()  # Use the true reward model.

    trajectories = [
        tf.tile(tf.expand_dims(states, axis=1), [1, self.no_samples, 1])
    ]
    costs = []

    # Action
    self.actions = self.build_policy(states)

    # Posterior predictive distributions: one (mean, variance) pair per
    # state dimension, stacked along axis 1.
    rewards = self.reward_model.build(states, self.actions)
    costs.append(-rewards)
    states_actions = tf.concat([states, self.actions], axis=-1)
    ppd = tf.stack([
        self.model[i].posterior_predictive_distribution(states_actions, i)
        for i in range(len(self.model))
    ], axis=1)
    particles = tfd.MultivariateNormalDiag(
        loc=ppd[..., 0],
        scale_diag=tf.sqrt(ppd[..., 1])).sample(self.no_samples)
    #particles = self.get_next_states(states_actions)  # For testing purposes!!

    for unroll_step in range(self.unroll_steps - 1):
        print('unrolling step:', unroll_step)
        particles_transposed = tf.transpose(particles, perm=[1, 0, 2])
        trajectories.append(particles_transposed)
        particles_transposed_flattened = tf.reshape(
            particles_transposed, shape=[-1, self.state_dim])

        actions = self.build_policy(particles_transposed_flattened)
        rewards = self.reward_model.build(particles_transposed_flattened,
                                          actions)
        rewards = tf.reshape(rewards, shape=[-1, self.no_samples, 1])
        # Average the discounted rewards over the particles.
        rewards = tf.reduce_mean(
            pow(self.discount_factor, unroll_step + 1) * rewards, axis=1)
        costs.append(-rewards)

        states_actions = tf.concat([particles_transposed_flattened, actions],
                                   axis=-1)
        ppd = tf.stack([
            self.model[i].posterior_predictive_distribution(states_actions, i)
            for i in range(len(self.model))
        ], axis=1)
        ppd = tf.reshape(ppd,
                         shape=[-1, self.no_samples, self.state_dim, 2])

        # Resample particles. Note that the multinomial counts are drawn
        # once, at graph-construction time, not per session run.
        random_selections = np.random.multinomial(
            self.no_samples, [1. / self.no_samples] * self.no_samples)
        particles = []
        for i in range(len(random_selections)):
            if random_selections[i] > 0:
                particles.append(
                    tfd.MultivariateNormalDiag(
                        loc=ppd[:, i, :, 0],
                        scale_diag=tf.sqrt(ppd[:, i, :, 1])).sample(
                            random_selections[i]))
        particles = tf.concat(particles, axis=0)
        #particles = self.get_next_states(tf.reshape(states_actions, shape=[-1, self.no_samples, self.state_dim + self.action_dim])[:, 0, :])  # For testing purposes!!

    particles_transposed = tf.transpose(particles, perm=[1, 0, 2])
    trajectories.append(particles_transposed)

    # Sum costs over time steps, then average over the batch.
    costs = tf.reduce_sum(tf.concat(costs, axis=-1), axis=-1)
    loss = tf.reduce_mean(costs)
    return trajectories, loss
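# A NumPy sketch of the uniform multinomial resampling used in unroll2():
# each of the no_samples posterior components is selected with equal
# probability, and the counts decide how many samples each component
# contributes. Illustrative only; the default count is hypothetical.
def _sketch_multinomial_resampling(no_samples=5):
    import numpy as np
    counts = np.random.multinomial(no_samples,
                                   [1. / no_samples] * no_samples)
    assert counts.sum() == no_samples  # counts partition the particle budget
    return counts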