class GYMMB_Pendulum(Env):
    metadata = PendulumEnv.metadata

    def __init__(self):
        self.env = PendulumEnv()
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self):
        return self.env.reset()

    def step(self, action):
        ob, _, _, info = self.env.step(action)
        return ob, None, None, info

    def seed(self, seed=None):
        return self.env.seed(seed)

    def render(self, mode='human'):
        return self.env.render(mode)

    def close(self):
        return self.env.close()

    @staticmethod
    def tasks():
        return dict(standard=StandardTask(), poplin=StandardTask())

    @staticmethod
    def is_done(states):
        bs = states.shape[0]
        return torch.zeros(size=[bs], dtype=torch.bool, device=states.device)  # Always False
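A minimal usage sketch of the wrapper above, assuming gym and torch are installed; note that the wrapped step returns None for reward and done, and the batched is_done check never flags termination, so reward and episode termination are left to the surrounding task code:

import torch

env = GYMMB_Pendulum()
obs = env.reset()
obs, _, _, info = env.step(env.action_space.sample())  # reward/done are deferred to the task

# is_done operates on a batch of states and always returns False
states = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)  # shape [1, obs_dim]
assert not GYMMB_Pendulum.is_done(states).any()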
def viz_pendulum_rollout(states, actions):
    assert states.shape[0] == actions.shape[0]
    eps = jp.finfo(float).eps
    gymenv = PendulumEnv()
    gymenv.reset()
    for t in range(states.shape[0]):
        gymenv.state = states[t] + jp.pi
        # array(0.0) is False-y which causes problems.
        gymenv.last_u = actions[t] + eps
        gymenv.render()
    gymenv.close()
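For context on the eps offset: a zero-dimensional NumPy zero is falsy, so in older gym versions the truthiness check on last_u inside PendulumEnv.render skips drawing the torque arrow for a zero action. Nudging the action by machine epsilon keeps it truthy without visibly changing the rendering; a quick check:

import numpy as np

# np.array(0.0) is falsy, so a zero torque would be treated as "no action";
# adding machine epsilon makes it truthy.
assert not bool(np.array(0.0))
assert bool(np.array(0.0) + np.finfo(float).eps)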
def test_vectorized_original_equality(self):
    venv = VectorizedPendulumEnv()
    state, action = self.state_action
    action = np.round(action)
    dim1, dim2 = self.dims
    venv.state = state
    vobs, vreward, vdone, _ = venv.step(action)

    env = PendulumEnv()
    for i in range(dim1):
        for j in range(dim2):
            env.state = state[i, j]
            obs, reward, done, _ = env.step(action[i, j])
            np.testing.assert_allclose(obs, vobs[i, j])
            np.testing.assert_allclose(reward, vreward[i, j])
            np.testing.assert_allclose(done, vdone[i, j])
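The test relies on fixtures defined elsewhere in the test class. A purely hypothetical sketch of what self.state_action and self.dims could look like, assuming the vectorized env holds a grid of independent pendulum states (2-dimensional state, 1-dimensional torque per cell); the shapes and ranges here are illustrative assumptions, not the project's actual fixture:

import numpy as np

dims = (4, 3)  # hypothetical batch shape
rng = np.random.default_rng(0)
state = rng.uniform(low=[-np.pi, -1.0], high=[np.pi, 1.0], size=dims + (2,))  # (theta, theta_dot)
action = rng.uniform(low=-2.0, high=2.0, size=dims + (1,))                    # torque
state_action = (state, action)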
def record_pendulum_rollout(filepath, states, actions):
    assert states.shape[0] == actions.shape[0]
    eps = jnp.finfo(float).eps
    gymenv = PendulumEnv()
    gymenv.reset()
    video = VideoRecorder(gymenv, path=filepath)
    for t in range(states.shape[0]):
        gymenv.state = states[t] + jnp.pi
        # array(0.0) is False-y which causes problems.
        gymenv.last_u = actions[t] + eps
        # gymenv.step()
        gymenv.render()
        video.capture_frame()
    video.close()
def __init__(self):
    self.env = PendulumEnv()
    self.action_space = self.env.action_space
    self.observation_space = self.env.observation_space
from gym.envs.classic_control import PendulumEnv

from rl_trainer.agent import RandomAgent
from rl_trainer.experiment import Experiment

env = PendulumEnv()
experiment = Experiment(
    agent=RandomAgent(action_space=env.action_space),
    env=env,
    num_episodes=5,
)
experiment.run(seed=0)
        state = env.state
        action = choose_action(state, rollout, horizon)
        next_state, reward, done, info = env.step(action)
        episode_reward += reward
        # print('episode %d ends with reward %d' % (episode, episode_reward))
        episode_reward_list.append(episode_reward)
    plt.plot(episode_reward_list, label='rollout=%d horizon=%d' % (rollout, horizon))


if __name__ == '__main__':
    seed = 777777
    max_episodes = 50
    max_episode_steps = 200
    env = gym.make('Pendulum-v0').unwrapped
    sim_env = PendulumEnv()  # additional model
    sim_env.reset()
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    render = True
    # customize the rollout number and horizon number
    rollout_list = [50]
    horizon_list = [10]
    start()
    env.close()
    sim_env.close()
    plt.legend()
    plt.grid()
def main():
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--job-dir', required=True)
    parser.add_argument('--seed', default=42, type=int)
    args = parser.parse_args()
    print(args)

    # create the hyperparameters
    params = Params()
    print(params)

    # enable TF Eager
    tf.enable_eager_execution()

    # create the environment
    env = PendulumEnv()

    # set random seeds for reproducibility and
    # easier comparisons between experiments
    env.seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create a rollout class, used to sample data from the environment
    rollout = Rollout(env, max_episode_steps=params.max_episode_steps)

    # sample training and evaluation rollouts from the environment
    # using a random policy
    (states_train, actions_train, rewards_train,
     next_states_train, weights_train) = rollout(
        lambda state: env.action_space.sample(),
        episodes=params.episodes_train)
    (states_eval, actions_eval, rewards_eval,
     next_states_eval, weights_eval) = rollout(
        lambda state: env.action_space.sample(),
        episodes=params.episodes_eval)

    # compute deltas between the next state and the current state
    # to use as targets
    deltas_train = next_states_train - states_train
    deltas_eval = next_states_eval - states_eval

    # create datasets for training and evaluation
    dataset_train = create_dataset(
        (states_train, actions_train, deltas_train, weights_train),
        batch_size=params.batch_size,
        shuffle=True)
    dataset_eval = create_dataset(
        (states_eval, actions_eval, deltas_eval, weights_eval),
        batch_size=params.batch_size,
        shuffle=True)

    # create normalizers for the features and targets
    state_normalizer = Normalizer(loc=states_train.mean(axis=(0, 1)),
                                  scale=states_train.std(axis=(0, 1)))
    delta_normalizer = Normalizer(loc=deltas_train.mean(axis=(0, 1)),
                                  scale=deltas_train.std(axis=(0, 1)))
    action_normalizer = Normalizer(loc=actions_train.mean(axis=(0, 1)),
                                   scale=actions_train.std(axis=(0, 1)))

    # create a forward model
    model = ForwardModel(output_units=env.observation_space.shape[-1])

    # create an Adam optimizer, which is slightly easier to tune than momentum;
    # momentum typically provides better results when properly tuned
    optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)

    # create the global step
    global_step = tf.train.create_global_step()

    # create a checkpoint with all objects that have variables so it can be restored
    checkpoint = tf.train.Checkpoint(state_normalizer=state_normalizer,
                                     delta_normalizer=delta_normalizer,
                                     action_normalizer=action_normalizer,
                                     model=model,
                                     optimizer=optimizer,
                                     global_step=global_step)

    # restore a checkpoint if it exists
    checkpoint_path = tf.train.latest_checkpoint(args.job_dir)
    if checkpoint_path is not None:
        checkpoint.restore(checkpoint_path)

    # create a summary writer for TensorBoard
    summary_writer = tf.contrib.summary.create_file_writer(logdir=args.job_dir,
                                                           max_queue=1,
                                                           flush_millis=1000)
    summary_writer.set_as_default()

    # iterate for some number of epochs over the datasets
    for epoch in range(params.epochs):

        # loop over the training dataset
        for states, actions, deltas, weights in dataset_train:
            # normalize features and targets
            states_norm = state_normalizer(states)
            deltas_norm = delta_normalizer(deltas)
            actions_norm = action_normalizer(actions)

            # compute a forward pass and loss inside a gradient tape so
            # the trainable variables are watched for gradient computation
            with tf.GradientTape() as tape:
                # compute a forward pass ensuring the RNN state is reset
                deltas_norm_pred = model(states_norm, actions_norm,
                                         training=True,
                                         reset_state=True)

                # compute the training loss
                # - use mean squared error for most regression problems
                # - optionally: use a Huber loss if there are lots of outliers
                #   due to noise that cannot be filtered for some reason
                # - be sure to weight the loss so empty steps are not included
                loss = tf.losses.mean_squared_error(
                    predictions=deltas_norm_pred,
                    labels=deltas_norm,
                    weights=weights)

            # compute gradients
            grads = tape.gradient(loss, model.trainable_variables)

            # clip the gradients by their global norm;
            # returns the gradients and the global norm before clipping
            grads, grad_norm = tf.clip_by_global_norm(grads, params.grad_clipping)

            # update the model
            grads_and_vars = zip(grads, model.trainable_variables)
            optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # compute the clipped gradient norm for summaries
            grad_norm_clip = tf.global_norm(grads)

            # log training summaries, including clipped and unclipped grad norm
            with tf.contrib.summary.always_record_summaries():
                tf.contrib.summary.scalar('loss/train', loss)
                tf.contrib.summary.scalar('grad_norm', grad_norm)
                tf.contrib.summary.scalar('grad_norm/clip', grad_norm_clip)

        # loop over the evaluation dataset
        for states, actions, deltas, weights in dataset_eval:
            # normalize features and targets
            states_norm = state_normalizer(states)
            deltas_norm = delta_normalizer(deltas)
            actions_norm = action_normalizer(actions)

            # compute a forward pass ensuring the RNN state is reset
            deltas_norm_pred = model(states_norm, actions_norm,
                                     training=False,
                                     reset_state=True)

            # compute the evaluation loss
            loss = tf.losses.mean_squared_error(predictions=deltas_norm_pred,
                                                labels=deltas_norm,
                                                weights=weights)

            # log evaluation summaries
            with tf.contrib.summary.always_record_summaries():
                tf.contrib.summary.scalar('loss/eval', loss)

    # save a checkpoint after training
    checkpoint_path = os.path.join(args.job_dir, 'ckpt')
    checkpoint.save(checkpoint_path)
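The script trains the model to predict state deltas but does not show how those predictions would be turned back into next states. A rough, hypothetical sketch of that step, assuming the project's Normalizer keeps the loc and scale it was constructed with as attributes (the helper name predict_next_state is made up here for illustration):

def predict_next_state(model, state_normalizer, action_normalizer, delta_normalizer,
                       states, actions):
    # normalize inputs exactly as during training
    states_norm = state_normalizer(states)
    actions_norm = action_normalizer(actions)
    # predict normalized deltas with the recurrent state reset
    deltas_norm_pred = model(states_norm, actions_norm, training=False, reset_state=True)
    # undo the target normalization and integrate the delta onto the current state
    deltas_pred = deltas_norm_pred * delta_normalizer.scale + delta_normalizer.loc
    return states + deltas_pred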
import numpy as np

from gym.envs.classic_control import PendulumEnv
from gym.wrappers.monitoring.video_recorder import VideoRecorder

if __name__ == "__main__":
    T = 1000

    gymenv = PendulumEnv()
    gymenv.reset()  # Force the initialization.
    gymenv.state = [np.pi, 0]
    gymenv.last_u = None

    video = VideoRecorder(gymenv, path="openai_gym_constant_torque.mp4")
    for t in range(T):
        gymenv.step([-2])
        gymenv.render()
        video.capture_frame()
    gymenv.close()
    video.close()