Example #1
import torch
from gym import Env
from gym.envs.classic_control import PendulumEnv
# StandardTask is project-specific; its import is not shown in this snippet.


class GYMMB_Pendulum(Env):
    metadata = PendulumEnv.metadata

    def __init__(self):
        self.env = PendulumEnv()
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self):
        return self.env.reset()

    def step(self, action):
        ob, _, _, info = self.env.step(action)
        return ob, None, None, info

    def seed(self, seed=None):
        return self.env.seed(seed)

    def render(self, mode='human'):
        return self.env.render(mode)

    def close(self):
        return self.env.close()

    @staticmethod
    def tasks():
        return dict(standard=StandardTask(), poplin=StandardTask())

    @staticmethod
    def is_done(states):
        bs = states.shape[0]
        return torch.zeros(size=[bs], dtype=torch.bool, device=states.device)  # Always False
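
A minimal usage sketch for the wrapper above (hypothetical; it assumes the imports noted with the class are available):

import torch

env = GYMMB_Pendulum()
obs = env.reset()
obs, _, _, info = env.step(env.action_space.sample())  # reward and done are left to the caller

# is_done works on a batch of states and always returns False, so the
# wrapper itself never terminates an episode.
states = torch.zeros(8, env.observation_space.shape[0])
print(GYMMB_Pendulum.is_done(states))  # tensor([False, False, ...])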
Example #2
import jax.numpy as jp  # `jp` is presumably jax.numpy in the original project
from gym.envs.classic_control import PendulumEnv


def viz_pendulum_rollout(states, actions):
  assert states.shape[0] == actions.shape[0]

  eps = jp.finfo(float).eps

  gymenv = PendulumEnv()
  gymenv.reset()

  for t in range(states.shape[0]):
    gymenv.state = states[t] + jp.pi
    # array(0.0) is False-y which causes problems.
    gymenv.last_u = actions[t] + eps
    gymenv.render()

  gymenv.close()
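
A hypothetical call sketch, assuming `jp` is jax.numpy, `states` holds per-step [theta, theta_dot] pairs, and `actions` holds scalar torques (rendering requires a local display):

import jax.numpy as jp

states = jp.zeros((100, 2))   # hypothetical rollout of [theta, theta_dot]
actions = jp.zeros((100,))    # hypothetical torques
viz_pendulum_rollout(states, actions)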
Example #3
    def test_vectorized_original_equality(self):
        """Check that stepping VectorizedPendulumEnv matches gym's PendulumEnv element-wise."""
        venv = VectorizedPendulumEnv()
        state, action = self.state_action
        action = np.round(action)

        dim1, dim2 = self.dims

        venv.state = state
        vobs, vreward, vdone, _ = venv.step(action)

        env = PendulumEnv()
        for i in range(dim1):
            for j in range(dim2):
                env.state = state[i, j]
                obs, reward, done, _ = env.step(action[i, j])

                np.testing.assert_allclose(obs, vobs[i, j])
                np.testing.assert_allclose(reward, vreward[i, j])
                np.testing.assert_allclose(done, vdone[i, j])
Example #4
import jax.numpy as jnp  # assumed alias for jax.numpy
from gym.envs.classic_control import PendulumEnv
from gym.wrappers.monitoring.video_recorder import VideoRecorder


def record_pendulum_rollout(filepath, states, actions):
    assert states.shape[0] == actions.shape[0]

    eps = jnp.finfo(float).eps

    gymenv = PendulumEnv()
    gymenv.reset()
    video = VideoRecorder(gymenv, path=filepath)

    for t in range(states.shape[0]):
        gymenv.state = states[t] + jnp.pi
        # array(0.0) is False-y which causes problems.
        gymenv.last_u = actions[t] + eps
        # gymenv.step()
        gymenv.render()
        video.capture_frame()

    video.close()
Example #5
    def __init__(self):
        self.env = PendulumEnv()
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
Example #6
from gym.envs.classic_control import PendulumEnv

from rl_trainer.agent import RandomAgent
from rl_trainer.experiment import Experiment

env = PendulumEnv()
experiment = Experiment(
    agent=RandomAgent(action_space=env.action_space),
    env=env,
    num_episodes=5,
)

experiment.run(seed=0)
Example #7
                state = env.state
                action = choose_action(state, rollout, horizon)
                next_state, reward, done, info = env.step(action)
                episode_reward += reward
            # print('episode %d ends with reward %d' % (episode, episode_reward))
            episode_reward_list.append(episode_reward)
        plt.plot(episode_reward_list,
                 label='rollout=%d horizon=%d' % (rollout, horizon))


if __name__ == '__main__':
    seed = 777777
    max_episodes = 50
    max_episode_steps = 200
    env = gym.make('Pendulum-v0').unwrapped
    sim_env = PendulumEnv()  # additional model
    sim_env.reset()
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    render = True

    # customize the rollout counts and horizon lengths
    rollout_list = [50]
    horizon_list = [10]
    start()

    env.close()
    sim_env.close()
    plt.legend()
    plt.grid()
Example #8
import argparse
import os
import random

import numpy as np
import tensorflow as tf
from gym.envs.classic_control import PendulumEnv
# Params, Rollout, create_dataset, Normalizer and ForwardModel are
# project-specific helpers not shown in this snippet.


def main():
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--job-dir', required=True)
    parser.add_argument('--seed', default=42, type=int)
    args = parser.parse_args()
    print(args)

    # create the hyperparameters
    params = Params()
    print(params)

    # enable TF Eager
    tf.enable_eager_execution()

    # create the environment
    env = PendulumEnv()

    # set random seeds for reproducibility and
    # easier comparisons between experiments
    env.seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create a rollout class, used to sample data from the environment
    rollout = Rollout(env, max_episode_steps=params.max_episode_steps)

    # sample training and evaluation rollouts from the environment
    # using a random policy
    (states_train, actions_train, rewards_train, next_states_train,
     weights_train) = rollout(lambda state: env.action_space.sample(),
                              episodes=params.episodes_train)
    (states_eval, actions_eval, rewards_eval, next_states_eval,
     weights_eval) = rollout(lambda state: env.action_space.sample(),
                             episodes=params.episodes_eval)

    # compute deltas between the next state and the current state
    # to use as targets
    deltas_train = next_states_train - states_train
    deltas_eval = next_states_eval - states_eval

    # create datasets for training and evaluation
    dataset_train = create_dataset(
        (states_train, actions_train, deltas_train, weights_train),
        batch_size=params.batch_size,
        shuffle=True)
    dataset_eval = create_dataset(
        (states_eval, actions_eval, deltas_eval, weights_eval),
        batch_size=params.batch_size,
        shuffle=True)

    # create normalizers for the features and targets
    state_normalizer = Normalizer(loc=states_train.mean(axis=(0, 1)),
                                  scale=states_train.std(axis=(0, 1)))
    delta_normalizer = Normalizer(loc=deltas_train.mean(axis=(0, 1)),
                                  scale=deltas_train.std(axis=(0, 1)))
    action_normalizer = Normalizer(loc=actions_train.mean(axis=(0, 1)),
                                   scale=actions_train.std(axis=(0, 1)))

    # create a forward model
    model = ForwardModel(output_units=env.observation_space.shape[-1])

    # create an Adam optimizer, which is easier to tune than SGD with momentum;
    # momentum typically gives better results when carefully tuned
    optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)

    # create global step
    global_step = tf.train.create_global_step()

    # create a checkpoint of all objects that own variables so they can be restored later
    checkpoint = tf.train.Checkpoint(state_normalizer=state_normalizer,
                                     delta_normalizer=delta_normalizer,
                                     action_normalizer=action_normalizer,
                                     model=model,
                                     optimizer=optimizer,
                                     global_step=global_step)

    # restore a checkpoint if it exists
    checkpoint_path = tf.train.latest_checkpoint(args.job_dir)
    if checkpoint_path is not None:
        checkpoint.restore(checkpoint_path)

    # create a summary writer for TensorBoard
    summary_writer = tf.contrib.summary.create_file_writer(logdir=args.job_dir,
                                                           max_queue=1,
                                                           flush_millis=1000)
    summary_writer.set_as_default()

    # iterate for some number of epochs over the datasets
    for epoch in range(params.epochs):

        # loop over the training dataset
        for states, actions, deltas, weights in dataset_train:
            # normalize features and targets
            states_norm = state_normalizer(states)
            deltas_norm = delta_normalizer(deltas)
            actions_norm = action_normalizer(actions)

            # compute the forward pass and loss inside a gradient tape so that
            # the trainable variables are watched for gradient computation
            with tf.GradientTape() as tape:
                # compute a forward pass ensuring the RNN state is reset
                deltas_norm_pred = model(states_norm,
                                         actions_norm,
                                         training=True,
                                         reset_state=True)

                # compute the training loss
                # - use mean squared error for most regression problems
                # - optionally: use a Huber loss if there are lots of outliers
                # due to noise that cannot be filtered for some reason
                # - be sure to weight the loss so empty steps are not included
                loss = tf.losses.mean_squared_error(
                    predictions=deltas_norm_pred,
                    labels=deltas_norm,
                    weights=weights)

            # compute gradients
            grads = tape.gradient(loss, model.trainable_variables)

            # clip the gradients by their global norm
            # returns gradients and global norm before clipping
            grads, grad_norm = tf.clip_by_global_norm(grads,
                                                      params.grad_clipping)

            # update the model
            grads_and_vars = zip(grads, model.trainable_variables)
            optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # compute the clipped gradient norm for summaries
            grad_norm_clip = tf.global_norm(grads)

            # log training summaries, including clipped and unclipped grad norm
            with tf.contrib.summary.always_record_summaries():
                tf.contrib.summary.scalar('loss/train', loss)
                tf.contrib.summary.scalar('grad_norm', grad_norm)
                tf.contrib.summary.scalar('grad_norm/clip', grad_norm_clip)

        # loop over the evaluation dataset
        for states, actions, deltas, weights in dataset_eval:
            # normalize features and targets
            states_norm = state_normalizer(states)
            deltas_norm = delta_normalizer(deltas)
            actions_norm = action_normalizer(actions)

            # compute a forward pass ensuring the RNN state is reset
            deltas_norm_pred = model(states_norm,
                                     actions_norm,
                                     training=False,
                                     reset_state=True)

            # compute the evaluation loss
            loss = tf.losses.mean_squared_error(predictions=deltas_norm_pred,
                                                labels=deltas_norm,
                                                weights=weights)

            # log evaluation summaries
            with tf.contrib.summary.always_record_summaries():
                tf.contrib.summary.scalar('loss/eval', loss)

    # save a checkpoint after training
    checkpoint_path = os.path.join(args.job_dir, 'ckpt')
    checkpoint.save(checkpoint_path)
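
Params, Rollout, create_dataset, Normalizer, and ForwardModel above are project-specific helpers that the snippet does not define. As one illustration, a minimal Normalizer consistent with how it is constructed and called here might look like the following hypothetical sketch (plain (x - loc) / scale standardization; the real class may differ, e.g. in how it registers variables for checkpointing):

import tensorflow as tf


class Normalizer(tf.Module):
    """Hypothetical standardizer: (x - loc) / (scale + eps)."""

    def __init__(self, loc, scale, eps=1e-6):
        super().__init__()
        self.loc = tf.Variable(loc, trainable=False, dtype=tf.float32)
        self.scale = tf.Variable(scale, trainable=False, dtype=tf.float32)
        self.eps = eps

    def __call__(self, x):
        # standardize features/targets before feeding them to the model
        return (tf.cast(x, tf.float32) - self.loc) / (self.scale + self.eps)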
Example #9
import numpy as np
from gym.envs.classic_control import PendulumEnv
from gym.wrappers.monitoring.video_recorder import VideoRecorder

if __name__ == "__main__":
    T = 1000

    gymenv = PendulumEnv()
    gymenv.reset()
    # Override the random reset with a fixed initial state (theta = pi, i.e. hanging down, zero velocity).
    gymenv.state = [np.pi, 0]
    gymenv.last_u = None
    video = VideoRecorder(gymenv, path="openai_gym_constant_torque.mp4")

    for t in range(T):
        gymenv.step([-2])
        gymenv.render()
        video.capture_frame()

    gymenv.close()
    video.close()