Example #1
def create_er_from_episodes(discrete, max_len, num_steps, num_episodes,
                            episode_length):
    """Rollout an environment and return an Experience Replay Buffer."""

    if discrete:
        env = GymEnvironment("NChain-v0")
        transformations = []
    else:
        env = GymEnvironment("Pendulum-v0")
        transformations = [
            MeanFunction(lambda state_, action_: state_),
            StateNormalizer(),
            ActionNormalizer(),
            RewardClipper(),
        ]

    memory = ExperienceReplay(max_len,
                              transformations=transformations,
                              num_steps=num_steps)

    for _ in range(num_episodes):
        state = env.reset()
        for _ in range(episode_length):
            action = env.action_space.sample()  # sample a random action.
            observation, state, done, info = step_env(env,
                                                      state,
                                                      action,
                                                      action_scale=1.0)
            memory.append(observation)
        memory.end_episode()

    return memory
Example #2
def rollout_agent(environment, base_agent, extender, num_steps, td_k=True):
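    """Train and evaluate an extender-wrapped agent, then delete its log directory."""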
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    environment = GymEnvironment(environment, SEED)
    agent = extender.default(
        environment,
        base_agent_name=base_agent,
        num_steps=num_steps,
        num_samples=2,
        num_iter=2,
        num_epochs=2,
        td_k=td_k,
    )
    train_agent(
        agent,
        environment,
        num_episodes=NUM_EPISODES,
        max_steps=MAX_STEPS,
        plot_flag=False,
    )
    evaluate_agent(agent,
                   environment,
                   num_episodes=NUM_EPISODES,
                   max_steps=MAX_STEPS,
                   render=False)
    agent.logger.delete_directory()  # Cleanup directory.
Example #3
def init_experiment(args, **kwargs):
    """Initialize experiment."""
    arg_dict = vars(args)
    arg_dict.update(kwargs)
    arg_dict.update(parse_config_file(args.agent_config))
    arg_dict = {k: v for k, v in arg_dict.items() if v is not None}

    env_config = parse_config_file(args.env_config)
    args.max_steps = env_config.get("max_steps", 1000)
    # %% Set Random seeds.
    set_random_seed(args.seed)

    # %% Initialize environment.
    if env_config["name"] in gym_envs:
        environment = GymEnvironment(env_config["name"], seed=args.seed)
    else:
        env_name, env_task = env_config["name"].split("/")
        environment = DMSuiteEnvironment(env_name, env_task, seed=args.seed)

    # %% Initialize module.
    agent_module = importlib.import_module("rllib.agent")
    agent = getattr(agent_module, f"{args.agent}Agent").default(
        environment,
        reward_transformer=RewardTransformer(
            scale=arg_dict.get("reward_scale", 1.0)),
        **arg_dict,
    )
    agent.logger.save_hparams(arg_dict)

    return agent, environment
Example #4
def test_policies(environment, policy, batch_size):
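    """Train and evaluate a SARSA agent built from the given policy and batch size."""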
    environment = GymEnvironment(environment, SEED)

    critic = NNQFunction(
        dim_state=environment.dim_observation,
        dim_action=environment.dim_action,
        num_states=environment.num_states,
        num_actions=environment.num_actions,
        layers=LAYERS,
        tau=TARGET_UPDATE_TAU,
    )

    policy = policy(critic, 0.1)

    optimizer = torch.optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    criterion = torch.nn.MSELoss
    agent = SARSAAgent(
        critic=critic,
        policy=policy,
        criterion=criterion,
        optimizer=optimizer,
        batch_size=batch_size,
        target_update_frequency=TARGET_UPDATE_FREQUENCY,
        gamma=GAMMA,
    )
    train_agent(
        agent,
        environment,
        num_episodes=NUM_EPISODES,
        max_steps=MAX_STEPS,
        plot_flag=False,
    )
    evaluate_agent(agent, environment, 1, MAX_STEPS, render=False)
    agent.logger.delete_directory()  # Cleanup directory.
Example #5
def get_env(self, gym_env, **kwargs):
    """Get environment."""
    if gym_env:
        env = GymEnvironment(f"{self.name}-v0", **kwargs)
    else:
        module = importlib.import_module("rllib.environment.mdps")
        env = getattr(module, self.name)(**kwargs)
    return env
Example #6
def main(args):
    """Run experiment."""
    set_random_seed(args.seed)
    env_config = parse_config_file(args.env_config_file)

    environment = GymEnvironment(
        env_config["name"], ctrl_cost_weight=env_config["action_cost"], seed=args.seed
    )
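    # Reward model exposed by the underlying (unwrapped) gym environment.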
    reward_model = environment.env.reward_model()
    if args.exploration == "optimistic":
        dynamical_model = HallucinatedModel.default(environment, beta=args.beta)
        environment.add_wrapper(HallucinationWrapper)
    else:
        dynamical_model = TransformedModel.default(environment)
    kwargs = parse_config_file(args.agent_config_file)

    agent = getattr(
        importlib.import_module("rllib.agent"), f"{args.agent}Agent"
    ).default(
        environment=environment,
        dynamical_model=dynamical_model,
        reward_model=reward_model,
        thompson_sampling=args.exploration == "thompson",
        **kwargs,
    )
    train_agent(
        agent=agent,
        environment=environment,
        max_steps=env_config["max_steps"],
        num_episodes=args.train_episodes,
        render=args.render,
        print_frequency=1,
    )

    evaluate_agent(
        agent=agent,
        environment=environment,
        max_steps=env_config["max_steps"],
        num_episodes=args.test_episodes,
    )
Example #7
def rollout_agent(environment, agent):
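    """Train and evaluate the agent's default instantiation, then delete its log directory."""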
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    environment = GymEnvironment(environment, SEED)
    agent = agent.default(environment, num_iter=2, num_epochs=2)
    train_agent(
        agent,
        environment,
        num_episodes=NUM_EPISODES,
        max_steps=MAX_STEPS,
        plot_flag=False,
    )
    evaluate_agent(agent, environment, 1, MAX_STEPS, render=False)
    agent.logger.delete_directory()  # Cleanup directory.
Example #8
def test_reward(environment, action_cost, action_type):
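    """Check that the reward model reproduces the environment reward for numpy and torch inputs, single and batched."""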
    env_name, reward_model_ = environment
    if action_cost is not None:
        env = GymEnvironment(env_name, action_cost=action_cost)
    else:
        env = GymEnvironment(env_name)
    state = env.reset()
    if action_cost is not None:
        reward_model = reward_model_(action_cost=action_cost)
    else:
        reward_model = reward_model_()
    reward_model.set_goal(env.goal)
    for _ in range(50):
        if action_type == "random":
            action = env.action_space.sample()
        elif action_type == "zero":
            action = np.zeros(env.dim_action)
        else:
            raise NotImplementedError

        next_state, reward, done, info = env.step(action)
        if env.goal is not None:
            state = np.concatenate((state, env.goal))
        np.testing.assert_allclose(reward,
                                   reward_model(state, action, next_state)[0],
                                   rtol=1e-3,
                                   atol=1e-6)

        np.testing.assert_allclose(
            np.tile(reward, (5, )),
            reward_model(
                np.tile(state, (5, 1)),
                np.tile(action, (5, 1)),
                np.tile(next_state, (5, 1)),
            )[0],
            rtol=1e-3,
            atol=1e-6,
        )

        state = torch.tensor(state, dtype=torch.get_default_dtype())
        action = torch.tensor(action, dtype=torch.get_default_dtype())
        next_state = torch.tensor(next_state, dtype=torch.get_default_dtype())
        np.testing.assert_allclose(reward,
                                   reward_model(state, action, next_state)[0],
                                   rtol=1e-3,
                                   atol=1e-6)

        np.testing.assert_allclose(
            np.tile(reward, (5, 1)),
            reward_model(state.repeat(5, 1), action.repeat(5, 1),
                         next_state.repeat(5, 1))[0],
            rtol=1e-3,
            atol=1e-6,
        )

        state = next_state.numpy()
Example #9
    def test_set_state_np(self):
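        """Check that, after setting the state from numpy, the step observation matches env.state."""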
        env = GymEnvironment("VPendulum-v0")
        env.reset()

        state, action = self.state_action
        action = np.round(action)

        env.state = state
        obs, _, _, _ = env.step(action)
        state = env.state
        np.testing.assert_allclose(obs, state)
Example #10
def test_tolerance(action_cost):
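    """Check the sparse reward model against the MBRLReacher3D environment reward for numpy and torch inputs."""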
    env_name, reward_model_ = ("MBRLReacher3D-v0", ReacherReward)
    if action_cost is not None:
        env = GymEnvironment(env_name, action_cost=action_cost, sparse=True)
    else:
        env = GymEnvironment(env_name, sparse=True)
    state = env.reset()
    if action_cost is not None:
        reward_model = reward_model_(action_cost=action_cost, sparse=True)
    else:
        reward_model = reward_model_(sparse=True)
    reward_model.set_goal(env.goal)

    for _ in range(50):
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        if env.goal is not None:
            state = np.concatenate((state, env.goal))
        np.testing.assert_allclose(reward,
                                   reward_model(state, action, next_state)[0],
                                   rtol=1e-3,
                                   atol=1e-6)

        np.testing.assert_allclose(
            np.tile(reward, (5, )),
            reward_model(
                np.tile(state, (5, 1)),
                np.tile(action, (5, 1)),
                np.tile(next_state, (5, 1)),
            )[0],
            rtol=1e-3,
            atol=1e-6,
        )

        state = torch.tensor(state, dtype=torch.get_default_dtype())
        action = torch.tensor(action, dtype=torch.get_default_dtype())
        next_state = torch.tensor(next_state, dtype=torch.get_default_dtype())
        np.testing.assert_allclose(reward,
                                   reward_model(state, action, next_state)[0],
                                   rtol=1e-3,
                                   atol=1e-6)

        np.testing.assert_allclose(
            np.tile(reward, (5, 1)),
            reward_model(state.repeat(5, 1), action.repeat(5, 1),
                         next_state.repeat(5, 1))[0],
            rtol=1e-3,
            atol=1e-6,
        )

        state = next_state.numpy()
Example #11
"""Python Script Template."""
from rllib.environment import GymEnvironment
from rllib.util.utilities import set_random_seed
from qreps.environment.random_action_wrapper import RandomActionWrapper

import os

from exps.utilities import parse_arguments, run_experiment
from exps.environments.utilities import get_saddle_agents, get_benchmark_agents

args = parse_arguments()
args.env_name = "RiverSwim-v0"
args.lr = 0.01
args.eta = 5.0

set_random_seed(args.seed)
env = GymEnvironment(args.env_name, seed=args.seed)
env.add_wrapper(RandomActionWrapper, p=args.random_action_p)

agents = get_saddle_agents(env, **vars(args))
agents.update(get_benchmark_agents(env, **vars(args)))

df = run_experiment(agents, env, args)
df.to_pickle(f"river_swim_results_{args.seed}.pkl")

os.system("python river_swim_plot.py")
Example #12
                transitions[(state, action)].append({
                    "next_state": 0,
                    "reward": reward,
                    "probability": 1
                })  # Noisy transitions
        return transitions


if __name__ == "__main__":
    from rllib.environment import GymEnvironment
    from rllib.environment.utilities import transitions2kernelreward

    import qreps  # noqa: F401

    env = GymEnvironment("DeepSea-v0", side=5)
    kernel, reward = transitions2kernelreward(env.env.transitions,
                                              env.num_states, env.num_actions)
    state = env.reset()
    print(state)
    for i in range(10):
        action = env.action_space.sample()
        next_state, r, done, f = env.step(action)
        print(
            env.env._state_to_grid(state),
            env.env._state_to_grid(next_state),
            action,
            r,
            done,
        )
        state = next_state
Example #13
def discrete_environment(request):
    return GymEnvironment(request.param)
Example #14
def environment(request):
    return GymEnvironment(request.param)
Example #15
def continuous_action_environment(request):
    return GymEnvironment(request.param)
Example #16
import numpy as np
import torch.optim

from rllib.agent import DPGAgent, TD3Agent  # noqa: F401
from rllib.dataset import ExperienceReplay, PrioritizedExperienceReplay  # noqa: F401
from rllib.environment import GymEnvironment
from rllib.util.parameter_decay import ExponentialDecay
from rllib.util.training.agent_training import evaluate_agent, train_agent

ENVIRONMENT = ["MountainCarContinuous-v0", "Pendulum-v0"][0]
NUM_EPISODES = 25
MAX_STEPS = 2500
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.1
EPS_DECAY = 1e6
SEED = 0

torch.manual_seed(SEED)
np.random.seed(SEED)

environment = GymEnvironment(ENVIRONMENT, SEED)
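# Exploration noise that decays exponentially from EPS_START to EPS_END.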
noise = ExponentialDecay(EPS_START, EPS_END, EPS_DECAY)
agent = DPGAgent.default(environment, exploration_noise=noise, gamma=GAMMA)

train_agent(agent,
            environment,
            num_episodes=NUM_EPISODES,
            max_steps=MAX_STEPS,
            render=True)
evaluate_agent(agent, environment, 1, MAX_STEPS)
Example #17
                "probability": 0.5,
                "reward": reward
            })
        for j in range(8):
            for a in range(2):
                transitions[(3 + j, a)].append({
                    "next_state": 0,
                    "probability": 1.0,
                    "reward": 0
                })
        return transitions


if __name__ == "__main__":
    from rllib.environment import GymEnvironment
    from rllib.environment.utilities import transitions2kernelreward

    import qreps  # noqa: F401

    env = GymEnvironment("WideTree-v0", reward=1)
    kernel, reward = transitions2kernelreward(env.env.transitions,
                                              env.num_states, env.num_actions)
    print(kernel, reward)
    state = env.reset()
    print(state)
    for i in range(10):
        action = env.action_space.sample()
        next_state, r, done, f = env.step(action)
        print(state, action, next_state, r, done)
        state = next_state
Example #18
            "next_state": length,
            "probability": 0.9,
            "reward": 1
        })
        transitions[(length, 0)].append({
            "next_state": length - 1,
            "probability": 0.1,
            "reward": 0
        })

        return transitions


if __name__ == "__main__":
    from rllib.environment import GymEnvironment
    from rllib.environment.utilities import transitions2kernelreward

    import qreps  # noqa: F401

    env = GymEnvironment("RiverSwim-v0", length=5)
    kernel, reward = transitions2kernelreward(env.env.transitions,
                                              env.num_states, env.num_actions)
    print(kernel, reward)
    state = env.reset()
    print(state)
    for i in range(100):
        action = env.action_space.sample()
        next_state, r, done, f = env.step(action)
        print(state, action, next_state, r, done)
        state = next_state