def test_insert_demos(self):
        """
        Tests inserting into the demo memory.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)

        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(
            agent_config,
            state_space=env.state_space,
            action_space=env.action_space
        )
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        # Observe a single data point.
        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            actions=env.action_space.with_batch_rank().sample(1),
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )

        # Observe a batch of demos.
        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.sample(10),
            actions=env.action_space.sample(10),
            rewards=FloatBox().sample(10),
            terminals=terminals.sample(10),
            next_states=agent.preprocessed_state_space.sample(10)
        )

    def test_impala_on_cart_pole(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on CartPole-v0.
        """
        env_spec = dict(type="open-ai-gym",
                        gym_env="CartPole-v0",
                        seed=10,
                        visualize=self.is_windows)
        config_ = config_from_path("configs/impala_agent_for_cartpole.json")
        config_["environment_spec"] = env_spec
        dummy_env = OpenAIGymEnv.from_spec(env_spec)
        agent = IMPALAAgent.from_spec(config_,
                                      state_space=dummy_env.state_space,
                                      action_space=dummy_env.action_space,
                                      execution_spec=dict(seed=10))

        learn_updates = 300
        mean_returns = []
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            mean_returns.append(mean_return)
            print("i={}/{} Loss={:.4} Avg-reward={:.2}".format(
                i, learn_updates, float(ret[1]), mean_return))

        # Assume we have learned something: check the mean return over the last 100 updates.
        average_return_last_n_episodes = np.nanmean(mean_returns[-100:])
        print("Average return over last n episodes: {}".format(
            average_return_last_n_episodes))
        self.assertGreater(average_return_last_n_episodes, 30.0)

        time.sleep(3)
        agent.terminate()
        time.sleep(3)

    def test_update_online(self):
        """
        Tests if joint updates from demo and online memory work.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)
        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(
            agent_config,
            state_space=env.state_space,
            action_space=env.action_space
        )
        terminals = BoolBox(add_batch_rank=True)

        # Observe a batch of demos.
        agent.observe_demos(
            preprocessed_states=agent.preprocessed_state_space.sample(32),
            actions=env.action_space.sample(32),
            rewards=FloatBox().sample(32),
            terminals=terminals.sample(32),
            next_states=agent.preprocessed_state_space.sample(32)
        )

        # Observe a batch of online data.
        agent._observe_graph(
            preprocessed_states=agent.preprocessed_state_space.sample(32),
            actions=env.action_space.sample(32),
            rewards=FloatBox().sample(32),
            internals=[],
            terminals=terminals.sample(32),
            next_states=agent.preprocessed_state_space.sample(32)
        )
        # Call update.
        agent.update()

    def test_actor_critic_on_cart_pole(self):
        """
        Creates an Actor-critic and runs it via a Runner on the CartPole Env.
        """
        env_spec = dict(type="open-ai-gym",
                        gym_env="CartPole-v0",
                        visualize=False)  #self.is_windows)
        dummy_env = OpenAIGymEnv.from_spec(env_spec)
        agent = ActorCriticAgent.from_spec(
            config_from_path("configs/actor_critic_agent_for_cartpole.json"),
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space)

        time_steps = 20000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 20)
        self.assertGreaterEqual(results["max_episode_reward"], 100.0)
Example No. 5
    def test_pong_with_worker(self):
        env_spec = dict(
            type="openai",
            gym_env="PongNoFrameskip-v4",
            # The frameskip in the agent config will trigger worker skips, this
            # is used for internal env.
            frameskip=4,
            max_num_noops=30,
            episodic_life=False
        )

        env = OpenAIGymEnv.from_spec(env_spec)
        agent_config = config_from_path("configs/backend_performance_dqn_pong.json")

        # Test cpu settings for batching here.
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=env_spec,
            agent=agent,
            frameskip=1,
            preprocessing_spec=agent_config["preprocessing_spec"],
            worker_executes_preprocessing=True
        )

        result = worker.execute_timesteps(1000)
        print(result)

    def test_update_from_demos(self):
        """
        Tests the separate API method to update from demos.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)
        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=env.state_space,
                                    action_space=env.action_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)
        state_1 = agent.preprocessed_state_space.with_batch_rank().sample(1)
        action_1 = [1]
        state_2 = agent.preprocessed_state_space.with_batch_rank().sample(1)
        action_2 = [0]

        # Insert two states with fixed actions and a few random examples.
        for _ in range(10):
            # State with correct action
            agent.observe_demos(
                preprocessed_states=state_1,
                actions=action_1,
                rewards=rewards.sample(1),
                next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
                terminals=terminals.sample(1),
            )
            agent.observe_demos(
                preprocessed_states=state_2,
                actions=action_2,
                rewards=rewards.sample(1),
                next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
                terminals=terminals.sample(1),
            )

        # Update.
        agent.update_from_demos(num_updates=100, batch_size=8)

        # Test if fixed states and actions map.
        action = agent.get_action(states=state_1,
                                  apply_preprocessing=False,
                                  use_exploration=False)
        self.assertEqual(action, action_1)

        action = agent.get_action(states=state_2,
                                  apply_preprocessing=False,
                                  use_exploration=False)
        self.assertEqual(action, action_2)
Example No. 7
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps,
                                  **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(episode_returns), episode_return,
                         np.mean(episode_returns[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor-critic is very sensitive to hyper-parameters and might collapse after reaching
    # the maximum reward. In practice, it is recommended to stop training once a reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
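
# Each of these `main(argv)` scripts relies on module-level absl flag definitions
# that are not shown in this listing. A minimal sketch matching the usages above
# (FLAGS.config, FLAGS.env, FLAGS.visualize); the flag names come from the code,
# while defaults and help texts are assumptions. Other scripts below add further
# flags (e.g. `render`, `summary_regexp`, or an integer `visualize` flag with a
# -1 "unset" default):
import sys

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", "./agent_config.json", "Path to the agent's JSON config file.")
flags.DEFINE_string("env", "CartPole-v0", "OpenAI Gym environment id to run on.")
flags.DEFINE_boolean("visualize", False, "Whether to render the environment while training.")

if __name__ == "__main__":
    main(sys.argv)
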
Example No. 8
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    worker.execute_episodes(100, use_exploration=True)

    # use_exploration=True above is for training; for evaluation, run e.g.:
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    # Override the OpenAI Gym env via the command line.
    if FLAGS.env is None:
        env_spec = agent_config["environment_spec"]
    else:
        env_spec = dict(type="openai-gym", gym_env=FLAGS.env)
    # Override the number of visualized environments via the command line.
    if FLAGS.visualize != -1:
        env_spec["visualize"] = FLAGS.visualize

    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = Agent.from_spec(
        agent_config,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    dummy_env.terminate()

    learn_updates = 6000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(3)
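
# Both the IMPALA test above and this training loop call `_calc_mean_return(ret)`,
# which is not shown in this listing. A plausible sketch, assuming `agent.update()`
# returns a tuple whose element at index 3 carries flat "rewards" and "terminals"
# arrays for the sampled steps (that index and those field names are assumptions):
import numpy as np


def _calc_mean_return(records):
    rewards = np.asarray(records[3]["rewards"]).reshape(-1)
    terminals = np.asarray(records[3]["terminals"]).reshape(-1)
    # Accumulate rewards until each terminal flag to recover per-episode returns,
    # then average them; NaN if no episode finished within this batch.
    returns, current = [], 0.0
    for reward, terminal in zip(rewards, terminals):
        current += reward
        if terminal:
            returns.append(current)
            current = 0.0
    return np.mean(returns) if returns else np.nan
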
Example No. 10
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
Example No. 11
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:TestEnv-v0'
    })

    improvements = []
    for i in range(0, nChildren):
        ob_obs = envObs.reset()
        ob_obs = (ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
        # ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
        # action_list_obs = []

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs,
                                          time_percentage=time_percentage_obs)
            # action = np.random.randint(0, 4)
            # action = 3

            # action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
            # next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]

            # agent_obs.observe(ob_obs, action, None, reward, next_ob_obs, done)
            ob_obs = next_ob_obs

            if done:
                # print(envObs.gym_env.rewards)
                improvements.append(envObs.gym_env.rewards)

                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
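
# The element-wise `(obs - offset) / scale` normalization above is repeated for
# every observation. A small helper makes the intent explicit; the offset/scale
# vectors are copied from the code above, while the helper name and docstring
# are assumptions:
import numpy as np

OBS_OFFSET = np.array([-4.5, 0, -5, 0, 0, 1.5, 25, 0], dtype=np.float64)
OBS_SCALE = np.array([9, 1, 10, 1, 1, 3, 50, 1], dtype=np.float64)


def normalize_obs(obs):
    """Shift and rescale a raw TestEnv observation feature-wise."""
    return (np.asarray(obs, dtype=np.float64) - OBS_OFFSET) / OBS_SCALE
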
Example No. 12
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type":
        "openai",
        "gym_env":
        'gym_SmartPrimer:SmartPrimer-realistic-v2'
    })

    improvements = []
    for i in range(0, nChildren):
        ob_obs = envObs.reset()
        ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
        action_list_obs = []

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs,
                                          time_percentage=time_percentage_obs)
            # action = np.random.randint(0, 4)
            # action = 3

            action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]

            ob_obs = next_ob_obs

            if done:
                improvements.append(envObs.gym_env.info['improvementPerChild'])

                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
Example No. 14
    def test_with_final_eval(self):
        """
        Tests if apex can learn a simple environment using a single worker, thus replicating
        DQN.
        """
        env_spec = dict(type="openai", gym_env="CartPole-v0")
        agent_config = config_from_path("configs/apex_agent_cartpole.json")

        # Use n-step adjustments.
        agent_config["execution_spec"]["ray_spec"]["worker_spec"][
            "n_step_adjustment"] = 3
        agent_config["execution_spec"]["ray_spec"]["apex_replay_spec"][
            "n_step_adjustment"] = 3
        agent_config["n_step"] = 3

        executor = ApexExecutor(
            environment_spec=env_spec,
            agent_config=agent_config,
        )
        # Define executor, test assembly.
        print("Successfully created executor.")

        # Executes actual workload.
        result = executor.execute_workload(
            workload=dict(num_timesteps=20000,
                          report_interval=1000,
                          report_interval_min_seconds=1))
        print("Finished executing workload:")
        print(result)

        # Get agent.
        agent = executor.local_agent
        preprocessing_spec = agent_config["preprocessing_spec"]

        # Create env.
        env = OpenAIGymEnv.from_spec(env_spec)

        if preprocessing_spec is not None:
            preprocessing_spec = deepcopy(preprocessing_spec)
            in_space = env.state_space.with_batch_rank()
            in_space = deepcopy(in_space)
            # Collect the scope names of all preprocessors.
            scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
            # Set backend to python.
            for spec in preprocessing_spec:
                spec["backend"] = "python"
            processor_stack = PreprocessorStack(*preprocessing_spec,
                                                backend="python")
            build_space = in_space
            for sub_comp_scope in scopes:
                sub_comp = processor_stack.sub_components[sub_comp_scope]
                sub_comp.create_variables(
                    input_spaces=dict(preprocessing_inputs=build_space),
                    action_space=None
                )
                build_space = sub_comp.get_preprocessed_space(build_space)
            processor_stack.reset()
        else:
            processor_stack = None

        ep_rewards = []
        print("finished learning, starting eval")
        for _ in range(10):
            state = env.reset()
            terminal = False
            ep_reward = 0
            while not terminal:
                state = agent.state_space.force_batch(state)
                if processor_stack is not None:
                    state = processor_stack.preprocess(state)

                actions = agent.get_action(states=state,
                                           use_exploration=False,
                                           apply_preprocessing=False)
                next_state, step_reward, terminal, info = env.step(
                    actions=actions[0])
                ep_reward += step_reward

                state = next_state
                if terminal:
                    ep_rewards.append(ep_reward)
                    break

        print("Eval episode rewards:")
        print(ep_rewards)
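
# The evaluation code above treats agent_config["preprocessing_spec"] as a list of
# preprocessor dicts, indexing each entry by "scope" and overriding its "backend".
# For reference, a spec of that shape for an Atari-style config might look like the
# sketch below; only the "scope"/"type"/"backend" keys are implied by the code
# above, the concrete layers and their parameters are assumptions:
preprocessing_spec = [
    dict(type="grayscale", scope="grayscale"),
    dict(type="image_resize", scope="image_resize", width=84, height=84),
    dict(type="divide", scope="divide", divisor=255),
    dict(type="sequence", scope="sequence", sequence_length=4)
]
# The test then sets spec["backend"] = "python" on every entry and builds a
# PreprocessorStack(*preprocessing_spec, backend="python") from it.
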
Example No. 15
        ob, reward, done, Baseinfo = env.step(action)
        if done:
            agent.reset()
            break

np.random.seed(2)

agent_config_path = os.path.join(
    os.path.abspath(os.path.dirname(os.path.dirname(__file__))),
    'agents/ppoSmartPrimer_config.json')

with open(agent_config_path, 'rt') as fp:
    agent_config = json.load(fp)

env = OpenAIGymEnv.from_spec({
    "type":
    "openai",
    "gym_env":
    'gym_SmartPrimer:SmartPrimer-realistic-v2'
})

agent = Agent.from_spec(agent_config,
                        state_space=env.state_space,
                        action_space=env.action_space)

episode_returns = []


def episode_finished_callback(episode_return, duration, timesteps, *args,
                              **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
Example No. 16
import json
import os
import pickle
import argparse
import copy
import matplotlib.pyplot as plt

parser = argparse.ArgumentParser(description='example')
parser.add_argument('--seed', type=int, default=3, help='numpy seed ')
parser.add_argument('--time', type=int, default=5, help='numpy seed ')
args = parser.parse_args()

np.random.seed(args.seed)

env = OpenAIGymEnv.from_spec({
    "type": "openai",
    "gym_env": 'gym_SmartPrimer:TestEnv-v0'
})

# configure the agent settings in this file
agent_config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                 'agents/ppoSmartPrimer_config.json')

with open(agent_config_path, 'rt') as fp:
    agent_config = json.load(fp)

# Retrieve the agent from RLgraph.
agent = Agent.from_spec(agent_config,
                        state_space=env.state_space,
                        action_space=env.action_space)

# define number of children to simulate