    def test_impala_on_cart_pole(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on CartPole-v0.
        """
        env_spec = dict(type="open-ai-gym",
                        gym_env="CartPole-v0",
                        seed=10,
                        visualize=self.is_windows)
        config_ = config_from_path("configs/impala_agent_for_cartpole.json")
        config_["environment_spec"] = env_spec
        dummy_env = OpenAIGymEnv.from_spec(env_spec)
        agent = IMPALAAgent.from_spec(config_,
                                      state_space=dummy_env.state_space,
                                      action_space=dummy_env.action_space,
                                      execution_spec=dict(seed=10))

        learn_updates = 300
        mean_returns = []
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            mean_returns.append(mean_return)
            print("i={}/{} Loss={:.4} Avg-reward={:.2}".format(
                i, learn_updates, float(ret[1]), mean_return))

        # Assume we have learned something: check the mean return over the last 100 learn-updates.
        average_return_last_n_episodes = np.nanmean(mean_returns[-100:])
        print("Average return over the last 100 learn-updates: {}".format(
            average_return_last_n_episodes))
        self.assertGreater(average_return_last_n_episodes, 30.0)

        time.sleep(3)
        agent.terminate()
        time.sleep(3)
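The tests in these snippets call self._calc_mean_return(ret), a helper that is not shown here. A minimal sketch, assuming ret[3] is a dict that also carries flat "rewards" and "terminals" arrays for the update batch (the 2x2 GridWorld examples below show it containing "action_probs" and "states"), might look like this:

    @staticmethod
    def _calc_mean_return(ret):
        """Sums rewards per finished episode in the batch and returns their mean (NaN if none finished)."""
        # Assumption: ret[3] holds per-step "rewards" and "terminals" arrays from the update batch.
        rewards = ret[3]["rewards"].reshape((-1,))
        terminals = ret[3]["terminals"].reshape((-1,))
        episode_returns = []
        return_ = 0.0
        for r, t in zip(rewards, terminals):
            return_ += r
            if t:
                episode_returns.append(return_)
                return_ = 0.0
        # np.nanmean over the collected values later ignores batches without any finished episode.
        return np.mean(episode_returns) if episode_returns else float("nan")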
Example #2
    def test_impala_on_outbreak(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on an openAI Breakout Env.
        """
        env = OpenAIGymEnv("Breakout-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True,
                           visualize=False)
        config_ = config_from_path("configs/impala_agent_for_breakout.json")
        agent = IMPALAAgent.from_spec(
            config_,
            state_space=env.state_space,
            action_space=env.action_space,
        )

        learn_updates = 4000000
        mean_returns = []
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            mean_returns.append(mean_return)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(
                i, float(ret[1]), mean_return))

        time.sleep(3)
        agent.terminate()
        time.sleep(3)
Example #3
    def test_impala_actor_compilation(self):
        """
        Tests IMPALA agent compilation (actor).
        """
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
        env = DeepmindLabEnv(
            level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
        )

        actor_agent = IMPALAAgent.from_spec(
            agent_config,
            type="actor",
            state_space=env.state_space,
            action_space=env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
            # Disable monitoring: a monitored session would make session-creation hang in docker.
            execution_spec=dict(disable_monitoring=True)
        )
        # Start Specifiable Server with Env manually.
        actor_agent.environment_stepper.environment_server.start()
        print("Compiled IMPALA type=actor agent.")
        actor_agent.environment_stepper.environment_server.stop()
Example #4
    def test_impala_learner_compilation(self):
        """
        Tests IMPALA agent compilation (learner).
        """
        # This test is currently disabled via the early return below.
        return
        if get_backend() == "pytorch":
            return
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path(
            "configs/impala_agent_for_deepmind_lab_env.json")
        env_spec = dict(level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        learner_agent = IMPALAAgent.from_spec(
            agent_config,
            type="learner",
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            internal_states_space=IMPALAAgent.default_internal_states_space,
            environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
            # Setup distributed tf.
            execution_spec=dict(
                mode="distributed",
                #gpu_spec=dict(
                #    gpus_enabled=True,
                #    max_usable_gpus=1,
                #    num_gpus=1
                #),
                distributed_spec=dict(job="learner",
                                      task_index=0,
                                      cluster_spec=self.impala_cluster_spec),
                session_config=dict(type="monitored-training-session",
                                    allow_soft_placement=True,
                                    log_device_placement=True,
                                    auto_start=False),
                disable_monitoring=True,
                enable_timeline=True,
            ))
        print(
            "Compiled IMPALA type=learner agent without starting the session (would block waiting for actor)."
        )

        ## Take one batch from the filled up queue and run an update_from_memory with the learner.
        #update_steps = 10
        #time_start = time.perf_counter()
        #for _ in range(update_steps):
        #    agent.call_api_method("update_from_memory")
        #time_total = time.perf_counter() - time_start
        #print("Done learning {}xbatch-of-{} in {}sec ({} updates/sec).".format(
        #    update_steps, agent.update_spec["batch_size"], time_total , update_steps / time_total)
        #)

        learner_agent.terminate()
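The distributed_spec above references self.impala_cluster_spec, which is defined elsewhere in the test class. As a hypothetical illustration only, a TensorFlow-style cluster spec maps job names to lists of host:port addresses:

    # Hypothetical addresses; replace with the real actor/learner hosts of the cluster.
    impala_cluster_spec = dict(
        actor=["localhost:2222"],     # one address per actor task
        learner=["localhost:2223"]    # single learner task
    )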
Example #5
    def test_impala_single_agent_compilation(self):
        """
        Tests IMPALA agent compilation (single-node mode).
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05))
        agent.terminate()
        print("Compiled IMPALA type=single agent.")
Example #6
    def test_impala_single_agent_compilation(self):
        """
        Tests IMPALA agent compilation (single-node mode).
        """
        # This test is currently disabled via the early return below.
        return
        if get_backend() == "pytorch":
            return
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            # Disable monitoring: a monitored session would make session-creation hang in docker.
            execution_spec=dict(disable_monitoring=True))
        agent.terminate()
        print("Compiled {}".format(agent))
Example #7
    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05))

        learn_updates = 50
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(
                i, float(ret[1]), mean_return))

        # Assume we have learned something.
        self.assertGreater(mean_return, -0.1)

        # Check the last action probs for the 2 valid next_states (start (after a reset) and one below start).
        action_probs = ret[3]["action_probs"].reshape((80, 4))
        next_states = ret[3]["states"][:, 1:].reshape((80, ))
        for s_, probs in zip(next_states, action_probs):
            # Start state:
            # - Assume we picked "right" in state=1 (in order to step into goal state).
            # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
            if s_ == 0:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
            # One below start:
            # - Assume we picked "down" in start state with very large probability.
            # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
            elif s_ == 1:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

        agent.terminate()
Example #8
    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(update_interval=4, batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
        )

        learn_updates = 1000
        # Setup the queue runner.
        agent.call_api_method("setup_queue_runner")
        for _ in range(learn_updates):
            agent.update()

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        #self.assertEqual(results["timesteps_executed"], time_steps)
        #self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
        #self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        #self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #9
    def test_impala_actor_compilation(self):
        """
        Tests IMPALA agent compilation (actor).
        """
        # This test is currently disabled via the early return below.
        return
        if get_backend() == "pytorch":
            return
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
        env_spec = dict(level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        agent = IMPALAAgent.from_spec(
            agent_config,
            type="actor",
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False),
            environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
            # Disable monitoring: a monitored session would make session-creation hang in docker.
            execution_spec=dict(
                session_config=dict(
                    type="monitored-training-session",
                    auto_start=False
                ),
                disable_monitoring=True
            )
        )
        # Start Specifiable Server with Env manually (monitoring is disabled).
        agent.environment_stepper.environment_server.start_server()
        print("Compiled {}".format(agent))
        agent.environment_stepper.environment_server.stop_server()
        agent.terminate()
Example #10
    def test_impala_learner_compilation(self):
        """
        Tests IMPALA agent compilation (learner).
        """
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
        env = DeepmindLabEnv(
            level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
        )

        learner_agent = IMPALAAgent.from_spec(
            agent_config,
            type="learner",
            state_space=env.state_space,
            action_space=env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
        )

        print("Compiled IMPALA type=learner agent.")
Example #11
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    cluster_spec_config_path = os.path.join(os.getcwd(), FLAGS.cluster_spec)
    with open(cluster_spec_config_path, 'rt') as fp:
        cluster_spec = json.load(fp)

    # Environment options
    env_spec = {
        "type": "deepmind-lab",
        "level_id": FLAGS.level,
        "frameskip": 4,
        "observations": ["RGB_INTERLEAVED", "INSTR"]
    }

    # Verbose usage errors
    if FLAGS.actor and FLAGS.learner:
        print("Please only use either --actor or --learner, not both.")
        sys.exit(1)

    # We dynamically update the distributed spec according to the job and task index
    if FLAGS.actor:
        agent_type = 'actor'
        distributed_spec = dict(
            job='actor',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
        # Actors should only act on CPUs
        agent_config['execution_spec']['gpu_spec'].update({
            "gpus_enabled": False,
            "max_usable_gpus": 0,
            "num_gpus": 0
        })
    elif FLAGS.learner:
        agent_type = 'learner'
        distributed_spec = dict(
            job='learner',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
    else:
        print("Please pass either --learner or --actor (or look at the CartPole example for single trainig mode).")
        sys.exit(1)

    # Set the sample size for the workers
    worker_sample_size = 100

    # Since we dynamically update the cluster spec according to the job and task index, we need to
    # manually update the execution spec as well.
    execution_spec = agent_config['execution_spec']
    execution_spec.update(dict(
        mode="distributed",
        distributed_spec=distributed_spec
    ))

    # Now, create the environment
    env = Environment.from_spec(env_spec)

    agent_spec = dict(
        type=agent_type,
        architecture="large",
        environment_spec=env_spec,
        worker_sample_size=worker_sample_size,
        state_space=env.state_space,
        action_space=env.action_space,
        # TODO: automate this (by lookup from NN).
        internal_states_space=IMPALAAgent.default_internal_states_space,
        execution_spec=execution_spec,
        # update_spec=dict(batch_size=2),
        # Summarize time-steps to have an overview of the env-stepping speed.
        summary_spec=dict(summary_regexp="time-step", directory="/tmp/impala_{}_{}/".format(agent_type, FLAGS.task))
    )

    agent_config.update(agent_spec)

    agent = IMPALAAgent(
        **agent_config
    )

    if FLAGS.learner:
        print("Starting learner for {} updates.".format(FLAGS.updates))
        for _ in range(FLAGS.updates):
            start_time = time.perf_counter()
            results = agent.update()
    else:
        # Actor just acts
        print("Starting actor. Terminate with SIGINT (Ctrl+C).")
        while True:
            agent.call_api_method("perform_n_steps_and_insert_into_fifo")  #.monitored_session.run([agent.enqueue_op])

    # Additional learn-updates with return tracking (only reached by the learner; the actor loop above runs until interrupted).
    learn_updates = 100
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(1)
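main() assumes command-line flags defined elsewhere in the script (FLAGS.config, FLAGS.cluster_spec, FLAGS.level, FLAGS.actor, FLAGS.learner, FLAGS.task, FLAGS.updates). A hypothetical sketch of those definitions using absl.flags, with made-up default values:

# Hypothetical flag definitions assumed by main() above; defaults are made up for illustration.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", "./impala_agent_config.json", "Path to the agent config JSON.")
flags.DEFINE_string("cluster_spec", "./cluster_spec.json", "Path to the cluster-spec JSON (job name -> list of host:port).")
flags.DEFINE_string("level", "seekavoid_arena_01", "DeepMind Lab level id to run.")
flags.DEFINE_boolean("actor", False, "Run this process as an actor.")
flags.DEFINE_boolean("learner", False, "Run this process as a learner.")
flags.DEFINE_integer("task", 0, "Task index within the selected job.")
flags.DEFINE_integer("updates", 1000, "Number of learner updates to run.")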