def test_worker_weight_syncing(self):
        """
        Tests weight synchronization with a local agent and a remote worker.
        """
        # First, create a local agent
        env_spec = dict(
            type="openai",
            gym_env="PongNoFrameskip-v4",
            # The frameskip in the agent config triggers frame-skipping in the
            # workers; this one only applies to the internal env.
            frameskip=4,
            max_num_noops=30,
            episodic_life=True)
        env = Environment.from_spec(env_spec)
        agent_config = config_from_path("configs/ray_apex_for_pong.json")

        # Remove unneeded apex params.
        if "apex_replay_spec" in agent_config:
            agent_config.pop("apex_replay_spec")

        ray_spec = agent_config["execution_spec"].pop("ray_spec")
        local_agent = Agent.from_spec(agent_config,
                                      state_space=env.state_space,
                                      action_space=env.action_space)

        ray_spec["worker_spec"]["worker_sample_size"] = 50
        # Create a remote worker with the same agent config.
        worker = RayValueWorker.as_remote().remote(agent_config,
                                                   ray_spec["worker_spec"],
                                                   env_spec,
                                                   auto_build=True)

        # This imitates the initial executor sync without ray.put
        weights = RayWeight(local_agent.get_weights())
        print('Weight type in init sync = {}'.format(type(weights)))
        ret = worker.set_weights.remote(weights)
        ray.wait([ret])
        print('Init weight sync successful.')

        # Replicate worker syncing steps as done in e.g. Ape-X executor:
        weights = RayWeight(local_agent.get_weights())
        print('Weight type returned by ray put = {}'.format(type(weights)))
        ret = worker.set_weights.remote(weights)
        ray.wait([ret])
        print('Object store weight sync successful.')
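The same broadcast pattern can be reproduced with plain Ray, independent of RLgraph. The sketch below is illustrative only: the ParameterStore actor and the weight dict are hypothetical stand-ins for the local agent and RayValueWorker. What it shows is putting the weights into the object store once with ray.put and waiting on the remote set_weights tasks, as the Ape-X executor sync alluded to above does.

import numpy as np
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class ParameterStore(object):
    """Hypothetical stand-in for a remote RayValueWorker."""
    def __init__(self):
        self.weights = None

    def set_weights(self, weights):
        self.weights = weights
        return True


# Stand-in for local_agent.get_weights().
local_weights = {"dense/kernel": np.zeros((4, 2)), "dense/bias": np.zeros(2)}

workers = [ParameterStore.remote() for _ in range(2)]

# Put the weights into the object store once, then broadcast the reference.
weights_ref = ray.put(local_weights)
pending = [w.set_weights.remote(weights_ref) for w in workers]
ray.wait(pending, num_returns=len(pending))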
Example #2
    def __init__(self, environment_spec, agent_config):
        """
        Args:
            environment_spec (dict, callable): Environment spec or callable returning an environment. Each worker
                in the cluster will instantiate an environment using this spec or callable.
            agent_config (dict): Config dict containing agent and execution specs.
        """
        ray_spec = agent_config["execution_spec"].pop("ray_spec")
        self.worker_spec = ray_spec.pop("worker_spec")
        self.compress_states = self.worker_spec["compress_states"]
        super(SyncBatchExecutor,
              self).__init__(executor_spec=ray_spec.pop("executor_spec"),
                             environment_spec=environment_spec,
                             worker_spec=self.worker_spec)

        # Must specify an agent type.
        assert "type" in agent_config
        self.agent_config = agent_config
        environment = None
        if isinstance(self.environment_spec, dict):
            environment = Environment.from_spec(self.environment_spec)
        elif hasattr(self.environment_spec, '__call__'):
            environment = self.environment_spec()
        self.agent_config["state_space"] = environment.state_space
        self.agent_config["action_space"] = environment.action_space

        self.local_agent = self.build_agent_from_config(self.agent_config)
        self.update_batch_size = self.agent_config["update_spec"]["batch_size"]

        # Create remote sample workers based on ray cluster spec.
        self.num_sample_workers = self.executor_spec["num_sample_workers"]

        # Number of samples collected per task by the remote workers (the
        # tasks actually interacting with the environment).
        self.worker_sample_size = self.executor_spec["num_worker_samples"]

        assert not ray_spec, "ERROR: ray_spec still contains items: {}".format(
            ray_spec)
        self.logger.info("Setting up execution for the sync-batch executor.")
        self.setup_execution()
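The constructor accepts either a spec dict or a callable environment factory. A minimal helper sketch of that resolution logic is shown below; the import path is assumed from typical RLgraph layouts and may differ by version, and the helper raises explicitly instead of letting a None environment fail later.

# Minimal sketch of the spec-or-callable resolution used above.
from rlgraph.environments import Environment


def resolve_environment(environment_spec):
    # Dict spec: delegate to RLgraph's spec-based construction.
    if isinstance(environment_spec, dict):
        return Environment.from_spec(environment_spec)
    # Callable: the caller supplies its own environment factory.
    if callable(environment_spec):
        return environment_spec()
    raise ValueError(
        "environment_spec must be a spec dict or a callable returning "
        "an Environment, got: {}".format(type(environment_spec)))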
Example #3
    def __init__(self, num_envs, env_spec):
        """
        Args:
            num_envs (int): Number of environments
            env_spec (Union[callable, dict]): Environment spec dict or callable returning an environment.
        """
        self.num_envs = num_envs
        self.environments = list()

        for _ in range_(num_envs):
            if isinstance(env_spec, dict):
                env = Environment.from_spec(env_spec)
            elif hasattr(env_spec, '__call__'):
                env = env_spec()
            else:
                raise ValueError(
                    "Env_spec must be either a dict containing an environment spec or a callable "
                    "returning a new environment object.")
            self.environments.append(env)
        super(VectorEnv,
              self).__init__(state_space=self.environments[0].state_space,
                             action_space=self.environments[0].action_space)
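A hypothetical usage of the VectorEnv above with a dict spec (the gym_env id is just an example value); only attributes set by the constructor shown above are accessed.

env_spec = dict(type="openai", gym_env="PongNoFrameskip-v4", frameskip=4)
vector_env = VectorEnv(num_envs=4, env_spec=env_spec)

# All sub-environments share the same state/action spaces.
print(vector_env.state_space, vector_env.action_space)
print(len(vector_env.environments))  # -> 4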
Example #4
def create_env():
    return Environment.from_spec(env_spec)
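The callable form lets each consumer build its own environment instance instead of receiving an already-constructed one. A short sketch of that pattern, reusing the VectorEnv from Example #3 (the gym_env id is an example value):

# Sketch: passing a factory instead of a spec dict, so each consumer builds
# its own environment instance (nothing heavyweight gets serialized).
env_spec = dict(type="openai", gym_env="PongNoFrameskip-v4")


def create_env():
    return Environment.from_spec(env_spec)


# Works anywhere a spec-or-callable is accepted, e.g. the VectorEnv above.
vector_env = VectorEnv(num_envs=2, env_spec=create_env)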
Example #5
    def test_environment_stepper_component_with_large_impala_architecture(
            self):
        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    ## The images from the env  are divided by 255.
                    #RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                    # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec.
            dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=100,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected items (num_steps
        # entries each), with each step containing: preprocessed state, actions, rewards, episode returns,
        # terminals, (raw) next-states.
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0]["INSTR"].dtype == np.object)
        self.assertTrue(out[0]["RGB_INTERLEAVED"].dtype == np.float32)
        self.assertTrue(out[0]["RGB_INTERLEAVED"].min() >=
                        0.0)  # make sure we have pixels / 255
        self.assertTrue(out[0]["RGB_INTERLEAVED"].max() <= 1.0)
        self.assertTrue(out[1].dtype == np.int32)  # actions
        self.assertTrue(out[2].dtype == np.float32)  # rewards
        self.assertTrue(out[3].dtype == np.float32)  # episode return
        self.assertTrue(out[4].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[5]["INSTR"].dtype ==
                        np.object)  # next state (raw, not preprocessed)
        self.assertTrue(out[5]["RGB_INTERLEAVED"].dtype ==
                        np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(
            out[5]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[5]["RGB_INTERLEAVED"].max() <= 255)
        # action probs (test whether sum to one).
        self.assertTrue(out[6].dtype == np.float32)
        self.assertTrue(out[6].min() >= 0.0)
        self.assertTrue(out[6].max() <= 1.0)
        recursive_assert_almost_equal(
            out[6].sum(axis=-1, keepdims=False),
            np.ones(shape=(environment_stepper.num_steps, )),
            decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[7][0].dtype == np.float32)
        self.assertTrue(out[7][1].dtype == np.float32)
        self.assertTrue(out[7][0].shape == (environment_stepper.num_steps,
                                            256))
        self.assertTrue(out[7][1].shape == (environment_stepper.num_steps,
                                            256))

        # Check whether episode returns match single rewards (including terminal signals).
        episode_returns = 0.0
        for i in range(environment_stepper.num_steps):
            episode_returns += out[2][i]
            self.assertAlmostEqual(episode_returns, out[3][i])
            # Terminal: Reset for next step.
            if out[4][i] is np.bool_(True):
                episode_returns = 0.0

        test.terminate()
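The episode-return check at the end of this test can be restated with plain NumPy. The sketch below uses hypothetical reward/terminal arrays in place of the stepper output (out[2], out[4]) and reconstructs what out[3] should contain.

import numpy as np

# Standalone sketch of the episode-return consistency check above.
rewards = np.array([1.0, 0.0, 2.0, 0.0, 3.0], dtype=np.float32)    # out[2]
terminals = np.array([False, False, True, False, False])           # out[4]

expected_returns = np.empty_like(rewards)
running = 0.0
for i, (r, t) in enumerate(zip(rewards, terminals)):
    running += r
    expected_returns[i] = running
    if t:  # Terminal: the next step starts a fresh episode.
        running = 0.0

# expected_returns should match out[3] element-wise in the test above.
print(expected_returns)  # -> [1. 1. 3. 0. 3.]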
Example #6
    def setup_execution(self):
        # Create local worker agent according to spec.
        # Extract states and actions space.
        environment = None
        if isinstance(self.environment_spec, dict):
            environment = Environment.from_spec(self.environment_spec)
        elif hasattr(self.environment_spec, '__call__'):
            environment = self.environment_spec()
        self.agent_config["state_space"] = environment.state_space
        self.agent_config["action_space"] = environment.action_space

        # Start Ray cluster and connect to it.
        self.local_agent = Agent.from_spec(self.agent_config)

        # Set up worker thread for performing updates.
        self.update_worker = UpdateWorker(
            agent=self.local_agent,
            in_queue_size=self.executor_spec["learn_queue_size"])
        self.ray_init()

        # Create remote sample workers based on ray cluster spec.
        self.num_replay_workers = self.executor_spec["num_replay_workers"]
        self.num_sample_workers = self.executor_spec["num_sample_workers"]

        self.logger.info("Initializing {} local replay memories.".format(
            self.num_replay_workers))
        # Update memory size for num of workers
        shard_size = int(self.apex_replay_spec["memory_spec"]["capacity"] /
                         self.num_replay_workers)
        self.apex_replay_spec["memory_spec"]["capacity"] = shard_size
        self.logger.info("Shard size per memory: {}".format(
            self.apex_replay_spec["memory_spec"]["capacity"]))
        min_sample_size = self.apex_replay_spec["min_sample_memory_size"]
        self.apex_replay_spec["min_sample_memory_size"] = int(
            min_sample_size / self.num_replay_workers)
        self.logger.info("Sampling for learning starts at: {}".format(
            self.apex_replay_spec["min_sample_memory_size"]))

        # Set sample batch size:
        self.apex_replay_spec["sample_batch_size"] = self.agent_config[
            "update_spec"]["batch_size"]
        self.logger.info("Sampling batch size {}".format(
            self.apex_replay_spec["sample_batch_size"]))

        self.ray_local_replay_memories = create_colocated_ray_actors(
            cls=RayMemoryActor.as_remote(
                num_cpus=self.num_cpus_per_replay_actor),
            config=self.apex_replay_spec,
            num_agents=self.num_replay_workers)

        # Create remote workers for data collection.
        self.worker_spec["worker_sample_size"] = self.worker_sample_size
        self.logger.info(
            "Initializing {} remote data collection agents, sample size: {}".
            format(self.num_sample_workers,
                   self.worker_spec["worker_sample_size"]))
        self.ray_env_sample_workers = self.create_remote_workers(
            RayValueWorker,
            self.num_sample_workers,
            self.agent_config,
            # *args
            self.worker_spec,
            self.environment_spec,
            self.worker_frame_skip)
        self.init_tasks()
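The replay memory is sharded evenly across the replay workers: both the capacity and the minimum sample size are divided by the number of replay workers. A minimal sketch of that arithmetic with hypothetical numbers:

# Sketch of the memory-shard arithmetic above, with hypothetical values.
num_replay_workers = 4
apex_replay_spec = {
    "memory_spec": {"capacity": 2000000},
    "min_sample_memory_size": 50000,
}

shard_capacity = int(apex_replay_spec["memory_spec"]["capacity"] / num_replay_workers)
shard_min_size = int(apex_replay_spec["min_sample_memory_size"] / num_replay_workers)

print(shard_capacity)  # -> 500000 transitions per replay shard
print(shard_min_size)  # -> 12500 transitions before sampling starts on each shard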
Example #7
    def test_environment_stepper_component_with_large_impala_architecture(
            self):
        worker_sample_size = 100
        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec. worker_sample_size=1 as it's an actor network.
            dict(network_spec=LargeIMPALANetwork(worker_sample_size=1),
                 action_space=action_space))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=worker_sample_size,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(component=environment_stepper,
                             action_space=action_space,
                             execution_spec=dict(disable_monitoring=True))

        environment_stepper.environment_server.start_server()

        # Step n times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected items (num_steps
        # entries each), with each step containing: preprocessed state, actions, rewards, episode returns,
        # terminals, (raw) next-states.
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[1]["INSTR"].dtype == np.object)
        self.assertTrue(out[1]["RGB_INTERLEAVED"].dtype == np.uint8)
        self.assertTrue(
            out[1]["RGB_INTERLEAVED"].shape == (worker_sample_size + 1, ) +
            state_space["RGB_INTERLEAVED"].shape)
        self.assertTrue(
            out[1]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1]["RGB_INTERLEAVED"].max() <= 255)
        self.assertTrue(out[1]["previous_action"].dtype == np.int32)  # actions
        self.assertTrue(
            out[1]["previous_action"].shape == (worker_sample_size + 1, ))
        self.assertTrue(
            out[1]["previous_reward"].dtype == np.float32)  # rewards
        self.assertTrue(
            out[1]["previous_reward"].shape == (worker_sample_size + 1, ))
        # action probs (test whether sum to one).
        self.assertTrue(out[2].dtype == np.float32)
        self.assertTrue(out[2].shape == (worker_sample_size,
                                         action_space.num_categories))
        self.assertTrue(out[2].min() >= 0.0)
        self.assertTrue(out[2].max() <= 1.0)
        recursive_assert_almost_equal(out[2].sum(axis=-1, keepdims=False),
                                      np.ones(shape=(worker_sample_size, )),
                                      decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (worker_sample_size + 1, 256))
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][1].shape == (worker_sample_size + 1, 256))

        environment_stepper.environment_server.stop_server()

        test.terminate()
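The asserted shapes follow a T-vs-T+1 convention: state-like channels and the internal LSTM states carry worker_sample_size + 1 entries (including the state after the last step), while per-step outputs such as the action probabilities carry worker_sample_size entries. The sketch below restates those invariants with NumPy placeholders; the image shape and action count are assumptions, not values from the test.

import numpy as np

# Shape conventions asserted above, restated with hypothetical placeholders.
T, num_actions, lstm_units = 100, 9, 256
img_shape = (72, 96, 3)  # assumed; the test uses state_space["RGB_INTERLEAVED"].shape

states = np.zeros((T + 1,) + img_shape, dtype=np.uint8)                          # out[1]["RGB_INTERLEAVED"]
action_probs = np.full((T, num_actions), 1.0 / num_actions, dtype=np.float32)    # out[2]
c_states = np.zeros((T + 1, lstm_units), dtype=np.float32)                       # out[3][0]

assert states.shape[0] == action_probs.shape[0] + 1
assert c_states.shape == (T + 1, lstm_units)
assert np.allclose(action_probs.sum(axis=-1), 1.0, atol=1e-4)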
Example #8
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    cluster_spec_config_path = os.path.join(os.getcwd(), FLAGS.cluster_spec)
    with open(cluster_spec_config_path, 'rt') as fp:
        cluster_spec = json.load(fp)

    # Environment options
    env_spec = {
        "type": "deepmind-lab",
        "level_id": FLAGS.level,
        "frameskip": 4,
        "observations": ["RGB_INTERLEAVED", "INSTR"]
    }

    # Verbose usage errors
    if FLAGS.actor and FLAGS.learner:
        print("Please only use either --actor or --learner, not both.")
        sys.exit(1)

    # We dynamically update the distributed spec according to the job and task index
    if FLAGS.actor:
        agent_type = 'actor'
        distributed_spec = dict(
            job='actor',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
        # Actors should only act on CPUs
        agent_config['execution_spec']['gpu_spec'].update({
            "gpus_enabled": False,
            "max_usable_gpus": 0,
            "num_gpus": 0
        })
    elif FLAGS.learner:
        agent_type = 'learner'
        distributed_spec = dict(
            job='learner',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
    else:
        print("Please pass either --learner or --actor (or look at the CartPole example for single trainig mode).")
        sys.exit(1)

    # Set the sample size for the workers
    worker_sample_size = 100

    # Since we dynamically update the cluster spec according to the job and task index, we need to
    # manually update the execution spec as well.
    execution_spec = agent_config['execution_spec']
    execution_spec.update(dict(
        mode="distributed",
        distributed_spec=distributed_spec
    ))

    # Now, create the environment
    env = Environment.from_spec(env_spec)

    agent_spec = dict(
        type=agent_type,
        architecture="large",
        environment_spec=env_spec,
        worker_sample_size=worker_sample_size,
        state_space=env.state_space,
        action_space=env.action_space,
        # TODO: automate this (by lookup from NN).
        internal_states_space=IMPALAAgent.default_internal_states_space,
        execution_spec=execution_spec,
        # update_spec=dict(batch_size=2),
        # Summarize time-steps to have an overview of the env-stepping speed.
        summary_spec=dict(summary_regexp="time-step", directory="/tmp/impala_{}_{}/".format(agent_type, FLAGS.task))
    )

    agent_config.update(agent_spec)

    agent = IMPALAAgent(
        **agent_config
    )

    if FLAGS.learner:
        print("Starting learner for {} updates.".format(FLAGS.updates))
        for _ in range(FLAGS.updates):
            start_time = time.perf_counter()
            results = agent.update()
    else:
        # Actor just acts
        print("Starting actor. Terminate with SIGINT (Ctrl+C).")
        while True:
            agent.call_api_method("perform_n_steps_and_insert_into_fifo")  #.monitored_session.run([agent.enqueue_op])

    learn_updates = 100
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(1)
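The learner branch above records start_time on each update but never reports it. A small sketch of a timed learner loop is shown below; it assumes agent is an already-built IMPALAAgent handle and only uses agent.update() and time.perf_counter(), both of which appear in the script above.

import time


def run_learner(agent, num_updates):
    # Time each update and report its duration.
    results = None
    for i in range(num_updates):
        start_time = time.perf_counter()
        results = agent.update()
        duration = time.perf_counter() - start_time
        print("Update {}/{} took {:.3f}s.".format(i + 1, num_updates, duration))
    return results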