Example #1
    def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
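        """
        Sets up the RL-based caching strategy: evaluation counters, a TTL cache for
        incomplete experiences, a binary should-cache agent built via Agent.from_spec,
        and per-run file loggers.
        """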
        super().__init__(config, result_dir, cache_stats)
        # evaluation specific variables
        self.observation_seen = 0
        self.episode_reward = 0
        self.checkpoint_steps = config['checkpoint_steps']

        self._incomplete_experiences = TTLCache(InMemoryStorage())
        self._incomplete_experiences.expired_entry_callback(self._observe_expired_incomplete_experience)

        self.experimental_reward = config.get('experimental_reward', False)
        agent_config = config['agent_config']
        self.converter = CachingStrategyRLConverter()
        # action space: should cache: true or false
        # state space: [capacity (1), query key(1), query result set(num_indexes)]
        fields_in_state = len(CachingAgentSystemState.__slots__)
        self.agent = Agent.from_spec(agent_config,
                                     state_space=FloatBox(shape=(fields_in_state,)),
                                     action_space=IntBox(2))

        self.logger = logging.getLogger(__name__)
        name = 'rl_caching_strategy'
        self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
        self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
        self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
        self.entry_hits_logger = create_file_logger(name=f'{name}_entry_hits_logger', result_dir=self.result_dir)

        self.key_vocab = Vocabulary()
Example #2
    def test_dqn_on_pong(self):
        """
        Creates a DQNAgent and runs it via a Runner on an openAI Pong Env.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True,
                           visualize=False)
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        preprocessing_spec = agent_config.pop("preprocessor_spec")
        agent = Agent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=self.pong_preprocessed_state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

        time_steps = 4000000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      render=True,
                                      preprocessing_spec=preprocessing_spec,
                                      worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
Example #3
    def test_individual_env(self):
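        """
        Steps a single environment with a DQN agent for `self.samples` steps and
        prints the resulting throughput in states/s.
        """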
        env = Environment.from_spec(self.env_spec)
        agent = Agent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            config_from_path("configs/dqn_agent_for_pong.json"),
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

        state = env.reset()
        start = time.monotonic()
        ep_length = 0
        for _ in range_(self.samples):
            action = agent.get_action(state)
            state, reward, terminal, info = env.step(action)

            ep_length += 1
            if terminal:
                print("reset after {} states".format(ep_length))
                env.reset()
                ep_length = 0

        runtime = time.monotonic() - start
        tp = self.samples / runtime

        print('Testing individual env {} performance:'.format(
            self.env_spec["gym_env"]))
        print('Ran {} steps, throughput: {} states/s, total time: {} s'.format(
            self.samples, tp, runtime))
Example #4
    def test_policy_and_vf_weight_syncing(self):
        """
        Tests weight synchronization with a local agent and a remote worker.
        """
        # First, create a local agent
        env_spec = dict(type="openai", gym_env="CartPole-v0")
        env = Environment.from_spec(env_spec)
        agent_config = config_from_path("configs/sync_batch_ppo_cartpole.json")

        ray_spec = agent_config["execution_spec"].pop("ray_spec")
        local_agent = Agent.from_spec(agent_config,
                                      state_space=env.state_space,
                                      action_space=env.action_space)
        ray_spec["worker_spec"]["worker_sample_size"] = 50
        # Create a remote worker with the same agent config.
        worker = RayPolicyWorker.as_remote().remote(agent_config,
                                                    ray_spec["worker_spec"],
                                                    self.env_spec,
                                                    auto_build=True)

        # This imitates the initial executor sync without ray.put
        weights = RayWeight(local_agent.get_weights())
        print('Weight type in init sync = {}'.format(type(weights)))
        print("Weights = ", weights)
        worker.set_weights.remote(weights)
        print('Init weight sync successful.')

        # Replicate worker syncing steps as done in e.g. Ape-X executor:
        weights = RayWeight(local_agent.get_weights())
        print('Weight type returned by ray put = {}'.format(type(weights)))
        print(weights)
        ret = worker.set_weights.remote(weights)
        ray.wait([ret])
        print('Object store weight sync successful.')
Example #5
    def __init__(self, config: Dict[str, any], result_dir: str,
                 cache_stats: CacheInformation):
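        """
        Sets up the multi-task strategy: a TTL cache for in-flight observations and a
        single agent whose dict action space combines a TTL choice with an eviction
        decision, plus per-run file loggers.
        """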
        super().__init__(config, result_dir, cache_stats)
        self.supported_observations = {
            ObservationType.Hit, ObservationType.Miss,
            ObservationType.Invalidate
        }

        # evaluation specific variables
        self.observation_seen = 0
        self.cum_reward = 0
        self.checkpoint_steps = config['checkpoint_steps']

        self._incomplete_experiences = TTLCache(InMemoryStorage())
        self._incomplete_experiences.expired_entry_callback(
            self._observe_expiry_eviction)
        self.non_terminal_observations = {
            ObservationType.EvictionPolicy, ObservationType.Expiration
        }

        agent_config = config['agent_config']
        self.maximum_ttl = config['max_ttl']

        fields_in_state = len(MultiTaskAgentSystemState.__slots__)

        action_space = RLDict({
            'ttl': IntBox(low=0, high=self.maximum_ttl),
            'eviction': IntBox(low=0, high=2)
        })

        self.agent = Agent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(fields_in_state, )),
            action_space=action_space)

        # TODO refactor into common RL interface for all strategies
        self.logger = logging.getLogger(__name__)
        name = 'rl_multi_strategy'
        self.reward_logger = create_file_logger(name=f'{name}_reward_logger',
                                                result_dir=self.result_dir)
        self.loss_logger = create_file_logger(name=f'{name}_loss_logger',
                                              result_dir=self.result_dir)
        self.ttl_logger = create_file_logger(name=f'{name}_ttl_logger',
                                             result_dir=self.result_dir)
        self.observation_logger = create_file_logger(
            name=f'{name}_observation_logger', result_dir=self.result_dir)
        self.performance_logger = create_file_logger(
            name=f'{name}_performance_logger', result_dir=self.result_dir)
        self.key_vocab = Vocabulary()
Example #6
def main(argv):
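    """
    Loads the agent config given via FLAGS.config, builds the requested OpenAI Gym
    environment and an agent for it, then trains for 20000 timesteps while reporting
    the average return every 10 episodes.
    """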
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps,
                                  **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(episode_returns), episode_return,
                         np.mean(episode_returns[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor critic is very sensitive to hyper-parameters and might collapse after reaching the maximum
    # reward. In practice, it would be recommended to stop training when a reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
Example #7
    def test_update_throughput(self):
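        """
        Measures agent.update() throughput on synthetic Ape-X style batches for each
        GPU count under test, printing mean and std of samples/s.
        """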
        env = Environment.from_spec(self.env_spec)
        # TODO comment in for multi gpu
        # config_from_path("configs/multi_gpu_ray_apex_for_pong.json"),
        config = config_from_path("configs/ray_apex_for_pong.json")

        # Adjust to usable GPUs for test system.
        num_gpus = [1]
        for gpu_count in num_gpus:
            config["execution_spec"]["gpu_spec"]["num_gpus"] = gpu_count
            config["execution_spec"]["gpu_spec"]["per_process_gpu_memory_fraction"] = 1.0 / gpu_count

            agent = Agent.from_spec(
                # TODO replace with config from above
                config_from_path("configs/ray_apex_for_pong.json"),
                state_space=env.state_space,
                # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
                action_space=env.action_space
            )

            batch_space = Dict(
                states=agent.preprocessed_state_space,
                actions=env.action_space,
                rewards=FloatBox(),
                next_states=agent.preprocessed_state_space,
                terminals=IntBox(low=0, high=1),
                importance_weights=FloatBox(),
                add_batch_rank=True
            )

            batch_size = 512 * gpu_count
            num_samples = 50
            samples = [batch_space.sample(batch_size) for _ in range(num_samples)]

            times = []
            throughputs = []
            for sample in samples:
                start = time.perf_counter()
                agent.update(sample)
                runtime = time.perf_counter() - start
                times.append(runtime)
                throughputs.append(batch_size / runtime)

            print("Throughput: {} samples / s ({}) for {} GPUs".format(np.mean(throughputs),
                                                                       np.std(throughputs), gpu_count))
Example #8
    def test_worker_weight_syncing(self):
        """
        Tests weight synchronization with a local agent and a remote worker.
        """
        # First, create a local agent
        env_spec = dict(
            type="openai",
            gym_env="PongNoFrameskip-v4",
            # The frameskip in the agent config will trigger worker skips, this
            # is used for internal env.
            frameskip=4,
            max_num_noops=30,
            episodic_life=True)
        env = Environment.from_spec(env_spec)
        agent_config = config_from_path("configs/ray_apex_for_pong.json")

        # Remove unneeded apex params.
        if "apex_replay_spec" in agent_config:
            agent_config.pop("apex_replay_spec")

        ray_spec = agent_config["execution_spec"].pop("ray_spec")
        local_agent = Agent.from_spec(agent_config,
                                      state_space=env.state_space,
                                      action_space=env.action_space)

        # Create a remote worker with the same agent config.
        worker = RayWorker.as_remote().remote(agent_config,
                                              ray_spec["worker_spec"],
                                              self.env_spec,
                                              auto_build=True)

        # This imitates the initial executor sync without ray.put
        weights = local_agent.get_weights()
        print('Weight type in init sync = {}'.format(type(weights)))
        worker.set_weights.remote(weights["policy_weights"],
                                  weights["value_function_weights"])
        print('Init weight sync successful.')

        # Replicate worker syncing steps as done in e.g. Ape-X executor:
        weights = ray.put(local_agent.get_weights())
        print('Weight type returned by ray put = {}'.format(type(weights)))
        print(weights)
        worker.set_weights.remote(weights["policy_weights"],
                                  weights["value_function_weights"])
        print('Object store weight sync successful.')
Example #9
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    worker.execute_episodes(100, use_exploration=True)

    # Use exploration=True for training, False for evaluation.
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
Example #10
    def test_apex_weight_syncing(self):
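        """
        Checks that modified policy weights can be written back with
        set_policy_weights and read back unchanged via get_policy_weights.
        """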
        env = RandomEnv(state_space=spaces.IntBox(2),
                        action_space=spaces.IntBox(2),
                        deterministic=True)

        agent = Agent.from_spec(
            config_from_path("configs/apex_agent_for_random_env.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        policy_weights = agent.get_policy_weights()
        print('policy weights: {}'.format(policy_weights))

        for variable, weights in policy_weights.items():
            weights += 0.01
        agent.set_policy_weights(policy_weights)

        new_weights = agent.get_policy_weights()
        recursive_assert_almost_equal(policy_weights, new_weights)
Example #11
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)
    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps,
                                  **kwargs):
        episode_returns.append(episode_return)
        finished_episodes = len(episode_returns)
        if finished_episodes % 4 == 0:
            print(
                "Episode {} finished in {:d}sec: total avg. reward={:.2f}; last 10 episodes={:.2f}; last "
                "100 episodes={:.2f}".format(
                    finished_episodes, int(duration), np.mean(episode_returns),
                    np.mean(episode_returns[-min(finished_episodes, 10):]),
                    np.mean(episode_returns[-min(finished_episodes, 100):])))

    worker = SingleThreadedWorker(
        env_spec=env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(500000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
Example #12
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    # Override openAI gym env per command line.
    if FLAGS.env is None:
        env_spec = agent_config["environment_spec"]
    else:
        env_spec = dict(type="openai-gym", gym_env=FLAGS.env)
    # Override number of visualized envs per command line.
    if FLAGS.visualize != -1:
        env_spec["visualize"] = FLAGS.visualize

    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = Agent.from_spec(
        agent_config,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    dummy_env.terminate()

    learn_updates = 6000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(3)
Example #13
    def test_weights_getting_setting(self):
        """
        Tests getting and setting of the Agent's weights.
        """
        env = GridWorld(world="2x2")
        agent = Agent.from_spec(
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        new_weights = {}
        for key, weight in weights["policy_weights"].items():
            new_weights[key] = weight + 0.01

        agent.set_weights(new_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(new_actual_weights["policy_weights"],
                                      new_weights)
Example #14
    def test_apex_weight_syncing(self):
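        """
        Perturbs the agent's policy weights in place, writes them back via
        set_weights, and verifies that get_weights returns the updated values.
        """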
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = Agent.from_spec(
            agent_config,
            state_space=environment.state_space,
            action_space=environment.action_space
        )

        weights = agent.get_weights()["policy_weights"]
        print("type weights = ", type(weights))
        for variable, value in weights.items():
            print("Type value = ", type(value))
            value += 0.01
        agent.set_weights(weights)

        new_weights = agent.get_weights()["policy_weights"]
        recursive_assert_almost_equal(weights, new_weights)
Example #15
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
Example #16
    def run_experiment(self, environment, experiment_num=0):
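        """
        Wraps the given environment, builds an agent from the experiment config,
        optionally loads a saved model, and runs a SingleThreadedWorker for the
        configured number of timesteps or episodes. Returns per-episode statistics.
        """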
        environment = RLgraphEnvironmentWrapper(environment)
        environment.add_episode_end_callback(self.episode_finished,
                                             environment,
                                             runner_id=1)

        config = copy(self.config)

        max_episodes = config.pop('max_episodes', None)
        max_timesteps = config.pop('max_timesteps', None)
        max_episode_timesteps = config.pop('max_episode_timesteps')

        agent = Agent.from_spec(
            spec=config,
            state_space=environment.state_space,
            action_space=environment.action_space,
        )

        if experiment_num == 0 and self.load_model_file:
            logging.info("Loading model data from file: {}".format(
                self.load_model))
            agent.load_model(self.load_model_file)

        runner = SingleThreadedWorker(agent=agent, environment=environment)

        environment.reset()
        agent.reset_buffers()

        if max_timesteps:
            runner.execute_timesteps(
                num_timesteps=max_timesteps,
                max_timesteps_per_episode=max_episode_timesteps)
        else:
            runner.execute_episodes(
                num_episodes=max_episodes,
                max_timesteps_per_episode=max_episode_timesteps)

        return dict(initial_reset_time=0,
                    episode_rewards=runner.episode_rewards,
                    episode_timesteps=runner.episode_steps,
                    episode_end_times=runner.episode_durations)
Example #17
    def test_update_from_external(self):
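        """
        Builds an Ape-X style agent and feeds it a single externally constructed
        batch of 200 transitions through agent.update().
        """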
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = Agent.from_spec(
            agent_config,
            state_space=environment.state_space,
            action_space=environment.action_space
        )

        batch = {
            "states": agent.preprocessed_state_space.sample(200),
            "actions": environment.action_space.sample(200),
            "rewards": np.zeros(200, dtype=np.float32),
            "terminals": [False] * 200,
            "next_states": agent.preprocessed_state_space.sample(200),
            "importance_weights":  np.ones(200, dtype=np.float32)
        }

        agent.update(batch)
Example #18
    def __init__(self, config: Dict[str, any], result_dir: str,
                 cache_stats: CacheInformation):
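        """
        Sets up the RL-based eviction strategy: a TTL cache for incomplete
        experiences, a local view of the cache contents, a binary evict/keep agent
        built via Agent.from_spec, and per-run file loggers.
        """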
        super().__init__(config, result_dir, cache_stats)
        # evaluation specific variables
        self.observation_seen = 0
        self.episode_reward = 0
        self.checkpoint_steps = config['checkpoint_steps']

        self._incomplete_experiences = TTLCache(InMemoryStorage())
        self._incomplete_experiences.expired_entry_callback(
            self._observe_expired_incomplete_experience)
        self.view_of_the_cache = {}  # type: Dict[str, Dict[str, any]]
        self._end_episode_observation = {
            ObservationType.Invalidate, ObservationType.Miss,
            ObservationType.Expiration
        }

        # TODO refactor into common RL interface for all strategies
        # Agent configuration (can be shared with others)
        agent_config = config['agent_config']
        fields_in_state = len(EvictionAgentSystemState.__slots__)
        self.converter = EvictionStrategyRLConverter(self.result_dir)

        # State: fields to observe in question
        # Action: to evict or not that key
        self.agent = Agent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(fields_in_state, )),
            action_space=IntBox(low=0, high=2))

        self.logger = logging.getLogger(__name__)
        name = 'rl_eviction_strategy'
        self.reward_logger = create_file_logger(name=f'{name}_reward_logger',
                                                result_dir=self.result_dir)
        self.loss_logger = create_file_logger(name=f'{name}_loss_logger',
                                              result_dir=self.result_dir)
        self.observation_logger = create_file_logger(
            name=f'{name}_observation_logger', result_dir=self.result_dir)
        self.key_vocab = Vocabulary()
Example #19
def main(argv):
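    """
    Loads a DQN agent config, builds the requested OpenAI Gym environment, and runs
    200 training episodes while reporting the average reward every 10 episodes.
    """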
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
Example #20
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)
    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        #synchronous_reset=True,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(100000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
Example #21
    def __init__(self, config: Dict[str, any], result_dir: str,
                 cache_stats: CacheInformation):
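        """
        Sets up the RL-based TTL strategy: a TTL cache for in-flight observations and
        an agent with a continuous action space in [0, max_ttl] that outputs a TTL
        value, plus per-run file loggers.
        """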
        super().__init__(config, result_dir, cache_stats)
        self.observation_seen = 0
        self.cum_reward = 0
        self.checkpoint_steps = config['checkpoint_steps']

        self._incomplete_experiences = TTLCache(InMemoryStorage())
        self._incomplete_experiences.expired_entry_callback(
            self._observe_expiry_eviction)
        self.non_terminal_observations = {
            ObservationType.EvictionPolicy, ObservationType.Expiration
        }
        agent_config = config['agent_config']
        self.maximum_ttl = config['max_ttl']
        self.experimental_reward = config.get('experimental_reward', False)
        fields_in_state = len(TTLAgentSystemState.__slots__)
        self.agent = Agent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(fields_in_state, )),
            action_space=FloatBox(low=0, high=self.maximum_ttl, shape=(1, )))

        # TODO refactor into common RL interface for all strategies
        self.logger = logging.getLogger(__name__)
        name = 'rl_ttl_strategy'
        self.reward_logger = create_file_logger(name=f'{name}_reward_logger',
                                                result_dir=self.result_dir)
        self.loss_logger = create_file_logger(name=f'{name}_loss_logger',
                                              result_dir=self.result_dir)
        self.ttl_logger = create_file_logger(name=f'{name}_ttl_logger',
                                             result_dir=self.result_dir)
        self.observation_logger = create_file_logger(
            name=f'{name}_observation_logger', result_dir=self.result_dir)
        self.key_vocab = Vocabulary()
        self.errors = create_file_logger(name=f'{name}_error_logger',
                                         result_dir=self.result_dir)
Example #22
    def test_sequential_vector_env(self):
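        """
        Steps a SequentialVectorEnv with a DQN agent, resetting individual sub-envs
        on termination, and prints the overall throughput in states/s.
        """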
        vector_env = SequentialVectorEnv(num_environments=self.num_vector_envs,
                                         env_spec=self.env_spec,
                                         num_background_envs=2)
        agent = Agent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            config_from_path("configs/dqn_vector_env.json"),
            state_space=vector_env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=vector_env.action_space)

        states = vector_env.reset_all()
        start = time.monotonic()
        ep_lengths = [0 for _ in range_(self.num_vector_envs)]

        for _ in range_(int(self.samples / self.num_vector_envs)):
            # Sample all envs at once.
            actions, preprocessed_states = agent.get_action(
                states, extra_returns="preprocessed_states")
            states, rewards, terminals, infos = vector_env.step(actions)
            ep_lengths = [ep_length + 1 for ep_length in ep_lengths]

            for i, terminal in enumerate(terminals):
                if terminal:
                    print("reset env {} after {} states".format(
                        i, ep_lengths[i]))
                    vector_env.reset(i)
                    ep_lengths[i] = 0

        runtime = time.monotonic() - start
        tp = self.samples / runtime

        print('Testing vector env {} performance:'.format(
            self.env_spec["gym_env"]))
        print('Ran {} steps, throughput: {} states/s, total time: {} s'.format(
            self.samples, tp, runtime))
Example #23
    def setup_execution(self):
        # Create local worker agent according to spec.
        # Extract states and actions space.
        environment = None
        if isinstance(self.environment_spec, dict):
            environment = Environment.from_spec(self.environment_spec)
        elif hasattr(self.environment_spec, '__call__'):
            environment = self.environment_spec()
        self.agent_config["state_space"] = environment.state_space
        self.agent_config["action_space"] = environment.action_space

        # Start Ray cluster and connect to it.
        self.local_agent = Agent.from_spec(self.agent_config)

        # Set up worker thread for performing updates.
        self.update_worker = UpdateWorker(
            agent=self.local_agent,
            in_queue_size=self.executor_spec["learn_queue_size"])
        self.ray_init()

        # Create remote sample workers based on ray cluster spec.
        self.num_replay_workers = self.executor_spec["num_replay_workers"]
        self.num_sample_workers = self.executor_spec["num_sample_workers"]

        self.logger.info("Initializing {} local replay memories.".format(
            self.num_replay_workers))
        # Update memory size for num of workers
        shard_size = int(self.apex_replay_spec["memory_spec"]["capacity"] /
                         self.num_replay_workers)
        self.apex_replay_spec["memory_spec"]["capacity"] = shard_size
        self.logger.info("Shard size per memory: {}".format(
            self.apex_replay_spec["memory_spec"]["capacity"]))
        min_sample_size = self.apex_replay_spec["min_sample_memory_size"]
        self.apex_replay_spec["min_sample_memory_size"] = int(
            min_sample_size / self.num_replay_workers)
        self.logger.info("Sampling for learning starts at: {}".format(
            self.apex_replay_spec["min_sample_memory_size"]))

        # Set sample batch size:
        self.apex_replay_spec["sample_batch_size"] = self.agent_config[
            "update_spec"]["batch_size"]
        self.logger.info("Sampling batch size {}".format(
            self.apex_replay_spec["sample_batch_size"]))

        self.ray_local_replay_memories = create_colocated_ray_actors(
            cls=RayMemoryActor.as_remote(
                num_cpus=self.num_cpus_per_replay_actor),
            config=self.apex_replay_spec,
            num_agents=self.num_replay_workers)

        # Create remote workers for data collection.
        self.worker_spec["worker_sample_size"] = self.worker_sample_size
        self.logger.info(
            "Initializing {} remote data collection agents, sample size: {}".
            format(self.num_sample_workers,
                   self.worker_spec["worker_sample_size"]))
        self.ray_env_sample_workers = self.create_remote_workers(
            RayValueWorker,
            self.num_sample_workers,
            self.agent_config,
            # *args
            self.worker_spec,
            self.environment_spec,
            self.worker_frame_skip)
        self.init_tasks()
Example #24
agent_config_path = os.path.abspath(os.path.dirname(
    os.path.dirname(__file__))) + '/agents/ppoSmartPrimer_config.json'

with open(agent_config_path, 'rt') as fp:
    agent_config = json.load(fp)

env = OpenAIGymEnv.from_spec({
    "type": "openai",
    "gym_env": 'gym_SmartPrimer:SmartPrimer-realistic-v2'
})

agent = Agent.from_spec(agent_config,
                        state_space=env.state_space,
                        action_space=env.action_space)

episode_returns = []


def episode_finished_callback(episode_return, duration, timesteps, *args,
                              **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
              format(len(episode_returns), episode_return,
                     np.mean(episode_returns[-100:])))


# NOTE: the source snippet is truncated here; the arguments below are an assumed
# completion following the worker setup used in the other examples.
worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback)
Example #25
    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            discount=0.95)
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld(world="2x2", save_mode=True),
            agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunc object.
        loss_func = DQNLossFunction(backend="python",
                                    double_q=True,
                                    discount=agent.discount)
        loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.IntBox(4, add_batch_rank=True),
            spaces.FloatBox(add_batch_rank=True),
            spaces.BoolBox(add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True)
        ]),
                                      action_space=env.action_space)

        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]],
                         key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states",
                       np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- exploring is True = more random actions
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/actions",
            np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0] * 4 +  # + [-3.0] +
                     [0.0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] * 2 + [False] *
                     (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0],
                                               [1.0, 0.0, 0.0, 0.0]]),
                              actions=np.array([0, 1]),
                              rewards=np.array([-1.0, -3.0]),
                              terminals=np.array([False, True]),
                              next_states=np.array([[1.0, 0.0, 0.0, 0.0],
                                                    [0.0, 0.0, 0.0, 0.0]]))
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check against actually update weights by the AgentDQN.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet,
                                                 matrix1_target_net,
                                                 matrix2_target_net, agent,
                                                 loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False, True] * 2 + [True, False]))
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index",
                       1)  # index has been rolled over (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=4)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
            # TODO: <- This is wrong and must be fixed
            # (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent,
                                                 loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=2)  # again: old matrix
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            mat_updated[1],
            decimals=2)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=2)