def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
    super().__init__(config, result_dir, cache_stats)
    # evaluation specific variables
    self.observation_seen = 0
    self.episode_reward = 0
    self.checkpoint_steps = config['checkpoint_steps']

    self._incomplete_experiences = TTLCache(InMemoryStorage())
    self._incomplete_experiences.expired_entry_callback(self._observe_expired_incomplete_experience)
    self.experimental_reward = config.get('experimental_reward', False)
    agent_config = config['agent_config']
    self.converter = CachingStrategyRLConverter()
    # action space: should cache: true or false
    # state space: [capacity (1), query key(1), query result set(num_indexes)]
    fields_in_state = len(CachingAgentSystemState.__slots__)
    self.agent = Agent.from_spec(agent_config,
                                 state_space=FloatBox(shape=(fields_in_state,)),
                                 action_space=IntBox(2))

    self.logger = logging.getLogger(__name__)
    name = 'rl_caching_strategy'
    self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
    self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
    self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
    self.entry_hits_logger = create_file_logger(name=f'{name}_entry_hits_logger', result_dir=self.result_dir)
    self.key_vocab = Vocabulary()
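# --- Added sketch (not from the original code base): illustrates, with hypothetical
# slot names, how a __slots__-based system-state object can be flattened into the
# FloatBox(shape=(len(__slots__),)) vector the agent above expects. Field names and
# values are assumptions for illustration only.
import numpy as np


class ExampleSystemState:
    __slots__ = ['cache_capacity', 'encoded_key', 'hit_count']  # hypothetical fields

    def __init__(self, cache_capacity, encoded_key, hit_count):
        self.cache_capacity = cache_capacity
        self.encoded_key = encoded_key
        self.hit_count = hit_count

    def to_numpy(self):
        # One float per slot, in declaration order, matching fields_in_state above.
        return np.asarray([getattr(self, s) for s in self.__slots__], dtype=np.float32)


# Usage: ExampleSystemState(1000.0, 42.0, 3.0).to_numpy() -> array of shape (3,)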
def test_dqn_on_pong(self):
    """
    Creates a DQNAgent and runs it via a Runner on an openAI Pong Env.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False)
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    preprocessing_spec = agent_config.pop("preprocessor_spec")
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=self.pong_preprocessed_state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space
    )
    time_steps = 4000000
    worker = SingleThreadedWorker(env_spec=lambda: env,
                                  agent=agent,
                                  render=True,
                                  preprocessing_spec=preprocessing_spec,
                                  worker_executes_preprocessing=True)
    results = worker.execute_timesteps(time_steps, use_exploration=True)
def test_individual_env(self):
    env = Environment.from_spec(self.env_spec)
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        config_from_path("configs/dqn_agent_for_pong.json"),
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space
    )
    state = env.reset()
    start = time.monotonic()
    ep_length = 0

    for _ in range_(self.samples):
        action = agent.get_action(state)
        state, reward, terminal, info = env.step(action)
        ep_length += 1
        if terminal:
            print("reset after {} states".format(ep_length))
            env.reset()
            ep_length = 0

    runtime = time.monotonic() - start
    tp = self.samples / runtime
    print('Testing individual env {} performance:'.format(self.env_spec["gym_env"]))
    print('Ran {} steps, throughput: {} states/s, total time: {} s'.format(self.samples, tp, runtime))
def test_policy_and_vf_weight_syncing(self):
    """
    Tests weight synchronization with a local agent and a remote worker.
    """
    # First, create a local agent
    env_spec = dict(type="openai", gym_env="CartPole-v0")
    env = Environment.from_spec(env_spec)
    agent_config = config_from_path("configs/sync_batch_ppo_cartpole.json")
    ray_spec = agent_config["execution_spec"].pop("ray_spec")
    local_agent = Agent.from_spec(agent_config,
                                  state_space=env.state_space,
                                  action_space=env.action_space)
    ray_spec["worker_spec"]["worker_sample_size"] = 50

    # Create a remote worker with the same agent config.
    worker = RayPolicyWorker.as_remote().remote(agent_config, ray_spec["worker_spec"],
                                                self.env_spec, auto_build=True)

    # This imitates the initial executor sync without ray.put
    weights = RayWeight(local_agent.get_weights())
    print('Weight type in init sync = {}'.format(type(weights)))
    print("Weights = ", weights)
    worker.set_weights.remote(weights)
    print('Init weight sync successful.')

    # Replicate worker syncing steps as done in e.g. Ape-X executor:
    weights = RayWeight(local_agent.get_weights())
    print('Weight type returned by ray put = {}'.format(type(weights)))
    print(weights)
    ret = worker.set_weights.remote(weights)
    ray.wait([ret])
    print('Object store weight sync successful.')
def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
    super().__init__(config, result_dir, cache_stats)
    self.supported_observations = {
        ObservationType.Hit,
        ObservationType.Miss,
        ObservationType.Invalidate
    }

    # evaluation specific variables
    self.observation_seen = 0
    self.cum_reward = 0
    self.checkpoint_steps = config['checkpoint_steps']

    self._incomplete_experiences = TTLCache(InMemoryStorage())
    self._incomplete_experiences.expired_entry_callback(self._observe_expiry_eviction)
    self.non_terminal_observations = {
        ObservationType.EvictionPolicy,
        ObservationType.Expiration
    }
    agent_config = config['agent_config']
    self.maximum_ttl = config['max_ttl']

    fields_in_state = len(MultiTaskAgentSystemState.__slots__)
    action_space = RLDict({
        'ttl': IntBox(low=0, high=self.maximum_ttl),
        'eviction': IntBox(low=0, high=2)
    })
    self.agent = Agent.from_spec(agent_config,
                                 state_space=FloatBox(shape=(fields_in_state,)),
                                 action_space=action_space)

    # TODO refactor into common RL interface for all strategies
    self.logger = logging.getLogger(__name__)
    name = 'rl_multi_strategy'
    self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
    self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
    self.ttl_logger = create_file_logger(name=f'{name}_ttl_logger', result_dir=self.result_dir)
    self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
    self.performance_logger = create_file_logger(name=f'{name}_performance_logger', result_dir=self.result_dir)
    self.key_vocab = Vocabulary()
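# --- Added sketch (assumption, not from the original strategy): with a Dict action
# space like the one above, RLgraph container spaces sample to dict-structured
# actions, so one action carries both sub-decisions. The imports and the max_ttl
# value below are illustrative assumptions.
from rlgraph.spaces import Dict as RLDict, IntBox

action_space = RLDict({
    'ttl': IntBox(low=0, high=86400),   # illustrative max_ttl of one day
    'eviction': IntBox(low=0, high=2)   # binary evict / keep decision
})

action = action_space.sample()
# Expected structure: {'ttl': <int in [0, 86400)>, 'eviction': <0 or 1>}
print(action['ttl'], action['eviction'])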
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-10:])))

    worker = SingleThreadedWorker(env_spec=lambda: env,
                                  agent=agent,
                                  render=False,
                                  worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor critic is very sensitive to hyper-parameters and might collapse after
    # reaching the maximum reward. In practice, it would be recommended to stop training when a
    # reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
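# --- Added sketch (assumption, not from the original example scripts): the main()
# functions in this collection rely on module-level absl flags such as `config`,
# `env` and (here) `visualize`. A plausible definition block, with illustrative
# defaults, could look like this:
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('config', './configs/dqn_cartpole.json', 'Path to the agent config JSON.')  # illustrative default
flags.DEFINE_string('env', 'CartPole-v0', 'Name of the openAI gym environment.')
flags.DEFINE_boolean('visualize', False, 'Whether to render the environment.')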
def test_update_throughput(self):
    env = Environment.from_spec(self.env_spec)
    # TODO comment in for multi gpu
    # config_from_path("configs/multi_gpu_ray_apex_for_pong.json"),
    config = config_from_path("configs/ray_apex_for_pong.json")

    # Adjust to usable GPUs for test system.
    num_gpus = [1]
    for gpu_count in num_gpus:
        config["execution_spec"]["gpu_spec"]["num_gpus"] = gpu_count
        config["execution_spec"]["gpu_spec"]["per_process_gpu_memory_fraction"] = 1.0 / gpu_count

        agent = Agent.from_spec(
            # TODO replace with config from above
            config_from_path("configs/ray_apex_for_pong.json"),
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        batch_space = Dict(
            states=agent.preprocessed_state_space,
            actions=env.action_space,
            rewards=FloatBox(),
            next_states=agent.preprocessed_state_space,
            terminals=IntBox(low=0, high=1),
            importance_weights=FloatBox(),
            add_batch_rank=True
        )

        batch_size = 512 * gpu_count
        num_samples = 50
        samples = [batch_space.sample(batch_size) for _ in range(num_samples)]

        times = []
        throughputs = []
        for sample in samples:
            start = time.perf_counter()
            agent.update(sample)
            runtime = time.perf_counter() - start
            times.append(runtime)
            throughputs.append(batch_size / runtime)
        print("Throughput: {} samples / s ({}) for {} GPUs".format(
            np.mean(throughputs), np.std(throughputs), gpu_count))
def test_worker_weight_syncing(self):
    """
    Tests weight synchronization with a local agent and a remote worker.
    """
    # First, create a local agent
    env_spec = dict(
        type="openai",
        gym_env="PongNoFrameskip-v4",
        # The frameskip in the agent config will trigger worker skips, this
        # is used for internal env.
        frameskip=4,
        max_num_noops=30,
        episodic_life=True
    )
    env = Environment.from_spec(env_spec)
    agent_config = config_from_path("configs/ray_apex_for_pong.json")

    # Remove unneeded apex params.
    if "apex_replay_spec" in agent_config:
        agent_config.pop("apex_replay_spec")

    ray_spec = agent_config["execution_spec"].pop("ray_spec")
    local_agent = Agent.from_spec(agent_config,
                                  state_space=env.state_space,
                                  action_space=env.action_space)

    # Create a remote worker with the same agent config.
    worker = RayWorker.as_remote().remote(agent_config, ray_spec["worker_spec"],
                                          self.env_spec, auto_build=True)

    # This imitates the initial executor sync without ray.put
    weights = local_agent.get_weights()
    print('Weight type in init sync = {}'.format(type(weights)))
    worker.set_weights.remote(weights["policy_weights"], weights["value_function_weights"])
    print('Init weight sync successful.')

    # Replicate worker syncing steps as done in e.g. Ape-X executor:
    weights = ray.put(local_agent.get_weights())
    print('Weight type returned by ray put = {}'.format(type(weights)))
    print(weights)
    worker.set_weights.remote(weights["policy_weights"], weights["value_function_weights"])
    print('Object store weight sync successful.')
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])))

    worker = SingleThreadedWorker(env_spec=lambda: env,
                                  agent=agent,
                                  render=FLAGS.render,
                                  worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_episodes(100, use_exploration=True)
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
def test_apex_weight_syncing(self):
    env = RandomEnv(state_space=spaces.IntBox(2), action_space=spaces.IntBox(2), deterministic=True)
    agent = Agent.from_spec(
        config_from_path("configs/apex_agent_for_random_env.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    policy_weights = agent.get_policy_weights()
    print('policy weights: {}'.format(policy_weights))

    for variable, weights in policy_weights.items():
        weights += 0.01
    agent.set_policy_weights(policy_weights)

    new_weights = agent.get_policy_weights()
    recursive_assert_almost_equal(policy_weights, new_weights)
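# --- Added sketch (illustration, not part of the test above): the perturbation loop
# relies on NumPy's in-place `+=` mutating the arrays stored in the weights dict, so
# set_policy_weights() receives the shifted values. A standalone demonstration:
import numpy as np

policy_weights = {"hidden/dense/kernel": np.zeros((2, 2))}  # hypothetical variable name
for variable, weights in policy_weights.items():
    weights += 0.01  # in-place: modifies the ndarray referenced by the dict
assert np.allclose(policy_weights["hidden/dense/kernel"], 0.01)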
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()
    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        finished_episodes = len(episode_returns)
        if finished_episodes % 4 == 0:
            print(
                "Episode {} finished in {:d}sec: total avg. reward={:.2f}; last 10 episodes={:.2f}; last "
                "100 episodes={:.2f}".format(
                    finished_episodes, int(duration), np.mean(episode_returns),
                    np.mean(episode_returns[-min(finished_episodes, 10):]),
                    np.mean(episode_returns[-min(finished_episodes, 100):])))

    worker = SingleThreadedWorker(env_spec=env,
                                  agent=agent,
                                  render=False,
                                  worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(500000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    # Override openAI gym env per command line.
    if FLAGS.env is None:
        env_spec = agent_config["environment_spec"]
    else:
        env_spec = dict(type="openai-gym", gym_env=FLAGS.env)

    # Override number of visualized envs per command line.
    if FLAGS.visualize != -1:
        env_spec["visualize"] = FLAGS.visualize

    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = Agent.from_spec(
        agent_config,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    dummy_env.terminate()

    learn_updates = 6000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])))

    time.sleep(1)
    agent.terminate()
    time.sleep(3)
def test_weights_getting_setting(self):
    """
    Tests getting and setting of the Agent's weights.
    """
    env = GridWorld(world="2x2")
    agent = Agent.from_spec(
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    weights = agent.get_weights()
    new_weights = {}
    for key, weight in weights["policy_weights"].items():
        new_weights[key] = weight + 0.01

    agent.set_weights(new_weights)
    new_actual_weights = agent.get_weights()

    recursive_assert_almost_equal(new_actual_weights["policy_weights"], new_weights)
def test_apex_weight_syncing(self):
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)

    agent = Agent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )

    weights = agent.get_weights()["policy_weights"]
    print("type weights = ", type(weights))
    for variable, value in weights.items():
        print("Type value = ", type(value))
        value += 0.01
    agent.set_weights(weights)

    new_weights = agent.get_weights()["policy_weights"]
    recursive_assert_almost_equal(weights, new_weights)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
def run_experiment(self, environment, experiment_num=0):
    environment = RLgraphEnvironmentWrapper(environment)
    environment.add_episode_end_callback(self.episode_finished, environment, runner_id=1)

    config = copy(self.config)
    max_episodes = config.pop('max_episodes', None)
    max_timesteps = config.pop('max_timesteps', None)
    max_episode_timesteps = config.pop('max_episode_timesteps')

    agent = Agent.from_spec(
        spec=config,
        state_space=environment.state_space,
        action_space=environment.action_space,
    )

    if experiment_num == 0 and self.load_model_file:
        logging.info("Loading model data from file: {}".format(self.load_model_file))
        agent.load_model(self.load_model_file)

    runner = SingleThreadedWorker(agent=agent, environment=environment)

    environment.reset()
    agent.reset_buffers()

    if max_timesteps:
        runner.execute_timesteps(num_timesteps=max_timesteps,
                                 max_timesteps_per_episode=max_episode_timesteps)
    else:
        runner.execute_episodes(num_episodes=max_episodes,
                                max_timesteps_per_episode=max_episode_timesteps)

    return dict(
        initial_reset_time=0,
        episode_rewards=runner.episode_rewards,
        episode_timesteps=runner.episode_steps,
        episode_end_times=runner.episode_durations
    )
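# --- Added sketch (assumption, not taken from the original experiment configs): the
# config consumed by run_experiment() mixes runner limits with the RLgraph agent
# spec; the run-limit keys are popped off before the remainder is handed to
# Agent.from_spec. The agent-spec fields shown are placeholders only.
example_experiment_config = {
    # Runner limits (popped by run_experiment):
    "max_episodes": 500,
    "max_timesteps": None,
    "max_episode_timesteps": 200,
    # Remaining keys form the RLgraph agent spec (placeholder values):
    "type": "dqn",
    "discount": 0.99,
}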
def test_update_from_external(self):
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)

    agent = Agent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )

    batch = {
        "states": agent.preprocessed_state_space.sample(200),
        "actions": environment.action_space.sample(200),
        "rewards": np.zeros(200, dtype=np.float32),
        "terminals": [False] * 200,
        "next_states": agent.preprocessed_state_space.sample(200),
        "importance_weights": np.ones(200, dtype=np.float32)
    }
    agent.update(batch)
def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
    super().__init__(config, result_dir, cache_stats)
    # evaluation specific variables
    self.observation_seen = 0
    self.episode_reward = 0
    self.checkpoint_steps = config['checkpoint_steps']

    self._incomplete_experiences = TTLCache(InMemoryStorage())
    self._incomplete_experiences.expired_entry_callback(self._observe_expired_incomplete_experience)

    self.view_of_the_cache = {}  # type: Dict[str, Dict[str, any]]
    self._end_episode_observation = {
        ObservationType.Invalidate,
        ObservationType.Miss,
        ObservationType.Expiration
    }

    # TODO refactor into common RL interface for all strategies
    # Agent configuration (can be shared with others)
    agent_config = config['agent_config']
    fields_in_state = len(EvictionAgentSystemState.__slots__)
    self.converter = EvictionStrategyRLConverter(self.result_dir)

    # State: fields of the key in question to observe
    # Action: whether or not to evict that key
    self.agent = Agent.from_spec(agent_config,
                                 state_space=FloatBox(shape=(fields_in_state,)),
                                 action_space=IntBox(low=0, high=2))

    self.logger = logging.getLogger(__name__)
    name = 'rl_eviction_strategy'
    self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
    self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
    self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
    self.key_vocab = Vocabulary()
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(env_spec=lambda: env,
                                  agent=agent,
                                  render=False,
                                  worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()
    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(env_spec=env,
                                  agent=agent,
                                  render=False,
                                  worker_executes_preprocessing=False,
                                  # synchronous_reset=True,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    # Use exploration=True for training, False for evaluation.
    worker.execute_timesteps(100000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
    super().__init__(config, result_dir, cache_stats)
    self.observation_seen = 0
    self.cum_reward = 0
    self.checkpoint_steps = config['checkpoint_steps']

    self._incomplete_experiences = TTLCache(InMemoryStorage())
    self._incomplete_experiences.expired_entry_callback(self._observe_expiry_eviction)
    self.non_terminal_observations = {
        ObservationType.EvictionPolicy,
        ObservationType.Expiration
    }
    agent_config = config['agent_config']
    self.maximum_ttl = config['max_ttl']
    self.experimental_reward = config.get('experimental_reward', False)

    fields_in_state = len(TTLAgentSystemState.__slots__)
    self.agent = Agent.from_spec(agent_config,
                                 state_space=FloatBox(shape=(fields_in_state,)),
                                 action_space=FloatBox(low=0, high=self.maximum_ttl, shape=(1,)))

    # TODO refactor into common RL interface for all strategies
    self.logger = logging.getLogger(__name__)
    name = 'rl_ttl_strategy'
    self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
    self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
    self.ttl_logger = create_file_logger(name=f'{name}_ttl_logger', result_dir=self.result_dir)
    self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
    self.key_vocab = Vocabulary()
    self.errors = create_file_logger(name=f'{name}_error_logger', result_dir=self.result_dir)
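# --- Added sketch (assumption, not from the original strategy): with the continuous
# FloatBox action space above, each action is a length-1 float vector in
# [0, max_ttl] that can be read out as the TTL to assign. The RLgraph import and
# the max_ttl value below are illustrative assumptions.
from rlgraph.spaces import FloatBox

max_ttl = 3600.0  # illustrative one-hour cap
ttl_action_space = FloatBox(low=0, high=max_ttl, shape=(1,))

sampled_action = ttl_action_space.sample()  # e.g. array([1234.5], dtype=float32)
chosen_ttl = float(sampled_action[0])       # scalar TTL handed to the cache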
def test_sequential_vector_env(self):
    vector_env = SequentialVectorEnv(num_environments=self.num_vector_envs,
                                     env_spec=self.env_spec,
                                     num_background_envs=2)
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        config_from_path("configs/dqn_vector_env.json"),
        state_space=vector_env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=vector_env.action_space
    )

    states = vector_env.reset_all()
    start = time.monotonic()
    ep_lengths = [0 for _ in range_(self.num_vector_envs)]

    for _ in range_(int(self.samples / self.num_vector_envs)):
        # Sample all envs at once.
        actions, preprocessed_states = agent.get_action(states, extra_returns="preprocessed_states")
        states, rewards, terminals, infos = vector_env.step(actions)
        ep_lengths = [ep_length + 1 for ep_length in ep_lengths]
        for i, terminal in enumerate(terminals):
            if terminal:
                print("reset env {} after {} states".format(i, ep_lengths[i]))
                vector_env.reset(i)
                ep_lengths[i] = 0

    runtime = time.monotonic() - start
    tp = self.samples / runtime
    print('Testing vector env {} performance:'.format(self.env_spec["gym_env"]))
    print('Ran {} steps, throughput: {} states/s, total time: {} s'.format(self.samples, tp, runtime))
def setup_execution(self):
    # Create local worker agent according to spec.
    # Extract states and actions space.
    environment = None
    if isinstance(self.environment_spec, dict):
        environment = Environment.from_spec(self.environment_spec)
    elif hasattr(self.environment_spec, '__call__'):
        environment = self.environment_spec()
    self.agent_config["state_space"] = environment.state_space
    self.agent_config["action_space"] = environment.action_space

    # Start Ray cluster and connect to it.
    self.local_agent = Agent.from_spec(self.agent_config)

    # Set up worker thread for performing updates.
    self.update_worker = UpdateWorker(
        agent=self.local_agent,
        in_queue_size=self.executor_spec["learn_queue_size"]
    )
    self.ray_init()

    # Create remote sample workers based on ray cluster spec.
    self.num_replay_workers = self.executor_spec["num_replay_workers"]
    self.num_sample_workers = self.executor_spec["num_sample_workers"]

    self.logger.info("Initializing {} local replay memories.".format(self.num_replay_workers))
    # Update memory size for num of workers
    shard_size = int(self.apex_replay_spec["memory_spec"]["capacity"] / self.num_replay_workers)
    self.apex_replay_spec["memory_spec"]["capacity"] = shard_size
    self.logger.info("Shard size per memory: {}".format(self.apex_replay_spec["memory_spec"]["capacity"]))
    min_sample_size = self.apex_replay_spec["min_sample_memory_size"]
    self.apex_replay_spec["min_sample_memory_size"] = int(min_sample_size / self.num_replay_workers)
    self.logger.info("Sampling for learning starts at: {}".format(
        self.apex_replay_spec["min_sample_memory_size"]))

    # Set sample batch size:
    self.apex_replay_spec["sample_batch_size"] = self.agent_config["update_spec"]["batch_size"]
    self.logger.info("Sampling batch size {}".format(self.apex_replay_spec["sample_batch_size"]))

    self.ray_local_replay_memories = create_colocated_ray_actors(
        cls=RayMemoryActor.as_remote(num_cpus=self.num_cpus_per_replay_actor),
        config=self.apex_replay_spec,
        num_agents=self.num_replay_workers
    )

    # Create remote workers for data collection.
    self.worker_spec["worker_sample_size"] = self.worker_sample_size
    self.logger.info("Initializing {} remote data collection agents, sample size: {}".format(
        self.num_sample_workers, self.worker_spec["worker_sample_size"]))
    self.ray_env_sample_workers = self.create_remote_workers(
        RayValueWorker, self.num_sample_workers, self.agent_config,
        # *args
        self.worker_spec, self.environment_spec, self.worker_frame_skip
    )
    self.init_tasks()
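# --- Added sketch (assumption, not taken from the shipped configs): the fields
# setup_execution() reads from executor_spec and apex_replay_spec, with illustrative
# values. Only the keys referenced above are shown; real configs contain more.
example_executor_spec = {
    "learn_queue_size": 16,        # update-worker in-queue size
    "num_replay_workers": 4,       # replay memory shards
    "num_sample_workers": 8,       # remote data-collection agents
}

example_apex_replay_spec = {
    "memory_spec": {"capacity": 2000000},  # split evenly across replay workers
    "min_sample_memory_size": 50000,       # also divided by num_replay_workers
    # "sample_batch_size" is filled in from agent_config["update_spec"]["batch_size"]
}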
agent_config_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + '/agents/ppoSmartPrimer_config.json'

with open(agent_config_path, 'rt') as fp:
    agent_config = json.load(fp)

env = OpenAIGymEnv.from_spec({
    "type": "openai",
    "gym_env": 'gym_SmartPrimer:SmartPrimer-realistic-v2'
})

agent = Agent.from_spec(agent_config,
                        state_space=env.state_space,
                        action_space=env.action_space)

episode_returns = []

def episode_finished_callback(episode_return, duration, timesteps, *args, **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
            len(episode_returns), episode_return, np.mean(episode_returns[-100:])))

worker = SingleThreadedWorker(
def test_dqn_functionality(self):
    """
    Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
    all steps of the learning process.
    """
    env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
    agent = Agent.from_spec(  # type: DQNAgent
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        double_q=True,
        dueling_q=True,
        state_space=env.state_space,
        action_space=env.action_space,
        discount=0.95
    )
    worker = SingleThreadedWorker(env_spec=lambda: GridWorld(world="2x2", save_mode=True), agent=agent)
    test = AgentTest(worker=worker)

    # Helper python DQNLossFunc object.
    loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
    loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
        spaces.FloatBox(shape=(4,), add_batch_rank=True),
        spaces.IntBox(4, add_batch_rank=True),
        spaces.FloatBox(add_batch_rank=True),
        spaces.BoolBox(add_batch_rank=True),
        spaces.FloatBox(shape=(4,), add_batch_rank=True),
        spaces.FloatBox(shape=(4,), add_batch_rank=True)
    ]), action_space=env.action_space)

    matrix1_qnet = np.array([[0.9] * 2] * 4)
    matrix2_qnet = np.array([[0.8] * 5] * 2)
    matrix1_target_net = np.array([[0.9] * 2] * 4)
    matrix2_target_net = np.array([[0.8] * 5] * 2)

    a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

    # 1st step -> Expect insert into python-buffer.
    # action: up (0)
    test.step(1, reset=True)
    # Environment's new state.
    test.check_env("state", 0)
    # Agent's buffer.
    test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
    test.check_agent("actions_buffer", [a], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    # Memory contents.
    test.check_var("replay-memory/index", 0)
    test.check_var("replay-memory/size", 0)
    test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
    test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 2nd step -> expect insert into memory (and python buffer should be empty again).
    # action: up (0)
    # Also check the policy and target policy values (should be equal at this point).
    test.step(1)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 2)
    test.check_var("replay-memory/size", 2)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                            [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/actions",
                   np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/rewards",
                   np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/terminals",
                   np.array([False, True] + [False] * (agent.memory.capacity - 2)))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
    # actions: down (2), up (0) <- exploring is True = more random actions
    # Expect an update to the policy variables (leave target as is (no sync yet)).
    test.step(2, use_exploration=True)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 4)
    test.check_var("replay-memory/size", 4)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                            [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/actions",
                   np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/rewards",
                   np.array([-1.0] * 4 +  # + [-3.0] +
                            [0.0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/terminals",
                   np.array([False, True] * 2 + [False] * (agent.memory.capacity - 4)))

    # Get the latest memory batch.
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([False, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
    )
    test.check_agent("last_memory_batch", expected_batch)

    # Calculate the weight updates and check against the actually updated weights of the DQNAgent.
    mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet, matrix2_qnet,
                                             matrix1_target_net, matrix2_target_net, agent, loss_func)

    # Check policy and target-policy weights (policy should be updated now).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   mat_updated[1], decimals=4)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   matrix2_target_net)

    matrix1_qnet = mat_updated[0]
    matrix2_qnet = mat_updated[1]

    # 5th step -> Another buffer update check.
    # action: down (2) (weights have been updated -> different actions)
    test.step(1)
    test.check_env("state", 3)
    # <- all empty b/c we reached end of episode (buffer gets force-flushed)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 5)
    test.check_var("replay-memory/size", 5)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                            [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   mat_updated[1], decimals=4)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   matrix2_target_net)

    # 6th/7th step (with exploration enabled) -> Another buffer update check.
    # action: up, down (0, 2)
    test.step(2, use_exploration=True)
    test.check_env("state", 1)
    # <- all empty again; flushed after 6th step (when buffer was full).
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)  # index has been rolled over (memory capacity is 6)
    test.check_var("replay-memory/size", 6)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                            [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

    # 8th step -> Another buffer update check and weights update and sync.
    # action: down (2)
    test.step(1)
    test.check_env("state", 1)
    test.check_agent("states_buffer", [1], key_or_index="env_0")
    test.check_agent("actions_buffer", [2], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([True, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
        # TODO: <- This is wrong and must be fixed
        # (next-state of first item is from a previous insert and unrelated to first item)
    )
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)
    test.check_var("replay-memory/size", 6)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                            [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

    # Assume that the sync happens first (matrices are already the same when updating).
    mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet, matrix2_qnet,
                                             matrix1_qnet, matrix2_qnet, agent, loss_func)

    # Now target-net should be again 1 step behind policy-net.
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=2)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=2)