class EvalTensorplexMonitor(EpisodeMonitor):
    def __init__(self, env, eval_id, fetch_parameter, session_config,
                 separate_plots=False):
        """
        Display "reward" and "step_per_s" curves on Tensorboard

        Args:
            env:
            eval_id:
            fetch_parameter: lambda function that pulls from parameter server
            session_config: to construct AgentTensorplex
                - interval: log to Tensorplex every N episodes.
                - average_episodes: average rewards/speed over the last N episodes
            separate_plots: True to put reward plot in a separate section on
                Tensorboard, False to put all plots together
        """
        super().__init__(env)
        self.tensorplex = get_tensorplex_client(
            '{}/{}'.format('eval', eval_id),
            session_config
        )
        interval = session_config['tensorplex']['update_schedule']['eval_env']
        self._periodic = PeriodicTracker(interval)
        self._avg = interval
        self._separate_plots = separate_plots
        self._throttle_sleep = \
            session_config['tensorplex']['update_schedule']['eval_env_sleep']
        self._fetch_parameter = fetch_parameter
        self._fetch_parameter()  # if this eval is late to the party

    def _get_tag(self, tag):
        if self._separate_plots:
            return ':' + tag  # see Tensorplex tag semantics
        else:
            return tag

    def _step(self, action):
        ob, r, done, info = super()._step(action)
        if done and self._periodic.track_increment():
            scalar_values = {
                self._get_tag('reward'): U.mean(self.episode_rewards[-self._avg:]),
                'step_per_s': self.step_per_sec(self._avg),
            }
            self.tensorplex.add_scalars(
                scalar_values,
                global_step=self.num_episodes
            )
            time.sleep(self._throttle_sleep)
            self._fetch_parameter()
        return ob, r, done, info
class TrainingTensorplexMonitor(EpisodeMonitor):
    def __init__(self, env, agent_id, session_config, separate_plots=True):
        """
        Display "reward" and "step_per_s" curves on Tensorboard

        Args:
            env:
            agent_id: int.
            session_config: to construct AgentTensorplex
                - interval: log to Tensorplex every N episodes.
                - average_episodes: average rewards/speed over the last N episodes
            separate_plots: True to put reward plot in a separate section on
                Tensorboard, False to put all plots together
        """
        super().__init__(env)
        U.assert_type(agent_id, int)
        self.tensorplex = get_tensorplex_client(
            '{}/{}'.format('agent', agent_id),
            session_config
        )
        interval = session_config['tensorplex']['update_schedule']['training_env']
        self._periodic = PeriodicTracker(interval)
        self._avg = interval
        self._separate_plots = separate_plots

    def _get_tag(self, tag):
        if self._separate_plots:
            return ':' + tag  # see Tensorplex tag semantics
        else:
            return tag

    def _step(self, action):
        ob, r, done, info = super()._step(action)
        if done and self._periodic.track_increment():
            scalar_values = {
                self._get_tag('reward'): U.mean(self.episode_rewards[-self._avg:]),
                'step_per_s': self.step_per_sec(self._avg),
            }
            self.tensorplex.add_scalars(
                scalar_values,
                global_step=self.num_episodes
            )
        return ob, r, done, info
class ConsoleMonitor(EpisodeMonitor):
    def __init__(self, env,
                 update_interval=10,
                 average_over=10,
                 extra_rows=None):
        """
        Args:
            update_interval: print every N episodes
            average_over: average rewards/speed over the last N episodes
            extra_rows: an OrderedDict
                {'row caption': function(total_steps, num_episodes)}
                to generate extra rows in the printed table.
        """
        super().__init__(env)
        self._periodic = PeriodicTracker(update_interval)
        self._avg = average_over
        if extra_rows is None:
            self._extra_rows = OrderedDict()
        else:
            assert isinstance(extra_rows, OrderedDict), \
                'extra_rows spec {"row caption": function(total_steps, ' \
                'num_episodes)} must be an OrderedDict'
            self._extra_rows = extra_rows

    def _step(self, action):
        ob, r, done, info = super()._step(action)
        if done and self._periodic.track_increment():
            info_table = []
            avg_reward = U.mean(self.episode_rewards[-self._avg:])
            info_table.append(['Last {} rewards'.format(self._avg),
                               U.fformat(avg_reward, 3)])
            avg_speed = self.step_per_sec(self._avg)
            info_table.append(['Speed iter/s', U.fformat(avg_speed, 1)])
            info_table.append(['Total steps', self.total_steps])
            info_table.append(['Episodes', self.num_episodes])
            for row_caption, row_func in self._extra_rows.items():
                row_value = row_func(self.total_steps, self.num_episodes)
                info_table.append([row_caption, str(row_value)])
            # `fancy_grid` doesn't work in terminals that can't display unicode
            print(tabulate(info_table, tablefmt='simple', numalign='left'))
        return ob, r, done, info
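# Usage sketch (illustrative, not part of the original code): builds a
# ConsoleMonitor with one extra row. Each extra-row function receives
# (total_steps, num_episodes); the wrapped `env` is assumed to be any
# Gym-style environment that EpisodeMonitor accepts.
def _example_console_monitor(env):
    extra_rows = OrderedDict([
        # average number of steps per finished episode
        ('Avg steps/episode',
         lambda total_steps, num_episodes: total_steps // max(num_episodes, 1)),
    ])
    return ConsoleMonitor(env,
                          update_interval=10,
                          average_over=10,
                          extra_rows=extra_rows)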
class ExpSender(object):
    """
    `send()` logic can be overridden to support more complicated agent
    experiences, such as multiagent, self-play, etc.
    """

    def __init__(self, *, host, port, flush_iteration):
        """
        Args:
            flush_iteration: how many send() calls before we flush the buffer
        """
        U.assert_type(flush_iteration, int)
        self._client = ZmqSender(host=host, port=port)
        self._exp_buffer = ExpBuffer()
        self._flush_tracker = PeriodicTracker(flush_iteration)

    def send(self, hash_dict, nonhash_dict):
        """
        Args:
            hash_dict: large/heavy data that should be deduplicated by the
                caching mechanism
            nonhash_dict: small data that we can afford to keep copies of
        """
        self._exp_buffer.add(
            hash_dict=hash_dict,
            nonhash_dict=nonhash_dict,
        )
        if self._flush_tracker.track_increment():
            exp_binary = self._exp_buffer.flush()
            self._client.send(exp_binary)
            return U.binary_hash(exp_binary)
        else:
            return None
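# Usage sketch (illustrative; the key names are assumptions, not the project's
# wire format): observations are large and repeated across consecutive
# transitions, so they go into `hash_dict` for deduplication, while small
# scalars go into `nonhash_dict`.
def _example_send_transition(sender, obs, obs_next, action, reward, done):
    # `sender` is an ExpSender; send() returns the binary hash when the
    # buffer flushes, otherwise None
    return sender.send(
        hash_dict={'obs': obs, 'obs_next': obs_next},
        nonhash_dict={'action': action, 'reward': reward, 'done': done},
    )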
class Agent(object, metaclass=U.AutoInitializeMeta):
    """
    Important: When extending this class, make sure to follow the init method
    signature so that orchestrating functions can properly initialize custom
    agents.

    TODO: Extend the initialization to allow custom non-config per-agent
        settings, to be used for a heterogeneous agent population.
    """

    def __init__(self,
                 learner_config,
                 env_config,
                 session_config,
                 agent_id,
                 agent_mode,
                 render=False):
        """
        Initialize the agent class.
        """
        self.learner_config = learner_config
        self.env_config = env_config
        self.session_config = session_config
        assert agent_mode in AGENT_MODES
        self.agent_mode = agent_mode
        self.agent_id = agent_id
        if self.agent_mode not in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            self._setup_parameter_pull()
            self._setup_logging()
        self.current_episode = 0
        self.cumulative_steps = 0
        self.current_step = 0
        self.actions_since_param_update = 0
        self.episodes_since_param_update = 0
        self.render = render

    #######
    # Internal initialization methods
    #######
    def _initialize(self):
        """
        Implements the AutoInitializeMeta meta class.
        self.module_dict() can only be called after the modules are
        constructed by subclasses.
        """
        if self.agent_mode not in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            host, port = os.environ['SYMPH_PS_FRONTEND_HOST'], os.environ[
                'SYMPH_PS_FRONTEND_PORT']
            self._module_dict = self.module_dict()
            if not isinstance(self._module_dict, ModuleDict):
                self._module_dict = ModuleDict(self._module_dict)
            self._ps_client = ParameterClient(
                host=host,
                port=port,
            )

    def _setup_parameter_pull(self):
        self._fetch_parameter_mode = self.session_config.agent.fetch_parameter_mode
        self._fetch_parameter_interval = self.session_config.agent.fetch_parameter_interval
        self._fetch_parameter_tracker = PeriodicTracker(
            self._fetch_parameter_interval)

    def _setup_logging(self):
        """
        Creates the tensorplex logger and loggerplex logger.
        Initializes bookkeeping values.
        """
        if self.agent_mode == 'training':
            logger_name = 'agent-{}'.format(self.agent_id)
            self.tensorplex = self._get_tensorplex('{}/{}'.format(
                'agent', self.agent_id))
        else:
            logger_name = 'eval-{}'.format(self.agent_id)
            self.tensorplex = self._get_tensorplex('{}/{}'.format(
                'eval', self.agent_id))
        self.log = get_loggerplex_client(logger_name, self.session_config)

        # record how long the current parameters have been in use
        self.actions_since_param_update = 0
        self.episodes_since_param_update = 0
        # weighted average over ~100 parameter updates
        self.actions_per_param_update = U.MovingAverageRecorder(decay=0.99)
        self.episodes_per_param_update = U.MovingAverageRecorder(decay=0.99)

    def _get_tensorplex(self, name):
        """
        Get the periodic tensorplex object

        Args:
            @name: The name of the collection of metrics
        """
        tp = get_tensorplex_client(name, self.session_config)
        periodic_tp = PeriodicTensorplex(
            tensorplex=tp,
            period=self.session_config.tensorplex.update_schedule.agent,
            is_average=True,
            keep_full_history=False)
        return periodic_tp

    #######
    # Exposed abstract methods
    # Override in subclass, no need to call super().act etc.
    # Enough for basic usage
    #######
    def act(self, obs):
        """
        Abstract method for taking actions.
        You should check `self.agent_mode` in the function and change act()
        logic with respect to training vs. evaluation.

        Args:
            obs: typically a single obs, make sure to vectorize it first
                before passing it to the torch `model`.

        Returns:
            action to be executed in the env
        """
        raise NotImplementedError

    def module_dict(self):
        """
        Returns:
            a dict of name -> surreal.utils.pytorch.Module
        """
        raise NotImplementedError

    #######
    # Advanced exposed methods
    # Override in subclass, NEED to call super().on_parameter_fetched() etc.
    # User needs to take care of agent mode
    # For advanced usage
    #######
    def on_parameter_fetched(self, params, info):
        """
        Called when a new parameter is fetched.
        """
        if self.agent_mode == 'training':
            # the time it takes for parameters to go from learner to agent
            delay = time.time() - info['time']
            self.actions_per_param_update.add_value(
                self.actions_since_param_update)
            self.episodes_per_param_update.add_value(
                self.episodes_since_param_update)
            self.tensorplex.add_scalars({
                '.core/parameter_publish_delay_s': delay,
                '.core/actions_per_param_update':
                    self.actions_per_param_update.cur_value(),
                '.core/episodes_per_param_update':
                    self.episodes_per_param_update.cur_value()
            })
            self.actions_since_param_update = 0
            self.episodes_since_param_update = 0
        return params

    def pre_action(self, obs):
        """
        Called before act() is called by the agent main script
        """
        if self.agent_mode == 'training':
            if self._fetch_parameter_mode == 'step' and \
                    self._fetch_parameter_tracker.track_increment():
                self.fetch_parameter()

    def post_action(self, obs, action, obs_next, reward, done, info):
        """
        Called after act() is called by the agent main script
        """
        self.current_step += 1
        self.cumulative_steps += 1
        if self.agent_mode == 'training':
            self.actions_since_param_update += 1
            if done:
                self.episodes_since_param_update += 1

    def pre_episode(self):
        """
        Called by the agent process.
        Can be used to reset internal states before an episode starts.
        """
        if self.agent_mode == 'training':
            if self._fetch_parameter_mode == 'episode' and \
                    self._fetch_parameter_tracker.track_increment():
                self.fetch_parameter()

    def post_episode(self):
        """
        Called by the agent process.
        Can be used to reset internal states after an episode ends,
        i.e. after the post_action() call where done = True.
        """
        self.current_episode += 1

    #######
    # Main loops.
    # Customize this to fully customize the agent process
    #######
    def main(self):
        """
        Default main loop
        """
        self.main_setup()
        while True:
            self.main_loop()

    def main_setup(self):
        """
        Setup before constant looping
        """
        env = self.get_env()
        env = self.prepare_env(env)
        self.env = env
        if self.agent_mode == "training":
            self.fetch_parameter()

    def main_loop(self):
        """
        One loop of the agent, runs one episode of the environment
        """
        env = self.env
        self.pre_episode()
        obs, info = env.reset()
        total_reward = 0.0
        while True:
            if self.render:
                env.unwrapped.render()  # TODO: figure out why it needs to be unwrapped
            self.pre_action(obs)
            action = self.act(obs)
            obs_next, reward, done, info = env.step(action)
            total_reward += reward
            self.post_action(obs, action, obs_next, reward, done, info)
            obs = obs_next
            if done:
                break
        self.post_episode()
        if self.agent_mode in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            return
        if self.current_episode % 20 == 0:
            self.log.info('Episode {} reward {}'.format(
                self.current_episode, total_reward))

    def get_env(self):
        """
        Returns an instance of an EnvBase subclass, created from
        self.env_config
        """
        if self.agent_mode in ['eval_deterministic', 'eval_stochastic']:
            env, _ = make_env(self.env_config, mode='eval')
        else:
            env, _ = make_env(self.env_config)
        return env

    def prepare_env(self, env):
        """
        Applies custom wrappers to the environment as necessary

        Args:
            @env: subclass of EnvBase

        Returns:
            @env: the (possibly wrapped) environment
        """
        if self.agent_mode == 'training':
            return self.prepare_env_agent(env)
        else:
            return self.prepare_env_eval(env)

    def prepare_env_agent(self, env):
        """
        Applies custom wrappers to the environment as necessary.
        Only changes agent behavior.
        """
        # This has to go first as it alters step() return value
        limit_episode_length = self.env_config.limit_episode_length
        if limit_episode_length > 0:
            env = MaxStepWrapper(env, limit_episode_length)
        env = TrainingTensorplexMonitor(env,
                                        agent_id=self.agent_id,
                                        session_config=self.session_config,
                                        separate_plots=True)
        return env

    def prepare_env_eval(self, env):
        """
        Applies custom wrappers to the environment as necessary.
        Only changes eval behavior.
        """
        limit_episode_length = self.env_config.limit_episode_length
        if limit_episode_length > 0:
            env = MaxStepWrapper(env, limit_episode_length)
        if self.agent_mode not in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            env = EvalTensorplexMonitor(
                env,
                eval_id=self.agent_id,
                fetch_parameter=self.fetch_parameter,
                session_config=self.session_config,
            )
        env_category = self.env_config.env_name.split(':')[0]
        if self.env_config.video.record_video and self.agent_id == 0:
            # gym video recording not supported due to a bug in OpenAI gym
            # https://github.com/openai/gym/issues/1050
            env = VideoWrapper(env, self.env_config, self.session_config)
        return env

    def main_agent(self):
        """
        Main loop run by the agent script.
        Override if you want to customize agent behavior completely.
        """
        self.main()

    def main_eval(self):
        """
        Main loop run by the eval script.
        Override if you want to customize eval behavior completely.
        """
        self.main()

    #######
    # Exposed public methods
    #######
    def fetch_parameter(self):
        """
        Extends base class fetch_parameter to add some logging
        """
        params, info = self._ps_client.fetch_parameter_with_info()
        if params:
            params = U.deserialize(params)
            params = self.on_parameter_fetched(params, info)
            self._module_dict.load(params)

    def fetch_parameter_info(self):
        """
        Fetch information about the parameters currently held by the
        parameter server
        """
        return self._ps_client.fetch_info()

    def set_agent_mode(self, agent_mode):
        """
        Args:
            agent_mode: 'training', 'eval_deterministic', or 'eval_stochastic'
        """
        assert agent_mode in AGENT_MODES
        self.agent_mode = agent_mode
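# Subclass sketch (illustrative only): the minimum a concrete agent must
# provide is act() and module_dict(). The uniform-random policy below is an
# assumption for demonstration and presumes a Gym-style `action_space`;
# a real agent would vectorize `obs`, run its torch model, and expose that
# model through module_dict() so the parameter server can sync it.
class _ExampleRandomAgent(Agent):
    def act(self, obs):
        # ignore the observation and sample from the env's action space
        return self.env.action_space.sample()

    def module_dict(self):
        # no learnable modules for a purely random agent
        return {}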
class DQNLearner(Learner):
    def __init__(self, learner_config, env_config, session_config):
        super().__init__(learner_config, env_config, session_config)
        self.q_func, self.action_dim = build_ffqfunc(self.learner_config,
                                                     self.env_config)
        self.algo = self.learner_config.algo
        self.q_target = self.q_func.clone()
        self.optimizer = torch.optim.Adam(self.q_func.parameters(),
                                          lr=self.algo.lr,
                                          eps=1e-4)
        self.target_update_tracker = PeriodicTracker(
            period=self.algo.target_network_update_freq,
        )

    def _update_target(self):
        self.q_target.copy_from(self.q_func)

    def _run_optimizer(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        norm_clip = self.algo.grad_norm_clipping
        if norm_clip is not None:
            self.q_func.clip_grad_norm(norm_clip)
            # torch.nn.utils.clip_grad_norm(
            #     self.q_func.parameters(),
            #     max_norm=norm_clip
            # )
        self.optimizer.step()

    def _optimize(self, obs, actions, rewards, obs_next, dones, weights):
        # Compute Q(s_t, a): select the columns of the actions taken
        batch_size = obs.size(0)
        assert (U.shape(actions) == U.shape(rewards) == U.shape(dones) ==
                (batch_size, 1))

        q_t_at_action = self.q_func(obs).gather(1, actions)
        q_tp1 = self.q_target(obs_next)
        # Double Q
        if self.algo.double_q:
            # select argmax action using online weights instead of q_target
            q_tp1_online = self.q_func(obs_next)
            q_tp1_online_argmax = q_tp1_online.max(1, keepdim=True)[1]
            q_tp1_best = q_tp1.gather(1, q_tp1_online_argmax)
        else:
            # Mnih 2015 Nature paper:
            # use the target network for both policy and value selection
            q_tp1_best = q_tp1.max(1, keepdim=True)[0]

        # Q values for terminal states are 0
        q_tp1_best = (1.0 - dones) * q_tp1_best
        # .detach() stops the gradient and makes the Variable forget its creator
        q_tp1_best = q_tp1_best.detach()
        # RHS of the Bellman equation
        q_expected = rewards + self.algo.gamma * q_tp1_best
        td_error = q_t_at_action - q_expected
        # torch_where
        raw_loss = U.huber_loss_per_element(td_error)
        weighted_loss = torch.mean(weights * raw_loss)
        self._run_optimizer(weighted_loss)
        return td_error

    def learn(self, batch_exp):
        weights = U.torch_ones_like(batch_exp.rewards)
        td_errors = self._optimize(
            batch_exp.obs,
            batch_exp.actions,
            batch_exp.rewards,
            batch_exp.obs_next,
            batch_exp.dones,
            weights,
        )
        batch_size = batch_exp.obs.size(0)
        if self.target_update_tracker.track_increment(batch_size):
            # Update the target network periodically.
            self._update_target()
        mean_td_error = U.to_scalar(torch.mean(torch.abs(td_errors)))
        self.tensorplex.add_scalars({'td_error': mean_td_error})

    def default_config(self):
        return {
            'model': {
                'convs': '_list_',
                'fc_hidden_sizes': '_list_',
                'dueling': '_bool_'
            },
            'algo': {
                'lr': 1e-3,
                'optimizer': 'Adam',
                'grad_norm_clipping': 10,
                'gamma': .99,
                'target_network_update_freq': '_int_',
                'double_q': True,
                'exploration': {
                    'schedule': 'linear',
                    'steps': '_int_',
                    'final_eps': 0.01,
                },
                'prioritized': {
                    'enabled': False,
                    'alpha': 0.6,
                    'beta0': 0.4,
                    'beta_anneal_iters': None,
                    'eps': 1e-6
                },
            },
        }

    def module_dict(self):
        return {'q_func': self.q_func}
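# Worked sketch (standalone, for illustration): the Bellman target computed in
# _optimize, written out on plain tensors. With gamma=0.99, reward=1.0, done=0
# and a best next-state Q of 2.0, the target is 1.0 + 0.99 * 2.0 = 2.98; for a
# terminal transition (done=1) the bootstrap term is masked out and the target
# reduces to the reward alone.
def _example_bellman_target(rewards, dones, q_tp1_best, gamma=0.99):
    # all inputs are tensors of shape (batch_size, 1)
    q_tp1_best = (1.0 - dones) * q_tp1_best  # terminal states contribute 0
    return rewards + gamma * q_tp1_best.detach()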