def __init__(
        self,
        obs_spec,
        act_spec,
        model_fn=build_mlp,
        policy_cls=MultiPolicy,
        sess_mgr=None,
        n_envs=4,
        traj_len=16,
        batch_sz=16,
        discount=0.99,
        gae_lambda=0.95,
        clip_rewards=0.0,
        normalize_advantages=True,
        bootstrap_terminals=False,
        clip_grads_norm=0.0,
        value_coef=0.5,
        entropy_coef=0.001,
        optimizer=tf.train.AdamOptimizer(),
        logger=Logger(),
):
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef

    SyncRunningAgent.__init__(self, n_envs)
    ActorCriticAgent.__init__(
        self, obs_spec, act_spec, model_fn, policy_cls, sess_mgr, traj_len, batch_sz,
        discount, gae_lambda, clip_rewards, normalize_advantages, bootstrap_terminals,
        clip_grads_norm, optimizer, logger,
    )

def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder = None,
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        n_envs=1,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages'],
):
    # forward only the arguments whose values differ from DEFAULTS
    kwargs = {k: v for k, v in locals().items() if k in DEFAULTS and DEFAULTS[k] != v}

    SyncRunningAgent.__init__(self, n_envs)
    ActorCriticAgent.__init__(self, obs_spec, act_spec, sess_mgr=sess_mgr, **kwargs)

    self.logger = StreamLogger(n_envs=n_envs, log_freq=10, sess_mgr=self.sess_mgr)

def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder,
        policy_cls: PolicyType,
        sess_mgr: SessionManager = None,
        n_envs=4,
        traj_len=16,
        batch_sz=16,
        discount=0.99,
        gae_lambda=0.95,
        clip_rewards=0.0,
        normalize_advantages=True,
        clip_grads_norm=0.0,
        value_coef=0.5,
        entropy_coef=0.001,
        optimizer=tf.train.AdamOptimizer(),
        logger=Logger(),
):
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef

    SyncRunningAgent.__init__(self, n_envs)
    ActorCriticAgent.__init__(
        self, obs_spec, act_spec, model_fn, policy_cls, sess_mgr, traj_len, batch_sz,
        discount, gae_lambda, clip_rewards, normalize_advantages, clip_grads_norm,
        optimizer, logger,
    )

def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder = None,
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        n_envs=4,
        n_epochs=3,
        minibatch_sz=128,
        clip_ratio=0.2,
        clip_value=0.5,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages'],
        **kwargs,
):
    args = kwargs['args'] if 'args' in kwargs else None  # include the experimental args
    self.subenvs = subenvs = kwargs['subenvs'] if 'subenvs' in kwargs else []  # include the specified subenvs

    # forward only the arguments whose values differ from DEFAULTS
    kwargs = {k: v for k, v in locals().items() if k in DEFAULTS and DEFAULTS[k] != v}
    kwargs['subenvs'] = subenvs

    self.n_epochs = n_epochs
    self.minibatch_sz = minibatch_sz
    self.clip_ratio = clip_ratio
    self.clip_value = clip_value

    SyncRunningAgent.__init__(self, n_envs, args)
    ActorCriticAgent.__init__(self, obs_spec, act_spec, sess_mgr=sess_mgr, **kwargs)

    self.logger = StreamLogger(n_envs=n_envs, log_freq=10, sess_mgr=self.sess_mgr)
    self.start_step = self.start_step // self.n_epochs

def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder,
        policy_cls: PolicyType,
        sess_mgr: SessionManager = None,
        n_envs=4,
        traj_len=16,
        batch_sz=16,
        discount=0.99,
        gae_lambda=0.95,
        clip_rewards=0.0,
        normalize_advantages=True,
        bootstrap_terminals=False,
        clip_grads_norm=0.0,
        n_updates=3,
        minibatch_sz=128,
        clip_ratio=0.2,
        value_coef=0.5,
        entropy_coef=0.001,
        optimizer=tf.train.AdamOptimizer(),
        logger=Logger(),
):
    self.n_updates = n_updates
    self.minibatch_sz = minibatch_sz
    self.clip_ratio = clip_ratio
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef

    SyncRunningAgent.__init__(self, n_envs)
    ActorCriticAgent.__init__(
        self, obs_spec, act_spec, model_fn, policy_cls, sess_mgr, traj_len, batch_sz,
        discount, gae_lambda, clip_rewards, normalize_advantages, bootstrap_terminals,
        clip_grads_norm, optimizer, logger,
    )

    self.start_step = self.start_step // self.n_updates
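
# A minimal standalone sketch (not part of the constructors above) of the
# DEFAULTS-filtering idiom used by the two locals()-based constructors: only
# hyperparameters the caller explicitly changed away from DEFAULTS are forwarded,
# so the base class keeps its own defaults otherwise. The names DEFAULTS and
# make_agent_kwargs here are illustrative assumptions, not part of the source.

DEFAULTS = dict(traj_len=16, batch_sz=16, discount=0.99, gae_lambda=0.95)


def make_agent_kwargs(
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
):
    # same comprehension as above: scan locals() and keep only explicit overrides
    return {k: v for k, v in locals().items() if k in DEFAULTS and DEFAULTS[k] != v}


print(make_agent_kwargs())                             # {}
print(make_agent_kwargs(discount=0.999, traj_len=16))  # {'discount': 0.999}; traj_len equals its default and is dropped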