Example #1
 def __init__(
     self,
     discount=0.99,
     batch_size=256,
     min_steps_learn=int(1e4),
     replay_size=int(1e6),
     training_ratio=256,  # data_consumption / data_generation
     target_update_tau=0.005,  # tau=1 for hard update.
     target_update_interval=1,  # interval=1000 for hard update.
     learning_rate=3e-4,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     initial_optim_state_dict=None,
     action_prior="uniform",  # or "gaussian"
     reward_scale=1,
     reparameterize=True,
     clip_grad_norm=1e6,
     policy_output_regularization=0.001,
     n_step_return=1,
 ):
     if optim_kwargs is None:
         optim_kwargs = dict()
     assert action_prior in ["uniform", "gaussian"]
     save__init__args(locals())
     self.update_counter = 0
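
Every example below ends its constructor with `save__init__args(locals())`, which stores the captured arguments as attributes on `self` instead of writing `self.x = x` line by line. The following is only a minimal sketch of that pattern under assumed behavior; the helper `save_init_args_sketch` and the class `ExampleAlgo` are illustrative names, not the library's real implementation.

def save_init_args_sketch(values, underscore=False):
    """Copy the arguments captured by locals() inside __init__ onto self (sketch)."""
    self = values["self"]
    prefix = "_" if underscore else ""
    for name, value in values.items():
        if name not in ("self", "__class__"):
            setattr(self, prefix + name, value)


class ExampleAlgo:  # hypothetical class, for illustration only
    def __init__(self, discount=0.99, learning_rate=3e-4, optim_kwargs=None):
        if optim_kwargs is None:
            optim_kwargs = dict()
        save_init_args_sketch(locals())  # sets self.discount, self.learning_rate, ...


algo = ExampleAlgo(learning_rate=1e-3)
assert algo.discount == 0.99 and algo.learning_rate == 1e-3 and algo.optim_kwargs == {}
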
Example #2
 def __init__(
         self,
         batch_size,
         learning_rate,
         replay_filepath,
         delta_T=0,
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_state_dict=None,
         clip_grad_norm=1000.,
         EncoderCls=EncoderModel,
         encoder_kwargs=None,
         latent_size=128,
         ReplayCls=UlForRlReplayBuffer,
         activation_loss_coefficient=0.0,
         learning_rate_anneal=None,  # cosine
         learning_rate_warmup=0,  # number of updates
         VaeHeadCls=VaeHeadModel,
         hidden_sizes=None,  # But maybe use for forward prediction
         DecoderCls=VaeDecoderModel,
         decoder_kwargs=None,
         kl_coeff=1.,
         onehot_action=True,
         validation_split=0.0,
         n_validation_batches=0,
         ):
     optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
     encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
     decoder_kwargs = dict() if decoder_kwargs is None else decoder_kwargs
     save__init__args(locals())
     self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
     assert learning_rate_anneal in [None, "cosine"]
     self._replay_T = delta_T + 1
Example #3
 def __init__(
         self,
         discount=0.99,
         batch_size=256,
         min_steps_learn=int(1e4),
         replay_size=int(1e6),
         replay_ratio=256,  # data_consumption / data_generation
         target_update_tau=0.005,  # tau=1 for hard update.
         target_update_interval=1,  # 1000 for hard update, 1 for soft.
         learning_rate=3e-4,
         fixed_alpha=None,  # None for adaptive alpha, float for any fixed value
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_optim_state_dict=None,  # for all of them.
         action_prior="uniform",  # or "gaussian"
         reward_scale=1,
         target_entropy="auto",  # "auto", float, or None
         reparameterize=True,
         clip_grad_norm=1e9,
         # policy_output_regularization=0.001,
         n_step_return=1,
         updates_per_sync=1,  # For async mode only.
         bootstrap_timelimit=False,
         ReplayBufferCls=None,  # Leave None to select by above options.
 ):
     """Save input arguments."""
     if optim_kwargs is None:
         optim_kwargs = dict()
     assert action_prior in ["uniform", "gaussian"]
     self._batch_size = batch_size
     del batch_size  # Property.
     save__init__args(locals())
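
Example #3 (and several others below) stores `self._batch_size` and then deletes the local `batch_size` before calling `save__init__args(locals())`, presumably because `batch_size` is exposed as a property on the class (the comment says `# Property.`) and the saver should skip it. A minimal self-contained sketch of that pattern; the helper and class names are assumptions, and the saver is repeated from the sketch above so this snippet runs on its own.

def save_init_args_sketch(values):  # repeated minimal saver (assumed behavior)
    self = values["self"]
    for name, value in values.items():
        if name not in ("self", "__class__"):
            setattr(self, name, value)


class ExampleOffPolicyAlgo:  # hypothetical class name
    def __init__(self, batch_size=256, replay_ratio=64):
        self._batch_size = batch_size
        del batch_size  # drop it from locals() so the saver skips it; batch_size is a property below
        save_init_args_sketch(locals())  # saves replay_ratio; batch_size handled above

    @property
    def batch_size(self):
        return self._batch_size


algo = ExampleOffPolicyAlgo(batch_size=128)
assert algo.batch_size == 128 and algo.replay_ratio == 64
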
Example #4
 def __init__(
     self,
     discount=0.99,
     learning_rate=0.001,
     value_loss_coeff=1.,
     entropy_loss_coeff=0.01,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     clip_grad_norm=1.,
     initial_optim_state_dict=None,
     gae_lambda=1,
     minibatches=4,
     epochs=4,
     ratio_clip=0.1,
     linear_lr_schedule=True,
     normalize_advantage=False,
     clip_vf_loss=False,  # Clip VF_loss as in OpenAI?
     normalize_rewards=None,  # Can be 'return' (OpenAI, no mean subtraction), 'reward' (same as obs normalization) or None
      rew_clip=(-10, 10),  # Additional clipping for reward (if normalizing reward)
     rew_min_var=1e-6  # Minimum variance in running mean for reward (if normalizing reward)
 ):
     """Saves input settings."""
     if optim_kwargs is None:
         optim_kwargs = dict(eps=1e-5)
     save__init__args(locals())
Example #5
 def __init__(
     self,
     discount=0.99,
     learning_rate=0.001,
     vae_learning_rate=0.0001,
     value_loss_coeff=1.,
     entropy_loss_coeff=0.01,
     OptimCls=torch.optim.Adam,
      optim_kwargs=None,
     VaeOptimCls=torch.optim.Adam,
      vae_optim_kwargs=None,
     clip_grad_norm=1.,
     initial_optim_state_dict=None,
     gae_lambda=1,
     minibatches=4,
     epochs=4,
     ratio_clip=0.1,
     linear_lr_schedule=True,
     vae_linear_lr_schedule=False,
     normalize_advantage=False,
     normalize_rewards=True,
     vae_beta=1,
     vae_loss_coeff=0.1,
     vae_loss_type="l2",
     vae_update_freq=1,
     alternating_optim=False,
 ):
     """Saves input settings."""
      if optim_kwargs is None:
          optim_kwargs = dict()
      if vae_optim_kwargs is None:
          vae_optim_kwargs = dict()
      save__init__args(locals())
Example #6
 def __init__(
     self,
     discount=0.99,
     learning_rate=1e-4,
     T_target_steps=100,
     bootstrap_with_online_model=False,
     OptimCls=torch.optim.Adam,
     pop_art_reward_normalization=True,
     optim_kwargs=None,
     initial_optim_state_dict=None,
     minibatches=1,
     epochs=1,
     gae_lambda=0.97,
     discrete_actions=False,
     epsilon_eta=0.01,
     epsilon_alpha=0.01,
     initial_eta=1.0,
     initial_alpha=5.0,
     initial_alpha_mu=1.0,
     initial_alpha_sigma=1.0,
     epsilon_alpha_mu=0.0075,
     epsilon_alpha_sigma=1e-5,
 ):
     """Saves input settings."""
     if optim_kwargs is None:
         optim_kwargs = dict()
     self.pop_art_normalizer = PopArtLayer()
     save__init__args(locals())
Example #7
    def __init__(self,
                 discount=0.99,
                 learning_rate=0.001,
                 value_loss_coeff=1.,
                 entropy_loss_coeff=0.01,
                 OptimCls=torch.optim.Adam,
                 optim_kwargs=None,
                 clip_grad_norm=1.,
                 initial_optim_state_dict=None,
                 gae_lambda=1,
                 minibatches=4,
                 epochs=4,
                 ratio_clip=0.1,
                 linear_lr_schedule=True,
                 normalize_advantage=False,
                 normalize_reward=False,
                 kernel_params=None,
                 curiosity_type='none'):
        """Saves input settings."""
        if optim_kwargs is None:
            optim_kwargs = dict()
        save__init__args(locals())
        if self.normalize_reward:
            self.reward_ff = RewardForwardFilter(discount)
            self.reward_rms = RunningMeanStd()
        self.intrinsic_rewards = None

        if kernel_params is not None:
            self.mu, self.sigma = self.kernel_params
            self.kernel_line = lambda x: x
            self.kernel_gauss = lambda x: np.sign(x) * self.mu * np.exp(
                -(abs(x) - self.mu)**2 / (2 * self.sigma**2))
Example #8
 def __init__(
     self,
     discount=0.99,
     learning_rate=1e-3,  # Main learning rate
     termination_lr=5e-7,  # Termination learning rate
     pi_omega_lr=0.,  # policy over options learning rate
     interest_lr=1e-3,  # Learning rate for interest function
     value_loss_coeff=0.5,
     termination_loss_coeff=1.,  # Coefficient for termination loss component
     entropy_loss_coeff=0.01,  # Entropy loss for low-level policy
     omega_entropy_loss_coeff=0.01,  # Entropy loss for policy over options
      delib_cost=0.,  # Cost for switching options; subtracted from rewards after normalization, also added to termination advantage
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     clip_grad_norm=1.,
     initial_optim_state_dict=None,
     gae_lambda=1,
     linear_lr_schedule=True,
     normalize_advantage=False,
     normalize_termination_advantage=False,  # Normalize termination advantage? Doesn't seem to be done
     normalize_rewards=None,  # Can be 'return' (OpenAI, no mean subtraction), 'reward' (same as obs normalization) or None
      rew_clip=(-10, 10),  # Additional clipping for reward (if normalizing reward)
     rew_min_var=1e-6  # Minimum variance in running mean for reward (if normalizing reward)
 ):
     """Saves input settings."""
     if optim_kwargs is None:
         optim_kwargs = dict(eps=1e-5)
     save__init__args(locals())
Example #9
 def __init__(
     self,
     batch_size,
     learning_rate,
     replay_filepath,
     delta_T=1,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     initial_state_dict=None,
     clip_grad_norm=10.,
     EncoderCls=EncoderModel,
     encoder_kwargs=None,
     ReplayCls=UlForRlReplayBuffer,
     onehot_actions=True,
     activation_loss_coefficient=0.0,
     learning_rate_anneal=None,  # cosine
     learning_rate_warmup=0,  # number of updates
     random_shift_prob=0.,
     random_shift_pad=4,
     InverseModelCls=InverseModel,
     inverse_model_kwargs=None,
     entropy_loss_coeff=0.01,
     validation_split=0.0,
     n_validation_batches=0,
 ):
     optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
     encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
      inverse_model_kwargs = (
          dict() if inverse_model_kwargs is None else inverse_model_kwargs)
     save__init__args(locals())
     self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
     assert learning_rate_anneal in [None, "cosine"]
     assert onehot_actions  # needs discrete action space for now.
     assert delta_T > 0
     self._replay_T = delta_T + 1
Example #10
    def __init__(
        self,
        ModelCls=PiMlpModel,  # Pi model.
        QModelCls=QofMuMlpModel,
        model_kwargs=None,  # Pi model.
        q_model_kwargs=None,
        initial_model_state_dict=None,  # Pi model.
        action_squash=1,  # Max magnitude (or None).
        pretrain_std=0.75,  # High value to make near uniform sampling.
        max_q_eval_mode='none',
        n_qs=2,
    ):
        self._max_q_eval_mode = max_q_eval_mode
        if isinstance(ModelCls, str):
            ModelCls = eval(ModelCls)
        if isinstance(QModelCls, str):
            QModelCls = eval(QModelCls)

        if model_kwargs is None:
            model_kwargs = dict(hidden_sizes=[256, 256])
        if q_model_kwargs is None:
            q_model_kwargs = dict(hidden_sizes=[256, 256])
        super().__init__(ModelCls=ModelCls,
                         model_kwargs=model_kwargs,
                         initial_model_state_dict=initial_model_state_dict
                         )  # For async setup.
        save__init__args(locals())
        self.min_itr_learn = 0  # Get from algo.

        self.log_alpha = None
        print('n_qs', self.n_qs)

        global Models
        Models = namedtuple("Models",
                            ["pi"] + [f"q{i}" for i in range(self.n_qs)])
Example #11
 def __init__(
     self,
     discount=0.99,
     learning_rate=0.001,
     value_loss_coeff=1.,
     entropy_loss_coeff=0.01,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     VaeOptimCls=torch.optim.Adam,
     clip_grad_norm=1.,
     initial_optim_state_dict=None,
     gae_lambda=1,
     minibatches=4,
     epochs=4,
     ratio_clip=0.1,
     linear_lr_schedule=True,
     normalize_advantage=False,
     normalize_rewards=False,
     similarity_loss=False,
     similarity_coeff=0.1,
 ):
     """Saves input settings."""
     if optim_kwargs is None:
         optim_kwargs = dict()
     save__init__args(locals())
Example #12
 def __init__(
     self,
     replay_filepath,
     learning_rate,
     batch_B=64,
     batch_T=1,
     delta_T=1,
     use_global_global=False,
     use_global_local=True,
     use_local_local=True,
     local_conv_layer=1,  # 0-based indexing
     latent_size=256,
     target_update_tau=0.01,  # 1 for hard update
     target_update_interval=1,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     initial_state_dict=None,
     clip_grad_norm=100.0,
     EncoderCls=StDimEncoderModel,
     encoder_kwargs=None,
     ReplayCls=UlForRlReplayBuffer,
     anchor_hidden_sizes=512,
     activation_loss_coefficient=0.0,
     learning_rate_anneal=None,  # cosine
     learning_rate_warmup=0,  # number of updates
     validation_split=0.0,
     n_validation_batches=0,
 ):
     optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
     encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
     save__init__args(locals())
     self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
     assert learning_rate_anneal in [None, "cosine"]
     self._replay_T = batch_T + delta_T
     self.batch_size = batch_B * batch_T  # for logging
Example #13
 def __init__(
         self,
         replay_filepath,
         ReplayCls=UlForRlReplayBuffer,
         delta_T=1,
         batch_T=1,
         batch_B=256,
         learning_rate=1e-3,
         learning_rate_anneal=None,  # cosine
         learning_rate_warmup=0,  # number of updates
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         clip_grad_norm=10.,
         target_update_tau=0.01,  # 1 for hard update
         target_update_interval=1,
         EncoderCls=EncoderModel,
         encoder_kwargs=None,
         latent_size=256,
         anchor_hidden_sizes=512,
         initial_state_dict=None,
         random_shift_prob=1.,
         random_shift_pad=4,
         activation_loss_coefficient=0.,  # rarely if ever use
         validation_split=0.0,
         n_validation_batches=0,  # usually don't do it.
 ):
     encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
     save__init__args(locals())
     assert learning_rate_anneal in [None, "cosine"]
     self.batch_size = batch_B * batch_T  # for logging only
     self._replay_T = batch_T + delta_T
Example #14
File: base.py Project: zizai/rlpyt
    def __init__(self,
                 ModelCls=None,
                 model_kwargs=None,
                 initial_model_state_dict=None):
        """
        Arguments are saved but no model initialization occurs.

        Args:
            ModelCls: The model class to be used.
            model_kwargs (optional): Any keyword arguments to pass when instantiating the model.
            initial_model_state_dict (optional): Initial model parameter values.
        """

        save__init__args(locals())
        self.model = None  # type: torch.nn.Module
        self.shared_model = None
        self.distribution = None
        self.device = torch.device("cpu")
        self._mode = None
        if self.model_kwargs is None:
            self.model_kwargs = dict()
        # The rest only for async operations:
        self._rw_lock = RWLock()
        self._send_count = mp.RawValue("l", 0)
        self._recv_count = 0
Example #15
 def __init__(
     self,
     batch_T,
     batch_B,
     learning_rate,
     replay_filepath,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     initial_state_dict=None,
     clip_grad_norm=10.,
     EncoderCls=EncoderModel,
     encoder_kwargs=None,
     ReplayCls=UlForRlReplayBuffer,
     onehot_actions=True,
     activation_loss_coefficient=0.0,
     learning_rate_anneal=None,  # cosine
     learning_rate_warmup=0,  # number of updates
     PixCtlModelCls=PixelControlModel,
     pixel_control_model_kwargs=None,
     pixel_control_filename="pixel_control_80x80_4x4.pkl",  # Looks in replay path.
     validation_split=0.0,
     n_validation_batches=0,
 ):
     optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
     encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
     pixel_control_model_kwargs = (dict()
                                   if pixel_control_model_kwargs is None
                                   else pixel_control_model_kwargs)
     save__init__args(locals())
     assert learning_rate_anneal in [None, "cosine"]
     self._replay_T = batch_T
     self.batch_size = batch_T * batch_B  # for logging
Example #16
 def __init__(
     self,
     discount=0.99,
     batch_size=256,
     min_steps_learn=int(1e4),
     replay_size=int(1e6),
     replay_ratio=256,  # data_consumption / data_generation
     target_update_tau=0.005,  # tau=1 for hard update.
     target_update_interval=1,  # 1000 for hard update, 1 for soft.
     learning_rate=3e-4,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     initial_optim_state_dict=None,  # for all of them.
     action_prior="uniform",  # or "gaussian"
     reward_scale=1,
     reparameterize=True,
     clip_grad_norm=1e9,
     policy_output_regularization=0.001,
     n_step_return=1,
     updates_per_sync=1,  # For async mode only.
     bootstrap_timelimit=True,
 ):
     if optim_kwargs is None:
         optim_kwargs = dict()
     assert action_prior in ["uniform", "gaussian"]
     self._batch_size = batch_size
     del batch_size  # Property.
     save__init__args(locals())
Example #17
    def __init__(
            self,
            ModelCls=PiMlpModel,  # Pi model.
            QModelCls=QofMuMlpModel,
            model_kwargs=None,  # Pi model.
            q_model_kwargs=None,
            initial_model_state_dict=None,  # Pi model.
            action_squash=1,  # Max magnitude (or None).
            pretrain_std=0.75,  # High value to make near uniform sampling.
    ):
        if isinstance(ModelCls, str):
            ModelCls = eval(ModelCls)
        if isinstance(QModelCls, str):
            QModelCls = eval(QModelCls)

        if model_kwargs is None:
            model_kwargs = dict(hidden_sizes=[256, 256])
        if q_model_kwargs is None:
            q_model_kwargs = dict(hidden_sizes=[256, 256])
        super().__init__(ModelCls=ModelCls,
                         model_kwargs=model_kwargs,
                         initial_model_state_dict=initial_model_state_dict
                         )  # For async setup.
        save__init__args(locals())
        self.min_itr_learn = 0  # Get from algo.

        self.log_alpha = None
Example #18
 def __init__(
     self,
     discount=0.99,
     batch_size=64,
     min_steps_learn=int(1e3),
     replay_size=int(1e6),
     replay_ratio=64,  # data_consumption / data_generation
     target_update_tau=0.01,
     target_update_interval=1,
     policy_update_interval=1,
     learning_rate=1e-4,
     q_learning_rate=5e-5,
     OptimCls=torch.optim.Adam,
     optim_kwargs=None,
     initial_optim_state_dict=None,
     clip_grad_norm=1e8,
     q_target_clip=1e6,
     n_step_return=1,
     updates_per_sync=1,  # For async mode only.
     bootstrap_timelimit=True,
     ReplayBufferCls=None,
     target=False,
 ):
     """Saves input arguments."""
     if optim_kwargs is None:
         optim_kwargs = dict()
     self._batch_size = batch_size
     del batch_size  # Property.
     save__init__args(locals())
Example #19
 def __init__(
     self,
     ModelCls=SacModel,
     ConvModelCls=SacConvModel,
     Fc1ModelCls=SacFc1Model,
     PiModelCls=SacActorModel,
     QModelCls=SacCriticModel,
     conv_kwargs=None,
     fc1_kwargs=None,
     pi_model_kwargs=None,
     q_model_kwargs=None,
     initial_state_dict=None,
     action_squash=1.0,
     pretrain_std=0.75,  # 0.75 gets pretty uniform squashed actions
     load_conv=False,
     load_all=False,
     state_dict_filename=None,
     store_latent=False,
 ):
     if conv_kwargs is None:
         conv_kwargs = dict()
     if fc1_kwargs is None:
         fc1_kwargs = dict(latent_size=50)  # default
     if pi_model_kwargs is None:
         pi_model_kwargs = dict(hidden_sizes=[1024, 1024])  # default
     if q_model_kwargs is None:
         q_model_kwargs = dict(hidden_sizes=[1024, 1024])  # default
     save__init__args(locals())
     super().__init__(ModelCls=SacModel)
     self.min_itr_learn = 0  # Get from algo.
     assert not (load_conv and load_all)
Example #20
 def __init__(
     self,
     alpha,
     beta,
     gamma,
 ):
     save__init__args(locals(), underscore=True)
Example #21
 def __init__(
         self,
         discount=0.99,
         batch_size=64,
         min_steps_learn=int(1e4),
         replay_size=int(1e6),
         replay_ratio=64,  # data_consumption / data_generation
         target_update_tau=0.01,
         target_update_interval=1,
         policy_update_interval=1,
         learning_rate=1e-4,
         q_learning_rate=1e-3,
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_optim_state_dict=None,
         clip_grad_norm=1e8,
         q_target_clip=1e6,
         n_step_return=1,
         updates_per_sync=1,  # For async mode only.
 ):
     if optim_kwargs is None:
         optim_kwargs = dict()
     self._batch_size = batch_size
     del batch_size  # Property.
     save__init__args(locals())
Example #22
 def __init__(
         self,
         discount=0.99,
         batch_size=500,
         buffer_size=int(1e6),
         min_steps_learn=int(1e1), # very efficient
         target_update_tau=0.9,
         target_update_interval=5,
         policy_update_interval=5,
         learning_rate=1e-2,
         d_learning_rate=1e-2,
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_optim_state_dict=None,
         clip_grad_norm=1e8,
         d_target_clip=1e6,
         updates_per_sync=1,  # For async mode only.
         bootstrap_timelimit=True,
         obs_cost_fn=None
         ):
     """Saves input arguments."""
     if optim_kwargs is None:
         optim_kwargs = dict()
     self._batch_size = batch_size
     del batch_size  # Property.
     save__init__args(locals())
Example #23
 def __init__(
         self,
         batch_B,
         batch_T,
         learning_rate,
         replay_filepath,
         warmup_T=0,
         rnn_size=256,
         latent_size=256,
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_state_dict=None,
         clip_grad_norm=1000.,
         validation_split=0.0,
         n_validation_batches=0,
         EncoderCls=EncoderModel,
         encoder_kwargs=None,
         ReplayCls=UlForRlReplayBuffer,
         onehot_actions=True,
         activation_loss_coefficient=0.,  # 0 for OFF
         learning_rate_anneal=None,  # cosine
         learning_rate_warmup=0,  # number of updates
 ):
     optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
     encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
     save__init__args(locals())
     self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
     assert learning_rate_anneal in [None, "cosine"]
     self.batch_size = batch_B * batch_T  # for logging only
     self._replay_T = batch_T + warmup_T
Example #24
 def __init__(
         self,
         ModelCls=PiMlpModel,  # Pi model.
         QModelCls=QofMuMlpModel,
         model_kwargs=None,  # Pi model.
         q_model_kwargs=None,
         v_model_kwargs=None,
         initial_model_state_dict=None,  # All models.
         action_squash=1.0,  # Max magnitude (or None).
         pretrain_std=0.75,  # With squash 0.75 is near uniform.
 ):
     """Saves input arguments; network defaults stored within."""
     if model_kwargs is None:
         model_kwargs = dict(hidden_sizes=[256, 256])
     if q_model_kwargs is None:
         q_model_kwargs = dict(hidden_sizes=[256, 256])
     if v_model_kwargs is None:
         v_model_kwargs = dict(hidden_sizes=[256, 256])
     super().__init__(
         ModelCls=ModelCls,
         model_kwargs=model_kwargs,
         initial_model_state_dict=initial_model_state_dict,
     )
     save__init__args(locals())
     self.min_itr_learn = 0  # Get from algo.
Example #25
 def __init__(
         self,
         discount=0.99,
         batch_size=32,
         min_steps_learn=int(5e4),
         delta_clip=1.,
         replay_size=int(1e6),
         training_ratio=8,  # data_consumption / data_generation.
         target_update_steps=int(1e4),  # Per env steps sampled.
         n_step_return=1,
         learning_rate=2.5e-4,
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_optim_state_dict=None,
         clip_grad_norm=10.,
         eps_init=1,
         eps_final=0.01,
         eps_final_min=None,  # set < eps_final to use vector-valued eps.
         eps_steps=int(1e6),
         eps_eval=0.001,
         double_dqn=False,
         prioritized_replay=False,
         pri_alpha=0.6,
         pri_beta_init=0.4,
         pri_beta_final=1.,
         pri_beta_steps=int(50e6),
         default_priority=None,
         ReplayBufferCls=None,  # Leave None to select by above options.
 ):
     if optim_kwargs is None:
         optim_kwargs = dict(eps=0.01 / batch_size)
     if default_priority is None:
         default_priority = delta_clip
     save__init__args(locals())
Example #26
 def __init__(
         self,
         discount=0.997,
         lambda_coef=1.0,
         batch_T=12,  # replay trajectory length
         batch_B=64,
         warmup_T=0,  # originally 40
         store_rnn_state_interval=9,  # 0 for none, 1 for all. default was 40
         min_steps_learn=int(1e5),
         delta_clip=None,  # Typically use squared-error loss (Steven).
         replay_size=int(1e6),
         replay_ratio=1,
         target_update_interval=2500,  # (Steven says 2500 but maybe faster.)
         n_step_return=1,  # originally 5, minimum is 1
         learning_rate=1e-4,
         OptimCls=torch.optim.Adam,
         optim_kwargs=None,
         initial_optim_state_dict=None,
         clip_grad_norm=80.,  # 80 (Steven)
          eps_steps=int(1e6),  # STILL IN ALGO; convert to itr, give to agent.
         double_dqn=False,  # originally True
         prioritized_replay=True,
         pri_alpha=0.6,
         pri_beta_init=0.9,
         pri_beta_final=0.9,
         pri_beta_steps=int(50e6),
         pri_eta=0.9,
         default_priority=None,
         input_priorities=False,  # default True, not sure what it is used for
         input_priority_shift=None,
         value_scale_eps=1e-3,  # 1e-3 (Steven).
         ReplayBufferCls=None,  # leave None to select by above options
         updates_per_sync=1,  # For async mode only.
 ):
     """
     :param discount:
     :param lambda_coef: lambda return coefficient
     :param delta_clip:
     :param target_update_interval:
     :param learning_rate:
     :param OptimCls:
     :param optim_kwargs:
     :param initial_optim_state_dict:
     :param clip_grad_norm:
     :param eps_steps:
     :param double_dqn:
     :param value_scale_eps:
     :param ReplayBufferCls:
     """
     if optim_kwargs is None:
         optim_kwargs = dict(eps=1e-3)  # Assumes Adam.
     if default_priority is None:
         default_priority = delta_clip or 1.
     # if input_priority_shift is None:  # only used in prioritized replay and warmup i think NOTE
     #     input_priority_shift = warmup_T // store_rnn_state_interval
     save__init__args(locals())
     self._batch_size = (self.batch_T + self.warmup_T) * self.batch_B
Example #27
 def __init__(
     self,
     envs,
     agent,
     TrajInfoCls,
     max_T,
     max_trajectories=None,
 ):
     save__init__args(locals())
Example #28
    def __init__(self,
                 game="pong",
                 frame_skip=4,  # Frames per step (>=1).
                 num_img_obs=4,  # Number of (past) frames in observation (>=1) - "frame stacking".
                 clip_reward=True,
                 episodic_lives=True,
                 fire_on_reset=False,
                 max_start_noops=30,
                 repeat_action_probability=0.,
                 horizon=27000,
                 no_extrinsic=False,
                 no_negative_reward=False,
                 normalize_obs=False,
                 normalize_obs_steps=10000,
                 downsampling_scheme='classical',
                 record_freq=0,
                 record_dir=None
                 ):
        save__init__args(locals(), underscore=True)

        # ALE
        game_path = atari_py.get_game_path(game)
        if not os.path.exists(game_path):
            raise IOError("You asked for game {} but path {} does not "
                          "exist".format(game, game_path))
        self.ale = atari_py.ALEInterface()
        self.ale.setFloat(b'repeat_action_probability', repeat_action_probability)
        self.ale.loadROM(game_path)

        # Spaces
        self._action_set = self.ale.getMinimalActionSet()
        self._action_space = IntBox(low=0, high=len(self._action_set))
        if downsampling_scheme == 'classical':
            self._frame_shape = (84, 84) # (W, H)
        elif downsampling_scheme == 'new':
            self._frame_shape = (80, 104)
        obs_shape = (num_img_obs, self._frame_shape[1], self._frame_shape[0])
        self._observation_space = IntBox(low=0, high=255, shape=obs_shape, dtype="uint8")
        self._max_frame = self.ale.getScreenGrayscale()
        self._raw_frame_1 = self._max_frame.copy()
        self._raw_frame_2 = self._max_frame.copy()
        self._obs = np.zeros(shape=obs_shape, dtype="uint8")

        # Settings
        self._has_fire = "FIRE" in self.get_action_meanings()
        self._has_up = "UP" in self.get_action_meanings()
        self._horizon = int(horizon)

        # Recording
        self.record_env = False  # set in sampling_process for environment 0
        self._record_episode = False
        self._record_freq = record_freq
        self._video_dir = os.path.join(record_dir, 'videos')
        self._frames_dir = os.path.join(self._video_dir, 'frames')
        self._episode_number = 0

        self.reset()
Example #29
 def __init__(self, alpha=0.6, beta=0.4, default_priority=1, unique=False,
         **kwargs):
     """Fix the SampleFromReplay length here, so priority tree can
     track where not to sample (else would have to temporarily subtract
     from tree every time sampling)."""
     super().__init__(**kwargs)
     save__init__args(locals())
     assert self.batch_T is not None  # Must assign.
     self.init_priority_tree()
Example #30
 def __init__(self,
              alpha=0.6,
              beta=0.4,
              default_priority=1,
              unique=False,
              **kwargs):
     super().__init__(**kwargs)
     save__init__args(locals())
     self.init_priority_tree()